1
0
Fork 1
mirror of https://github.com/NixOS/nixpkgs.git synced 2024-09-11 15:08:33 +01:00
nixpkgs/pkgs/applications/graphics/tesseract/default.nix
aszlig c381fa9b63
tesseract: 3.04.01 -> 3.05.00
Upstream changelog:

 * Made some fine tuning to the hOCR output.
 * Added TSV as another optional output format.
 * Fixed ABI break introduced in 3.04.00 with the AnalyseLayout()
   method.
 * text2image tool - Enable all OpenType ligatures available in a font.
   This feature requires Pango 1.38 or newer.
 * Training tools - Replaced asserts with tprintf() and exit(1).
 * Fixed Cygwin compatibility.
 * Improved multipage tiff processing.
 * Improved the embedded pdf font (pdf.ttf).
 * Enable selection of OCR engine mode from command line.
 * Changed tesseract command line parameter '-psm' to '--psm'.
 * Added new C API for orientation and script detection, removed the old
   one.
 * Increased minimum autoconf version to 2.59.
 * Removed dead code.
 * Fixed many compiler warnings.
 * Fixed memory and resource leaks.
 * Fixed some issues with the 'Cube' OCR engine.
 * Fixed some OpenCL issues.
 * Added option to build Tesseract with CMake build system.
 * Implemented CPPAN support for easy Windows building.

The upstream URL of the change log is:

https://github.com/tesseract-ocr/tesseract/releases/tag/3.05.00

Tested by building against the following packages that directly depend
on it:

 * vapoursynth (with ocrSupport = true)
 * pyocr (fails)
 * vobsub2srt

Also tested against the following NixOS VM tests that have OCR enabled:

 * nixos/tests/chromium.nix -A stable
 * nixos/tests/emacs-daemon.nix
 * nixos/tests/installer.nix -A luksroot
 * nixos/tests/lightdm.nix
 * nixos/tests/plasma5.nix
 * nixos/tests/sddm.nix

All of the packages and tests except pyocr build/succeed on
x86_64-linux.

Fixing pyocr is outside of the scope of this commit and will happen very
soon.

Signed-off-by: aszlig <aszlig@redmoonstudios.org>
2017-04-11 03:21:32 +02:00

64 lines
2.1 KiB
Nix

# Tesseract OCR engine, built from the upstream GitHub sources, together with
# a selectable set of `.traineddata' language files from the tessdata repo.
{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers

# Supported list of languages or `null' for all available languages
, enableLanguages ? null
}:

let
  # De-duplicated copy of `enableLanguages' (or `null' when all languages are
  # requested).  `find' copies every matching file exactly once, so the
  # expected count below has to be computed from the unique entries;
  # otherwise a list such as [ "eng" "eng" ] would fail the sanity check.
  languages =
    if enableLanguages != null
    then stdenv.lib.unique enableLanguages
    else null;
in

stdenv.mkDerivation rec {
  name = "tesseract-${version}";
  version = "3.05.00";

  src = fetchFromGitHub {
    owner = "tesseract-ocr";
    repo = "tesseract";
    rev = version;
    sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30";
  };

  # Trained language data.  Being a derivation attribute, its store path is
  # available to the builder as `$tessdata', which `postInstall' relies on.
  tessdata = fetchFromGitHub {
    owner = "tesseract-ocr";
    repo = "tessdata";
    rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
    sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
  };

  nativeBuildInputs = [ pkgconfig autoreconfHook ];
  buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];

  # NOTE(review): presumably consumed by the configure script to locate the
  # Leptonica headers -- confirm against upstream's configure.ac.
  LIBLEPT_HEADERSDIR = "${leptonica}/include";

  # Copy the .traineddata files of the languages specified in enableLanguages
  # into `$out/share/tessdata' and check afterwards if copying was successful.
  postInstall = let
    # One case-insensitive `find' name test per language, shell-escaped.
    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
    # Join the per-language tests with `-o' (logical OR).
    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
    # The parentheses are backslash-escaped for the shell; `\\' is a literal
    # backslash inside a double-quoted Nix string.
    findLangArgs = if languages != null
                   then "\\( ${mkFindArgs languages} \\)"
                   else "-iname '*.traineddata'";
  in ''
    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"

    ${if languages != null then ''
      expected=${toString (builtins.length languages)}
    '' else ''
      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
    ''}

    if [ "$numLangs" -ne "$expected" ]; then
      echo "Expected $expected languages, but $numLangs" \
           "were copied to \`$out/share/tessdata'" >&2
      exit 1
    fi
  '';

  meta = {
    description = "OCR engine";
    # The former Google Code page is gone; link to the repository that `src'
    # is fetched from, and quote it (bare URL literals are deprecated).
    homepage = "https://github.com/tesseract-ocr/tesseract";
    license = stdenv.lib.licenses.asl20;
    maintainers = with stdenv.lib.maintainers; [ viric ];
    platforms = with stdenv.lib.platforms; linux;
  };
}