From aaaed1307765910c66a46361023e357dd3ad6021 Mon Sep 17 00:00:00 2001
From: Erik Arvstedt
Date: Tue, 18 Dec 2018 19:02:13 +0100
Subject: [PATCH] tesseract: add a wrapper to set up languages

Tesseract is now decoupled from the tessdata language corpus.
This avoids recompilation when building Tesseract with a custom
set of languages.

Update k2pdfopt to use the new wrapper interface.
---
 .../graphics/tesseract/default.nix            | 15 ++--
 .../graphics/tesseract/languages.nix          | 50 ++++++++++++++
 .../graphics/tesseract/tesseract3.nix         | 44 +-----------
 .../graphics/tesseract/tesseract4.nix         | 42 ++---------
 .../graphics/tesseract/wrapper.nix            | 75 ++++++++++++++++++++
 pkgs/applications/misc/k2pdfopt/default.nix   | 28 ++++----
 6 files changed, 158 insertions(+), 96 deletions(-)
 create mode 100644 pkgs/applications/graphics/tesseract/languages.nix
 create mode 100644 pkgs/applications/graphics/tesseract/wrapper.nix

diff --git a/pkgs/applications/graphics/tesseract/default.nix b/pkgs/applications/graphics/tesseract/default.nix
index 6da6931eea38..969402bf7c59 100644
--- a/pkgs/applications/graphics/tesseract/default.nix
+++ b/pkgs/applications/graphics/tesseract/default.nix
@@ -1,11 +1,18 @@
 { callPackage, lowPrio }:
 
 let
-  tesseract3 = callPackage ./tesseract3.nix {};
-  tesseract4 = callPackage ./tesseract4.nix {};
+  base3 = callPackage ./tesseract3.nix {};
+  base4 = callPackage ./tesseract4.nix {};
+  languages = callPackage ./languages.nix {};
 in
 {
-  tesseract = tesseract3;
+  tesseract = callPackage ./wrapper.nix {
+    tesseractBase = base3;
+    languages = languages.v3;
+  };
 
-  tesseract_4 = lowPrio tesseract4;
+  tesseract_4 = lowPrio (callPackage ./wrapper.nix {
+    tesseractBase = base4;
+    languages = languages.v4;
+  });
 }
diff --git a/pkgs/applications/graphics/tesseract/languages.nix b/pkgs/applications/graphics/tesseract/languages.nix
new file mode 100644
index 000000000000..d06793ac4171
--- /dev/null
+++ b/pkgs/applications/graphics/tesseract/languages.nix
@@ -0,0 +1,50 @@
+{ stdenv, lib, fetchurl, fetchFromGitHub }:
+
+rec {
+  # Builds the tessdata attribute set for one revision of tesseract-ocr/tessdata.
+  #   tessdataRev: revision of the tessdata repository to fetch
+  #   tessdata:    sha256 of the fetched repository
+  #   all:         output hash of the `all` derivation below
+  #   languages:   accepted, but not used in this file
+  makeLanguages = { tessdataRev, tessdata ? null, all ? null, languages ? {} }:
+    let
+      tessdataSrc = fetchFromGitHub {
+        owner = "tesseract-ocr";
+        repo = "tessdata";
+        rev = tessdataRev;
+        sha256 = tessdata;
+      };
+
+      # Fetches a single <lang>.traineddata file; not referenced in this file.
+      languageFile = lang: sha256: fetchurl {
+        url = "https://github.com/tesseract-ocr/tessdata/raw/${tessdataRev}/${lang}.traineddata";
+        inherit sha256;
+      };
+    in
+      {
+        # Fixed-output derivation containing every top-level *.traineddata file of tessdataSrc.
+        all = stdenv.mkDerivation {
+          name = "all";
+          buildCommand = ''
+            mkdir $out
+            cd ${tessdataSrc}
+            cp *.traineddata $out
+          '';
+          outputHashMode = "recursive";
+          outputHashAlgo = "sha256";
+          outputHash = all;
+        };
+      };
+
+  v3 = makeLanguages {
+    tessdataRev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
+    tessdata = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
+    all = "0yj6h9n6h0kzzcqsn3z87vsi8pa60szp0yiayb0znd0v9my0dqhn";
+  };
+
+  v4 = makeLanguages {
+    tessdataRev = "4.0.0";
+    tessdata = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
+    all = "0dqgkp369rcvq72yhgnzj1pj8yrv7kqzc7y6sqs7nzcq7l5qazlg";
+  };
+}
diff --git a/pkgs/applications/graphics/tesseract/tesseract3.nix b/pkgs/applications/graphics/tesseract/tesseract3.nix
index 7940079d0994..db0e06434aa9 100644
--- a/pkgs/applications/graphics/tesseract/tesseract3.nix
+++ b/pkgs/applications/graphics/tesseract/tesseract3.nix
@@ -1,37 +1,5 @@
-{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
-, leptonica, libpng, libtiff, icu, pango, opencl-headers
-# Supported list of languages or `null' for all available languages
-, enableLanguages ? null
-# if you want just a specific list of languages, optionally specify a hash
-# to make tessdata a fixed output derivation.
-, enableLanguagesHash ? (if enableLanguages == null # all languages
-                         then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
-                         else null)
-}:
-
-let tessdata = stdenv.mkDerivation ({
-  name = "tessdata";
-  src = fetchFromGitHub {
-    owner = "tesseract-ocr";
-    repo = "tessdata";
-    rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
-    # when updating don't forget to update the default value fo enableLanguagesHash
-    sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
-  };
-  buildCommand = ''
-    cd $src;
-    for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do
-      install -Dt $out/share/tessdata $src/$lang ;
-    done;
-  '';
-  preferLocalBuild = true;
-  } // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
-  # when a hash is given, we make this a fixed output derivation.
-  outputHashMode = "recursive";
-  outputHashAlgo = "sha256";
-  outputHash = enableLanguagesHash;
-}));
-in
+{ stdenv, fetchurl, fetchFromGitHub, autoreconfHook, pkgconfig
+, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
 
 stdenv.mkDerivation rec {
   name = "tesseract-${version}";
@@ -51,17 +19,11 @@ stdenv.mkDerivation rec {
 
   LIBLEPT_HEADERSDIR = "${leptonica}/include";
 
-  postInstall = ''
-    for i in ${tessdata}/share/tessdata/*; do
-      ln -s $i $out/share/tessdata;
-    done
-  '';
-
   meta = {
     description = "OCR engine";
     homepage = https://github.com/tesseract-ocr/tesseract;
     license = stdenv.lib.licenses.asl20;
-    maintainers = with stdenv.lib.maintainers; [viric];
+    maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
     platforms = with stdenv.lib.platforms; linux ++ darwin;
   };
 }
diff --git a/pkgs/applications/graphics/tesseract/tesseract4.nix b/pkgs/applications/graphics/tesseract/tesseract4.nix
index 2ebca09b831f..df321023c741 100644
--- a/pkgs/applications/graphics/tesseract/tesseract4.nix
+++ b/pkgs/applications/graphics/tesseract/tesseract4.nix
@@ -1,9 +1,5 @@
 { stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
-, leptonica, libpng, libtiff, icu, pango, opencl-headers
-
-# Supported list of languages or `null' for all available languages
-, enableLanguages ? null
-}:
+, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
 
 stdenv.mkDerivation rec {
   name = "tesseract-${version}";
@@ -16,46 +12,16 @@ stdenv.mkDerivation rec {
     sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
   };
 
-  tessdata = fetchFromGitHub {
-    owner = "tesseract-ocr";
-    repo = "tessdata";
-    rev = version;
-    sha256 = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
-  };
+  enableParallelBuilding = true;
 
   nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
   buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
 
-  # Copy the .traineddata files of the languages specified in enableLanguages
-  # into `$out/share/tessdata' and check afterwards if copying was successful.
-  postInstall = let
-    mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
-    mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
-    findLangArgs = if enableLanguages != null
-      then "\\( ${mkFindArgs enableLanguages} \\)"
-      else "-iname '*.traineddata'";
-  in ''
-    numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
-      ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
-
-    ${if enableLanguages != null then ''
-      expected=${toString (builtins.length enableLanguages)}
-    '' else ''
-      expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
-    ''}
-
-    if [ "$numLangs" -ne "$expected" ]; then
-      echo "Expected $expected languages, but $numLangs" \
-        "were copied to \`$out/share/tessdata'" >&2
-      exit 1
-    fi
-  '';
-
   meta = {
     description = "OCR engine";
     homepage = https://github.com/tesseract-ocr/tesseract;
     license = stdenv.lib.licenses.asl20;
-    maintainers = with stdenv.lib.maintainers; [viric];
-    platforms = with stdenv.lib.platforms; linux;
+    maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
+    platforms = with stdenv.lib.platforms; linux ++ darwin;
   };
 }
diff --git a/pkgs/applications/graphics/tesseract/wrapper.nix b/pkgs/applications/graphics/tesseract/wrapper.nix
new file mode 100644
index 000000000000..6645cd9bfe94
--- /dev/null
+++ b/pkgs/applications/graphics/tesseract/wrapper.nix
@@ -0,0 +1,75 @@
+# Wraps a Tesseract build (`tesseractBase`) together with a set of tessdata
+# language files, so that changing the languages does not rebuild Tesseract.
+{ stdenv, makeWrapper, tesseractBase, languages
+
+# A list of languages like [ "eng" "spa" … ] or `null` for all available languages
+, enableLanguages ? null
+
+# A list of files or a directory containing files
+, tessdata ? (
+    if enableLanguages == null then
+      languages.all
+    else stdenv.mkDerivation ({
+      name = "tessdata";
+      buildCommand = ''
+        for lang in ${stdenv.lib.concatMapStringsSep " " (x: x + ".traineddata") enableLanguages}; do
+          install -Dt $out ${languages.all}/$lang
+        done
+      '';
+      preferLocalBuild = true;
+    } // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
+      # when a hash is given, we make this a fixed output derivation.
+      outputHashMode = "recursive";
+      outputHashAlgo = "sha256";
+      outputHash = enableLanguagesHash;
+    }))
+  )
+
+, enableLanguagesHash ? null
+}:
+
+let
+  passthru = { inherit tesseractBase languages tessdata; };
+
+  # Variant of tesseractBase whose build only wraps the base binary, links/copies the
+  # base's include, share and lib, and links the selected tessdata into share/tessdata.
+  tesseractWithData = tesseractBase.overrideAttrs (_: {
+    inherit tesseractBase tessdata;
+
+    # makeWrapper is a build-time tool, so it belongs in nativeBuildInputs.
+    nativeBuildInputs = [ makeWrapper ];
+
+    buildCommand = ''
+      makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata
+
+      # Recursively link include, share
+      cp -rs --no-preserve=mode $tesseractBase/{include,share} $out
+
+      cp -r --no-preserve=mode $tesseractBase/lib $out
+      # Fixup the store paths in lib so that the tessdata from this derivation is used.
+      if (( ''${#tesseractBase} != ''${#out} )); then
+        echo "Can't replace store paths due to differing lengths"
+        exit 1
+      fi
+      find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \;
+
+      if [[ -d "$tessdata" ]]; then
+        ln -s $tessdata/* $out/share/tessdata
+      else
+        for lang in $tessdata; do
+          ln -s $lang $out/share/tessdata/''${lang#/nix/store*-}
+        done
+      fi
+
+      if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then
+        # This is a bug in Tesseract's internal tessdata discovery mechanism
+        echo "eng.traineddata must be present in tessdata for Tesseract to work"
+        exit 1
+      fi
+    '';
+  });
+
+  # For an empty language list the unwrapped base package is used; passthru is added in both cases.
+  tesseract = (if enableLanguages == [] then tesseractBase else tesseractWithData) // passthru;
+in
+  tesseract
diff --git a/pkgs/applications/misc/k2pdfopt/default.nix b/pkgs/applications/misc/k2pdfopt/default.nix
index 0e84283a9ef7..8f69abd3a987 100644
--- a/pkgs/applications/misc/k2pdfopt/default.nix
+++ b/pkgs/applications/misc/k2pdfopt/default.nix
@@ -75,19 +75,21 @@ stdenv.mkDerivation rec {
         cp ${src}/leptonica_mod/* src/
       '';
     });
-    tesseract_modded = tesseract.overrideAttrs (attrs: {
-      prePatch = ''
-        cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
-        cp ${src}/tesseract_mod/dawg.cpp api/
-        cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
-        cp ${src}/tesseract_mod/openclwrapper.h opencl/
-        cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
-        cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
-        cp ${src}/tesseract_mod/tesscapi.cpp api/
-        cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
-      '';
-      patches = [ ./tesseract.patch ];
-    });
+    tesseract_modded = tesseract.override {
+      tesseractBase = tesseract.tesseractBase.overrideAttrs (_: {
+        prePatch = ''
+          cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
+          cp ${src}/tesseract_mod/dawg.cpp api/
+          cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
+          cp ${src}/tesseract_mod/openclwrapper.h opencl/
+          cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
+          cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
+          cp ${src}/tesseract_mod/tesscapi.cpp api/
+          cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
+        '';
+        patches = [ ./tesseract.patch ];
+      });
+    };
   in
     [ zlib libpng ] ++
     optional enableGSL gsl ++