diff --git a/nixos/lib/testing.nix b/nixos/lib/testing.nix index 57b4412d9bb7..c0b4041d7e30 100644 --- a/nixos/lib/testing.nix +++ b/nixos/lib/testing.nix @@ -116,7 +116,7 @@ in rec { vms = map (m: m.config.system.build.vm) (lib.attrValues nodes); - ocrProg = tesseract_4.override { enableLanguages = [ "eng" ]; }; + ocrProg = tesseract4.override { enableLanguages = [ "eng" ]; }; imagemagick_tiff = imagemagick_light.override { inherit libtiff; }; diff --git a/pkgs/applications/graphics/tesseract/4.x.nix b/pkgs/applications/graphics/tesseract/4.x.nix deleted file mode 100644 index 2ebca09b831f..000000000000 --- a/pkgs/applications/graphics/tesseract/4.x.nix +++ /dev/null @@ -1,61 +0,0 @@ -{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig -, leptonica, libpng, libtiff, icu, pango, opencl-headers - -# Supported list of languages or `null' for all available languages -, enableLanguages ? null -}: - -stdenv.mkDerivation rec { - name = "tesseract-${version}"; - version = "4.0.0"; - - src = fetchFromGitHub { - owner = "tesseract-ocr"; - repo = "tesseract"; - rev = version; - sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8"; - }; - - tessdata = fetchFromGitHub { - owner = "tesseract-ocr"; - repo = "tessdata"; - rev = version; - sha256 = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28"; - }; - - nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ]; - buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ]; - - # Copy the .traineddata files of the languages specified in enableLanguages - # into `$out/share/tessdata' and check afterwards if copying was successful. - postInstall = let - mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}"; - mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg; - findLangArgs = if enableLanguages != null - then "\\( ${mkFindArgs enableLanguages} \\)" - else "-iname '*.traineddata'"; - in '' - numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \ - ${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)" - - ${if enableLanguages != null then '' - expected=${toString (builtins.length enableLanguages)} - '' else '' - expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)" - ''} - - if [ "$numLangs" -ne "$expected" ]; then - echo "Expected $expected languages, but $numLangs" \ - "were copied to \`$out/share/tessdata'" >&2 - exit 1 - fi - ''; - - meta = { - description = "OCR engine"; - homepage = https://github.com/tesseract-ocr/tesseract; - license = stdenv.lib.licenses.asl20; - maintainers = with stdenv.lib.maintainers; [viric]; - platforms = with stdenv.lib.platforms; linux; - }; -} diff --git a/pkgs/applications/graphics/tesseract/default.nix b/pkgs/applications/graphics/tesseract/default.nix index 7940079d0994..840c87de216b 100644 --- a/pkgs/applications/graphics/tesseract/default.nix +++ b/pkgs/applications/graphics/tesseract/default.nix @@ -1,67 +1,18 @@ -{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig -, leptonica, libpng, libtiff, icu, pango, opencl-headers -# Supported list of languages or `null' for all available languages -, enableLanguages ? null -# if you want just a specific list of languages, optionally specify a hash -# to make tessdata a fixed output derivation. -, enableLanguagesHash ? (if enableLanguages == null # all languages - then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw" - else null) -}: +{ callPackage, lowPrio }: -let tessdata = stdenv.mkDerivation ({ - name = "tessdata"; - src = fetchFromGitHub { - owner = "tesseract-ocr"; - repo = "tessdata"; - rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d"; - # when updating don't forget to update the default value fo enableLanguagesHash - sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7"; - }; - buildCommand = '' - cd $src; - for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do - install -Dt $out/share/tessdata $src/$lang ; - done; - ''; - preferLocalBuild = true; - } // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) { - # when a hash is given, we make this a fixed output derivation. - outputHashMode = "recursive"; - outputHashAlgo = "sha256"; - outputHash = enableLanguagesHash; - })); +let + base3 = callPackage ./tesseract3.nix {}; + base4 = callPackage ./tesseract4.nix {}; + languages = callPackage ./languages.nix {}; in - -stdenv.mkDerivation rec { - name = "tesseract-${version}"; - version = "3.05.00"; - - src = fetchFromGitHub { - owner = "tesseract-ocr"; - repo = "tesseract"; - rev = version; - sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30"; +{ + tesseract3 = callPackage ./wrapper.nix { + tesseractBase = base3; + languages = languages.v3; }; - enableParallelBuilding = true; - - nativeBuildInputs = [ pkgconfig autoreconfHook ]; - buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ]; - - LIBLEPT_HEADERSDIR = "${leptonica}/include"; - - postInstall = '' - for i in ${tessdata}/share/tessdata/*; do - ln -s $i $out/share/tessdata; - done - ''; - - meta = { - description = "OCR engine"; - homepage = https://github.com/tesseract-ocr/tesseract; - license = stdenv.lib.licenses.asl20; - maintainers = with stdenv.lib.maintainers; [viric]; - platforms = with stdenv.lib.platforms; linux ++ darwin; - }; + tesseract4 = lowPrio (callPackage ./wrapper.nix { + tesseractBase = base4; + languages = languages.v4; + }); } diff --git a/pkgs/applications/graphics/tesseract/fetch-language-hashes b/pkgs/applications/graphics/tesseract/fetch-language-hashes new file mode 100755 index 000000000000..c431f1d97c26 --- /dev/null +++ b/pkgs/applications/graphics/tesseract/fetch-language-hashes @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Usage: +# ./fetch-language-hashes […] +# +# Fetches all languages if no language codes are given. +# +# Example: +# ./fetch-language-hashes 4.0.0 eng spa +# +# Output: +# eng = "0iy0..."; +# spa = "15kw..."; + +set -e + +(( $# >= 1 )) || exit 1 +tessdataRev=$1 +shift + +if (( $# > 0 )); then + langCodes="$@" +else + repoPage=$(curl -fs https://github.com/tesseract-ocr/tessdata/tree/$tessdataRev || { + >&2 echo "Invalid tessdataRev: $tessdataRev" + exit 1 + }) + langCodes=$(echo $(echo "$repoPage" | grep -ohP "(?<=/)[^/]+?(?=\.traineddata)" | sort)) +fi + +for lang in $langCodes; do + url=https://github.com/tesseract-ocr/tessdata/raw/$tessdataRev/$lang.traineddata + hash=$(nix-prefetch-url $url 2>/dev/null) + echo "$lang = \"$hash\";" +done diff --git a/pkgs/applications/graphics/tesseract/languages.nix b/pkgs/applications/graphics/tesseract/languages.nix new file mode 100644 index 000000000000..08512a5cdd9d --- /dev/null +++ b/pkgs/applications/graphics/tesseract/languages.nix @@ -0,0 +1,289 @@ +{ stdenv, lib, fetchurl, fetchFromGitHub }: + +rec { + makeLanguages = { tessdataRev, tessdata ? null, all ? null, languages ? {} }: + let + tessdataSrc = fetchFromGitHub { + owner = "tesseract-ocr"; + repo = "tessdata"; + rev = tessdataRev; + sha256 = tessdata; + }; + + languageFile = lang: sha256: fetchurl { + url = "https://github.com/tesseract-ocr/tessdata/raw/${tessdataRev}/${lang}.traineddata"; + inherit sha256; + }; + in + { + # Use a simple fixed-output derivation for all languages to increase nix eval performance + all = stdenv.mkDerivation { + name = "all"; + buildCommand = '' + mkdir $out + cd ${tessdataSrc} + cp *.traineddata $out + ''; + outputHashMode = "recursive"; + outputHashAlgo = "sha256"; + outputHash = all; + }; + } // (lib.mapAttrs languageFile languages); + + v3 = makeLanguages { + tessdataRev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d"; + tessdata = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7"; + all = "0yj6h9n6h0kzzcqsn3z87vsi8pa60szp0yiayb0znd0v9my0dqhn"; + + # Run `./fetch-language-hashes ` to generate these hashes + languages = { + afr = "15dsnzy4i9ai26ilm73gkfj4ck039raa88i6w443c4b1fnay2akf"; + amh = "1wbcsdq3svxga3j1alk61xs72a9fhsfsyjxhp3cwxfaqfhrzg7h4"; + ara = "0nk495gki6jbbnwcl2ybsx4nd02d6qykcjncq0d2g8pbgapqmj91"; + asm = "0c3wq15yphq7x74s2sn3f90k6z1cf5j7ic62z0dynidrv99bddfh"; + aze = "0pz073hxqkx1a1cshlgg5k11lj73s52sdxa7k3020drc314lhaxw"; + aze_cyrl = "0djbfgx28ykcjsn2p0766qrmj256g7vhc7valc3ivsva8b906lxq"; + bel = "04zqy8vik0fcakq6apfp8wjhkkhlg0yn9kmag1lk7s8fy9ax3ws2"; + ben = "0q7812kn5xjm47hcgdcg911lhbgqr7hbvqckfxxm8qw0yjx2cy0m"; + bod = "0rwq7539zzfs8xs0bf1535z1cwkm0yk1ni25f5gjav7nm6qpiaan"; + bos = "1qr04dj7lx347gxpin5nfprbggmxq2mwx8kf3pcc3vb5x3pa57g4"; + bul = "0cyyqgi3i4y9bfzwls0lwljzgd0r8ayfqb4bbvdh4qmbni9x42ya"; + cat = "0kgw8f5pdw9lfbn6cfp5n1s0j8pj3418yx6rsbagzcf1gr36gbr9"; + ceb = "1g1n4np4vhar7wfwx2km5k6kldb600rrl7npfbf75229rar068f1"; + ces = "0zxkkyhpd74i6321nv86pkjb0k7p9cp6m174rbn42nl7jz6qxib0"; + chi_sim = "0k250xr0gk9yh22yqxd0zpxdsrqfzs164kdv5n9rxx1g996yffij"; + chi_tra = "03nxqpd546p0gwfj6pqzbdbv5zjpdddzlpa10xn4nvmks1mmckbp"; + chr = "1k1sg3hap0kd5aa36ysvmhp7r3fynxf0f7lzz814h6p3g250zclb"; + cym = "0d6wbf9cmrrzf66mhcckwdfy3xh2i38r0by9nk6isw9rl7bf7j07"; + dan = "1s1yj56rpzmif3ir3qs4iab744cgpflk7y8812z2665bh61illpr"; + dan_frak = "1bxi53ymib5g0139vfd2pflh7nl5925vqznq3sfgaqx7gdx630vi"; + deu = "0fna7fqk1a8ivd7q2k38vx37qm3vbn183zh4z5zfqb4pgqmb8znb"; + deu_frak = "1y4krkvarg7jxhcq49fgybg4phbn58y9c0z2bm8mnp28jkih1cnb"; + dzo = "1fcz0imi7zxi99762pxfcm5iz2jcbqj3s742magka4ihrxnz07xm"; + ell = "0r0f71jy4y29bg055qvvy93wchi3lh08zz0k9c8l7466b03yvq5v"; + eng = "0vghah8kqcv0n5fnjb88w6siz156ysrc41fckw3f2y8c3sgmqlf0"; + enm = "10y61xv3w1ypgqz5rgb22y5hh1i4zx03cwiqw21ifqvg4xdrln46"; + epo = "1y5lh55mbcx33cm7qlf1dcah8ffycxmlcpzjzx9r6ij14fdd4964"; + equ = "1nqrd0a9jqqh6byy8snfhad1hisrc92dcx44wsy7v4nf40j3mx1s"; + est = "12ll8lq1hjcsq9hh93020w78r7f1rcxcwlvrjqw8j5p3k9jg5a4g"; + eus = "034s9mp7lw1a4yvf2cmbbj2fbqbaq6xnjqh30yn0wq0c0jck96nw"; + fas = "0m61p4byc0kzf75cdn6g18s8hcg9r8ifs34wr85lbsb65kil4ijx"; + fin = "1wac333k0lcd5jwprzg99b10bq8sdc96b9d6275kg9imyqjwcc7q"; + fra = "1ax7i0nw1lwkz4sbrvn4z0lcrcai77ymdpla7qk7yij6s4xb5bw6"; + frk = "16nmr71p93724vk1x5mq4r8vxpwnm448p6dwqv8scg8asch1cidp"; + frm = "00yz3hz7wcralq8wbx1ap4c6b37ac6vnz5bgmxmgdx0kqzibiddn"; + gle = "1n8z8kmn5m628rlzgz5v0iw6h46aalflq5asa1wj5rygx1y2azpa"; + glg = "0fdniayplc3iwmlmvhblarh1gm97dp8rqhhkb8b0clwfd9cj342z"; + grc = "04r2193qcxqyab5998xn8bf7197wiccmjm7iakij8d0c7l61dnxb"; + guj = "0dp8mlxmf0x9wb8dg0c508sdwz03icq94z8ji8jhwgdqgv8hw1al"; + hat = "0793mmlxbb09c8103jhdvlczz647nyn4ykkgd3gwgavncmjh72v8"; + heb = "16za9ff1i3ya6hz75l9v3v7j4039kscxxw21g3i2w5p9zn52hyag"; + hin = "1vnn5wpc724kgib8jbx0kpnnp4al60ivqir72gnbyh6cpnflb6bf"; + hrv = "15rqd6xiv2bdmalb5s6rxvw0yk6w9agn9fli3bvi703q6vpj2yn3"; + hun = "19zzwdxwi3h3vdsgr271i1m87gfpdirk6b1ljw2j8qmfilp4sw56"; + iku = "1v1yvc1194qycjgb4ihh5hpj6472nlbp66dii183514g2dh9x0db"; + ind = "120d4b41wvsgcd1sgy2mp78i9hvi7w03a63078dz1yds0yqdwf1p"; + isl = "003ngk8dfv6dglkq8pmi6jsglrfkc65js5ywh3vvkg7qfqf6qsxz"; + ita = "1lxklk3zc3x3k8yfpp6ygyv7fndgs57dfasc97rh8782ds16wkjs"; + ita_old = "188gby1y51pa1ycyc8y17d16hs5w27yl5ch7xzni98bdjkwbkl1z"; + jav = "1fjyjznjchls5ifbnx2b9xagisgxvgj9lsf39rr9d87sbzdbbwbp"; + jpn = "1wmayj8wh3pfwznjhalad2qzv38mhrzw2sxl71mycvzvpdy9ag1w"; + kan = "0hak4953whw9vd9dzl0hq076kzb19kk45kmfxk03af4k6gb206vg"; + kat = "16k0057cvvdc6snm5svhdv3cr7cw71g74yy8215njjbsi838imi3"; + kat_old = "02gl755d38plyvzwfjqxvjgfqkbjs9rvzx33qfhm2zvmgbwrfrfh"; + kaz = "0hc36w7zz5waycsk220v0r83sg991gd5f5r937mvz44viql80sgm"; + khm = "1gb2nv5qdq5fz9w9xq4fj68p46b62sd1m986ra5qbnskxqizr12s"; + kir = "1b1ing6qqi8qqfh4xpk76rp4gxp69wdjdl5m777ayx3v02d7nhh3"; + kor = "1rldj6f8h1nn5wpx57b0ci7p0fnivnwzgaf0d3576xls26z2wcgv"; + kur = "1cp2pfd6g662gvxi7ywkxfbfq1lwbis888bf1gg8ynzy342mx1ic"; + lao = "03bdaxakmxpbbr9vsnbzzfksvm6js0l5i0ijwl71piqyxqjj1gxf"; + lat = "1q7v7drnwpna9k2l79jbdlxiv1j617rqzjc9d48h3lfrma5z97sj"; + lav = "0fxzyvw7n67rmw2irvlghkf1bii4w47200zv26p0v3a9dwvhc7sg"; + lit = "0f00ggjjqrl94kwwjmjqwajyfprsml0br8vhn2gvn11gaxvm52hm"; + mal = "1i83plhin3m6sq8p92vzlyng5z59gvvqypyh7rnmvdmm9rranx8a"; + mar = "0ay7q53yl3709crvn5l9c9jx7hw6m5d3x2crmvnvczsh83ayfdik"; + mkd = "1q1wadcr4j1dzssyyqz43qmizc6vfqkbivr6xi2p7p4h9rl11x73"; + mlt = "1qp4v6habak1l7xrw322wglvjjndrfp4j7bj8d4npwbzk1sh4s0h"; + msa = "048p6mkx9zr40s9s5vbi0gnizhvqwn0g8i1hf1l8db7igbax5xyj"; + mya = "17nyr5bd42kzvid3421n3mwckd49vzrjhjahd8rnfsmbsy1x382l"; + nep = "154375r32sdmvcnp1ckvgbp3wxvb2xiiypb8bxbsvrabrz4wzjqc"; + nld = "1clwbky71zkz55zd3f8r9hj8fhpnbkply80p1js4fvs7x12r715x"; + nor = "1ynvrz6s0vmlq1xkjd8k2w6bx8770x6v29qgx83d4nl17ngjd459"; + ori = "0dsakc8gnwhs6z5kxc2wdkbn31gkkiqk5vriw0swghychp164aac"; + osd = "1zq0dfliavglmix7zzrqdxz1w01rm1f1x1352bqn8xf4zivdbxcw"; + pan = "1fwdpwkydfmr6drwgkqzn89z12r2rdm02a75vvdxhxg2a9yiwmbv"; + pol = "155z870ygzws476kp7qpzi8jcjcv3jb5px8rbzhnag1fklqr48hx"; + por = "1814cff2rffpzlg4hyyrjzpf5ps2i95rmpa4c8ikblbvrlcv97q8"; + pus = "1iz5nn1zfvn1l9gb1jriwx991d2hwwc7x4k1nvzjlwpzscplx25b"; + ron = "11lr80zhvnnngvwwk01z1d3prfpbh3qbwpl1nl5fp7h09d6n3wzl"; + rus = "1d6a8lg4bmd3np16jds1py3qpkaq4ahnhwghd5r0159y0jpxq00q"; + san = "169f4ajgwn99yfdfrlwfvdgvv1abal7fpdp31sknvq8l7w2sak3g"; + sin = "1411g18r6f6j6f4n0sn7ajgs4gkplb892s6ak0hi9nyyxwv3r1gm"; + slk = "0bxfbrg1nf6px0xzkh6ihdi71fmr1rxxs99qb191k7pm16x2lpds"; + slk_frak = "0zyqnn1y5cyx1y7wzgw743k4584ljl0rhvk2q1ni6jnjx9ciwzqy"; + slv = "1kjn9m9hbwp0m0p2v8c3skpzr6f8x42hz8x48zl22550a7hq8n1h"; + spa = "1npgl8ylvfm60hd4214z8a3lriy1hckhijschrbjpzmwdfcqafgj"; + spa_old = "0w4ivkv8flyn7bjlyjcrcrdnslkvrrfs7l33mvird1jhhkyqd8sx"; + sqi = "15wzvh6qm3yx7yf0k5j7g1imsaqxvq7r2xh6a0xgmkqbyypbbkdf"; + srp = "05blqriv30x02c80ds3x7zhw0y21nc6lkqlv5jwgwnjgw4yfpgrm"; + srp_latn = "0ss8s3q60aq8sd2a3sbnzvp13qqarxnjw4hij8hd9ab5gsjw0nwr"; + swa = "1pwwhx7ldq21cv06cchws8gvwsmkwn5sjcy9z3nk3nbp9qjsf44f"; + swe = "0l10iyn2cr7ibgk0akmpg8725mpwpydawgv3s77izsw7y6xhfr1a"; + syr = "08bxil13wyp5h4hvbxjcys7ypgqgg46rrp653m7gyv5q94ycjgb0"; + tam = "1g155kyba2wjfgzgy48g6yd2csinwbfjdi5r7vw0wm3dh1z39dvz"; + tel = "0fydrcb54b6mmqazb337x4s36i2a64sb4xm7y7g3nqqmk9afsipv"; + tgk = "0f6j37friywj7y132fv0jm6aj4sx8f0b7brspj3pbjqqpi4v5ws0"; + tgl = "0f1r0gicif57qhyw8xaa1sqgny720q3z5cpd5srrn9i6fihaz577"; + tha = "1y2hw55jfpidk95y8qbsiczgg2r2khabac97s1y3gl0v93a44jna"; + tir = "1y7iryhjr83ca4yh5jjz7qlnrx4kbrp0a0p650whjvk2gnv8m98h"; + tur = "0xqnq99b2jb4v74bj95py6wmg14dm31zp5s3l48dmcv6zdgcxg2w"; + uig = "1sdddr15zlb33kd1d7hzi5lfd15bfhqn105d7x6snfpqp7vq4bxv"; + ukr = "0cdwjnfnnmzz7jdn49l96vqgaimclfxcxaw09cm63f5my382r2rg"; + urd = "10xcn1zs2lfswp5yai0ckyg7js587qhr5cf7qib3i35qjbw7nc18"; + uzb = "1jkkd5j6vsx5jv5gwprbfwg1vwh714prm8j446wzvp74brmk949l"; + uzb_cyrl = "1kdia38rgm2qd3ly80a412jyagxxryr09h1nz2d0iw71bmfn4855"; + vie = "1ja18jxxaw282y4jljxpjf1gj15il61vc2ykpfy22vn88wvydxff"; + yid = "1jddd0g8mm5v00z5kb8rbpfs7ppzgq9kzm1xlhhvv960yfdbi6fd"; + }; + }; + + v4 = makeLanguages { + tessdataRev = "4.0.0"; + tessdata = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28"; + all = "0dqgkp369rcvq72yhgnzj1pj8yrv7kqzc7y6sqs7nzcq7l5qazlg"; + + # Run `./fetch-language-hashes ` to generate these hashes + languages = { + afr = "1a9f8pnrspfmcq9gpjnxn2kkhjlsmh912bnpx671fjizxpmiri2y"; + amh = "0m1vdyxjx57kmf2qra0p31k509y1cqn4pyckzw00i5n3wx11d2j0"; + ara = "0nswl6n0s94g900j5k1gwzp7m140c0yd9a2fdb2lzhdvg1krf190"; + asm = "025d9vrjcrwyd6cc6hrw1x8xqhicgrb9wpvhhmlw71ql04dadslf"; + aze = "01shcs78a6xn3my8p3y42x1c9f5hzfn83w2n2nwpffbgz4y2nsgf"; + aze_cyrl = "1sbd89i5r7rnkjh2in8j0plrxnfiill9jl8pr68iw77ghih6q1vg"; + bel = "0dhyymsxcyzwal8474q7ag3m2akv0b92hkdz7rka5z1cxry1cn8c"; + ben = "0a7q9414k3frn37x2qcglz722ysg2iivj6kqaaa0ik7z14ibc8v0"; + bod = "0rh7x54nlh6ir6ldccj8hi7g8hwlp13r3fkljw8gndvhwmgfkkar"; + bos = "1szym4n605hlx12a9vpz4jjs76jscajh22rgkqwbv4qdsl0gi3nd"; + bre = "070f4c84iznblsw4jkwpzh9dss8nfb678160szm5r8dlv2yinrrk"; + bul = "03bg2yw79lg8rl43y9288313jrfh0h69vl4s4cmlgbmnbx8pvxwj"; + cat = "19xs691aj8yy2ff07c3gzm07zicd5ha0gmcjxjh9pknqf2gfy7qv"; + ceb = "1896vn41hqc4anm6hjvrnn022i0p8pmhwsp5rv9w2cvr6738l79r"; + ces = "0fh2g47msfr91285rnccxcmcshihm126sqy496s4vrr0vk8ix1nf"; + chi_sim = "0qxkvbpm5l7gzsshnn72wfx473pprf5nmw8hd4i4x2qxnfddh1gw"; + chi_sim_vert = "1f75pzvxbda82vxa2zb1z9b9f13sh81kzaw45vg5118ncsklj8w7"; + chi_tra = "056vjws1fir1v5iv44pzykkxs5q1dbb2j8blhj47i53w1zf6g42m"; + chi_tra_vert = "10c9cdycg1a5kwlgg60sh8yp07w2fl4whinpxfhlzrzs56allql4"; + chr = "19qq8a6c27973djsc4xpcklis92r58x21fg4mz5azdyka5i1n46l"; + cos = "0z9kx1hw8h5n00pcahxla808wya50wrkk8cz7x676pd93ibyrlyx"; + cym = "13pk9cpf43xxqbz3blfz2av2yd1ma6ds6jbdiqw8anhhj7l9ch2d"; + dan = "1jirmahxvyyswhhyzhinvcqaycz7m3ixchqrj3lgfcdi3anvabr2"; + dan_frak = "17wcgdqxmbzn7qchnx5gsa05aj4wmhbwk43w173bl3wr6h5ylmh0"; + deu = "194rqsg4nlycca9bg2fqf15xgcl110rxp182l7dbjfjhar4knsw9"; + deu_frak = "12hhhp32f15c7fw2jp05mwim9ps14kmamhh6vmalvm7r2033vbm7"; + div = "09mm9r5hxhsc4qpyg10ym9mc2kdpawx8zk0aiv1xpgd35rzpyz41"; + dzo = "1zk7crgcazgqy5zmslp6iw4jws07nja31qdxx0rpzhn3c0bjgw1b"; + ell = "1hhym18a9411953j47xjk47jx9ij9xi2qwlx05c93zl41528nsqg"; + eng = "0iy07z182lwhqfa0q288ha691scpsry330aynaizn68wcmywk86s"; + enm = "1dhr1qvil38bil43wk5ci645sbm3my2y9y7qlcbnwz2p4pflayvm"; + epo = "1jig4db7050vww32vxsqyig3j1b0vgz9ipxbsw0jpkjia84k44n9"; + equ = "02qwg6s1z7pynwm0p6dvpwi04ivfkr1s7qgssbla1dx7v0ih6rlg"; + est = "1jxygahy6by7fbirbmjmd68k6560q1a3h5mvpzdx15h5fw0q58gl"; + eus = "0cai7nm7si8680avrrls8bf9ski980rvsj560fh9y6n9rz7mh9mp"; + fao = "1n3434jf18bzakbylzyg3jaw2ad4h376g56dsql32bgh2yvyww8a"; + fas = "17wjkfka9725rz32clgqgk9msmbz4axs59vz30jmhhxyrkliafqb"; + fil = "0p713k8g27df9z384ns111xqxii5kq20m8brflsmd3yckw1mibhz"; + fin = "1wc3y9nnm7rb2c2c5fkj7cv7jb27jlkb2bh0g8kaz57h6imfmb2g"; + fra = "04qrfvi6irlaahh1pgn5azyfhbhavm12yyybza8603alf8firh7a"; + frk = "05cqmxxxjqdl5hjyzi6dpmixnjpd6f3jr6741yapdmnxvkzxkiyp"; + frm = "0a86yy6hd0lvlbzvnzjmyapzc0rn7mnkdadqycd65bw1b714cvy2"; + fry = "0i84r8g9hlkr9nlhypl4lq6ncrhbcpskqkdcijgk88c2fdknh57h"; + gla = "17idyhb505waz9dnb8dsk54faw7y0xvvb12yw71k0skq3i90akar"; + gle = "1q87h5zzcva54pg364d3hl6q9hdlydlyj1qmq8n5k7hqk11msxmk"; + glg = "01xssz1rhpy3a0sm4i43nba61wc2srz6wv327vdw1kg8ijm0s0g4"; + grc = "00x0s3smx4wg5h12y2b9al0j2jk1y3f0yy2x6f2qf7ps831drgyl"; + guj = "028v4fgn0zi2044vk6j2rlqklc9i0kj22s52vhifmx1g02kz9154"; + hat = "1bca516pr2cnyjlwycc7pr6gfmdjb8565hp06pw9nwpr20ry0hss"; + heb = "1qfkffjh29b21frs0mv6llsrchixl5kjkpj1if7fq816g9mym9kx"; + hin = "1rkfam5c6qil2590lfffzndhq3bncdgf4ij0cyjcglgyljgx0xnc"; + hrv = "0da7b6mk0rwc9zlbqkycwjpddp3qpy07l643i00ia5a1zq35fmgp"; + hun = "0w2s4mn9p74zqzmp9hh2017zgsh5v43k4lid4pv29f4b0y5gj9xi"; + hye = "0ifzm875wlbjh4vkpmj1n6f14m8i174413l6pc6i44y4p5fpgxrf"; + iku = "19arnv82xbxhbcy8pf9fv1sl5zc5707mk34nh7w46dlz86qkidmn"; + ind = "1d421hizwni4m6sr4f3nqqpr1g744hzn0krk130m7x8mhzgamba5"; + isl = "1hjjw8k2r9qa990ziq5wxr36kyf16mnmrqfmq5vbcjprka9h08pq"; + ita = "1qyrvlf7pjxzyb29sc7aq3gq61bww14sijka44scxggfw7134l3r"; + ita_old = "1pf8461jbj0vpyry0b54crmkf2bk9mh4klxvmj09jvf0aq2vm9s6"; + jav = "18vvbyimj0y462amjmwvqa6h9n8l122j9v0w3hfp63hlxpfprm0m"; + jpn = "16hma9w32vdh41ihymp894jza72b0d235hwriv18r78j5n86nhbg"; + jpn_vert = "0yca09l9sbpfjgb2slnpb9q7qd7vz3a1wb6bkln30d3nl0d9r1rn"; + kan = "0lcmx37rjfxkbhhbrld1ndmkwkm9w9b3pzxhas0cv5dqsx2f84jd"; + kat = "1b164bgwa7bbvw4177h8fxfh0fbh4bycfl9pkaa184dpjpaiqpia"; + kat_old = "1mgff7sh93hdp3wh0ckikdggrdgf0syp75s39pickpbkp9ic41ai"; + kaz = "0h37y0kb5lwsp5zpl7bvxg3ryqldl5hxfnardliwgyqgnag951vi"; + khm = "0m7x1fynr18sid2kjjw8xa9ika0a0fc6a6hvc7ihizi47893hdfb"; + kir = "09kxwqpqf6kxjii07qlqsiii83zk12rszp88xnzzjp8rjsnk78s3"; + kor = "0nsr43fwrp9876ia1fc0zcviv2n8hw16n0wfh158vhygwglvy84m"; + kor_vert = "1wmvdznmikk9fq7wdffvn22scxmcl26vjh26jhicqwxpc7kg4bh8"; + kur = "0gbsf3ny3n5mgb30v54bz3crgnimdpg19jn633pbpzryzg3xhd25"; + kur_ara = "1sbj0cczhi9q119fbzpi0m6zr9kjp3k76bv9w8szkv1wc5y4fng6"; + lao = "1gvxlg8bw3a4c9izg3c2a2yl7q6rsy7z9y64axdw9a04pz2ndbl5"; + lat = "0b7an3q3xrf9c55bhiqqh7l45ga88l0kwvkp1akmlr98piach3vr"; + lav = "0fqsmy47cygamddxyjfrdgkfa9bvmrvf4csvppnkdvfzy6iiv0c2"; + lit = "0wjgbkwc3bf5khdqali7ylnhhs4xvpx19m3zx2y9s27v2wjbb6kv"; + ltz = "02zdxbniiqfl87fzsiaaqgldqfsv15z5hja1xhxnqpl0nds7shfc"; + mal = "0a41ifz8i6lj2ywxjkwvymxzxahkz2cjv4apbrawdj1h42bn7frd"; + mar = "00swhlh9bckvmlxanfmlw5j4n9qqhggl84bsq0827bmijsqwnl44"; + mkd = "1bqfiwxlzfpz4fs4z5ci2wbv01qhrcayk1inmk3dxq7dsywx1ajg"; + mlt = "1rmmga2aw88hr7q7cfr5cvhnsgnf1mi069d5k7z66zp4vzbl4zyz"; + mon = "1jksvcavn9plsmjdmhg40mwq5rlvrd1b9gvghdjg7zkf6qqqynlh"; + mri = "0jlfawx20s5clsnk82ndy3v2zidh4cfh4acrh8nindk21xmiwh5i"; + msa = "0m7zs8anaa3l4z5f3xvbhs4syp41dp4all2yfpi1plyr0hy784an"; + mya = "0hljm5haadlr4k5rhw4mvhkygcnrr709rvl7amz7av3nskmi8mb1"; + nep = "1dhy0m2h6xfgwibf92iwxsn926dmrhfvkg9rafkdaqcr4pq6w563"; + nld = "0bspf5bv1s7qzm6k4aqbpq91zvk4kxxhx5zv08w91xfsa1zpdxmi"; + nor = "08majhc9m0fjvac50yq52ia2af9kscclimwkv403klnj4kgf8ndq"; + oci = "1mzrw9gsdjrd1xj3zv7l5gzgjq5jrygxf8cfkz20d9lls0wj1xdv"; + ori = "1sh42mjzb1hv6l6lljp3wifjmz7wrv818f9f16m8qjikwqxm0s78"; + osd = "03mvfk1q1xp1klpf4bwna903rnp51bkqr3gl5hvxybvrc3l2m7z1"; + pan = "0165kr94p6x5yxzs4p8sfppvg9cywp65ps0xaym5rqz9iashz32h"; + pol = "0g0b71ms6ddgykmkna4mlavgzgmh9vj6s62fi8l4ja93nfpr37hp"; + por = "132jbhzmcsq8skanm15bw2niyx9xpbrqr411wn7w9r5i3cvnlv01"; + pus = "0iiglnkn478al11avigsav625pn7ifscycnxpj6fg8835vjww3xr"; + que = "01vkmfi9idjwskv5pllmrxpil0v5h7f7rzv5viclxrzkmbvrz9b5"; + ron = "0ag6vs0cn3sryavs1mfrallgdgi4h28114g7m61rhlhq0z484g0m"; + rus = "1hippm3w5d73sh50r136x0xff2p6x128ry2x4fywf6xdpv1f46v8"; + san = "1qlpqkr5c5wqcf1bvlipy72advqnvd4wm61vghmrj2sda8mx87sx"; + sin = "097d2s4ma0zsq0ab5qs1ylgl9l5phw91fnpsvb7vjmz2mw3ic964"; + slk = "0c97pp5iffhdzyma605x8q3rx1qq9pq2h6cai1kppaj92rz3ji9k"; + slk_frak = "16ivsam1g18zlpw6pgidvzwb7h8rvw1s10nigs6yfwir8hjxsgki"; + slv = "0644jlm55p0dg4zchgrashmbv36zb4x649ckmf2jkbss8bzx7wsf"; + snd = "1i2mfi4414l3v9nznjy7959y2jcr8ymvf6w8zpyrw6nad4d1aak7"; + spa = "15kwvr7cpcnlxm1ja1yyc022dmsd04gmk7h1p0df12aicsscn3qb"; + spa_old = "1jq80c4mi3rmwnfhb3mbaaq0ci101mgbibkji9ala4l5dkcwjra3"; + sqi = "19cvvixhz9906p4c9i2grpr386rbp5alp4fp14xm9nd81bmq4701"; + srp = "1jd25n13h6vxsa3gzbj6q6mdh02rjl4qrd1bffr5psp33asqvw0l"; + srp_latn = "1k7577mn3z0bm5ma9d8l14sn5wpvw50hq1nxwbc36yn3a5b3mhiz"; + sun = "0lvlaw3jfvr7b5v09669kq8mm19jdsk9g5h09jsa2gr6fvsq11pa"; + swa = "0qy9qc5pa1dzzqrh1z40gk845z1r4d2smywnzydknbb3n240lhz0"; + swe = "1y56r7bgzw0pqkdylbah07r1f0v03sblkggiql8x5200rhaxvqi4"; + syr = "1vfj5fsiv170jghryrxwyz0i9mdsaki1kglxrklkb2caal9kwy38"; + tam = "0rhhdbnp0a2hpg00vpc0xyxcl2w36i1kn63mrvwx1f9q7m3y1fmf"; + tat = "0a74rp8pyp4yivv2xcy2m8xgwch8scr3wmk1fzniwzf43fsrqp76"; + tel = "0gcq8hxhxvilyh7x7kiikq07hllqysc8sfyr88gvpj4xi092h2bx"; + tgk = "1458gk0k6gk49n8lr6fj7l7cwkhxn0lrhybzq10zl1ly7yzjhf67"; + tgl = "12yscwckdy3l21mvsrj1021gxw2isjrg369r08rsf7lh96wn4wkn"; + tha = "01f0j7gsc5slxaaql1gqbhk4wlwaxc29dlmfxwjzikxc46gjl0w8"; + tir = "1q6w48b1jchv55713pq20inzjjdymh32fw8wxfaj1qi7bjqfb9fk"; + ton = "06g60ga8rys8jaimqrvd4svh40qs1nz4bszdnf2hdv05ibryibdq"; + tur = "0g9g1wvibp61qbriy8ys948yfkl88xk9g8f93bnq8w8dx029b6s8"; + uig = "09sajx21lw3a3ph62dyqr10pjaq2mij10sdhkhvvjiydk34dn548"; + ukr = "14q8ls8gkrg7c9pc6qzm6yf5ady3i3303vs1hz4d2idcl6yry334"; + urd = "15vszhqraxqdcng1069p6i4xq3ck3904q207nkbap6dfpcpjig40"; + uzb = "03hyw0vavmjirqs4wkd5r85g91w2avsyl14z624fhm3gc66pqg7n"; + uzb_cyrl = "1433lrrp2lfgb1k0a4sc20b35b2jcl8f1z92vm2936y7w04xpaq7"; + vie = "02k40d3wji74d1jgvkr3zrn9gpzlmp0lqhrrdmc48r2sgvnrnk8n"; + yid = "0xnbvi04xv1qapqg72wa3bjwbw51pkdnyncjpjp37vn6dzh04l0z"; + yor = "07w3aci52ng6i6nyp97q5zb2dqlj08w6im90y1h691qah1x44zlv"; + }; + }; +} diff --git a/pkgs/applications/graphics/tesseract/tesseract3.nix b/pkgs/applications/graphics/tesseract/tesseract3.nix new file mode 100644 index 000000000000..db0e06434aa9 --- /dev/null +++ b/pkgs/applications/graphics/tesseract/tesseract3.nix @@ -0,0 +1,29 @@ +{ stdenv, fetchurl, fetchFromGitHub, autoreconfHook, pkgconfig +, leptonica, libpng, libtiff, icu, pango, opencl-headers }: + +stdenv.mkDerivation rec { + name = "tesseract-${version}"; + version = "3.05.00"; + + src = fetchFromGitHub { + owner = "tesseract-ocr"; + repo = "tesseract"; + rev = version; + sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30"; + }; + + enableParallelBuilding = true; + + nativeBuildInputs = [ pkgconfig autoreconfHook ]; + buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ]; + + LIBLEPT_HEADERSDIR = "${leptonica}/include"; + + meta = { + description = "OCR engine"; + homepage = https://github.com/tesseract-ocr/tesseract; + license = stdenv.lib.licenses.asl20; + maintainers = with stdenv.lib.maintainers; [ viric earvstedt ]; + platforms = with stdenv.lib.platforms; linux ++ darwin; + }; +} diff --git a/pkgs/applications/graphics/tesseract/tesseract4.nix b/pkgs/applications/graphics/tesseract/tesseract4.nix new file mode 100644 index 000000000000..df321023c741 --- /dev/null +++ b/pkgs/applications/graphics/tesseract/tesseract4.nix @@ -0,0 +1,27 @@ +{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig +, leptonica, libpng, libtiff, icu, pango, opencl-headers }: + +stdenv.mkDerivation rec { + name = "tesseract-${version}"; + version = "4.0.0"; + + src = fetchFromGitHub { + owner = "tesseract-ocr"; + repo = "tesseract"; + rev = version; + sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8"; + }; + + enableParallelBuilding = true; + + nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ]; + buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ]; + + meta = { + description = "OCR engine"; + homepage = https://github.com/tesseract-ocr/tesseract; + license = stdenv.lib.licenses.asl20; + maintainers = with stdenv.lib.maintainers; [ viric earvstedt ]; + platforms = with stdenv.lib.platforms; linux ++ darwin; + }; +} diff --git a/pkgs/applications/graphics/tesseract/wrapper.nix b/pkgs/applications/graphics/tesseract/wrapper.nix new file mode 100644 index 000000000000..365d68a9ee76 --- /dev/null +++ b/pkgs/applications/graphics/tesseract/wrapper.nix @@ -0,0 +1,58 @@ +{ stdenv, makeWrapper, tesseractBase, languages + +# A list of languages like [ "eng" "spa" … ] or `null` for all available languages +, enableLanguages ? null + +# A list of files or a directory containing files +, tessdata ? (if enableLanguages == null then languages.all + else map (lang: languages.${lang}) enableLanguages) + +# This argument is obsolete +, enableLanguagesHash ? null +}: + +let + passthru = { inherit tesseractBase languages tessdata; }; + + tesseractWithData = tesseractBase.overrideAttrs (_: { + inherit tesseractBase tessdata; + + buildInputs = [ makeWrapper ]; + + buildCommand = '' + makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata + + # Recursively link include, share + cp -rs --no-preserve=mode $tesseractBase/{include,share} $out + + cp -r --no-preserve=mode $tesseractBase/lib $out + # Fixup the store paths in lib so that the tessdata from this derivation is used. + if (( ''${#tesseractBase} != ''${#out} )); then + echo "Can't replace store paths due to differing lengths" + exit 1 + fi + find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \; + + if [[ -d "$tessdata" ]]; then + ln -s $tessdata/* $out/share/tessdata + else + for lang in $tessdata; do + ln -s $lang $out/share/tessdata/''${lang#/nix/store*-} + done + fi + + if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then + # This is a bug in Tesseract's internal tessdata discovery mechanism + echo "eng.traineddata must be present in tessdata for Tesseract to work" + exit 1 + fi + ''; + }); + + tesseract = (if enableLanguages == [] then tesseractBase else tesseractWithData) // passthru; +in + if enableLanguagesHash == null then + tesseract + else + stdenv.lib.warn "Argument `enableLanguagesHash` is obsolete and can be removed." + tesseract diff --git a/pkgs/applications/misc/k2pdfopt/default.nix b/pkgs/applications/misc/k2pdfopt/default.nix index 0e84283a9ef7..8f69abd3a987 100644 --- a/pkgs/applications/misc/k2pdfopt/default.nix +++ b/pkgs/applications/misc/k2pdfopt/default.nix @@ -75,19 +75,21 @@ stdenv.mkDerivation rec { cp ${src}/leptonica_mod/* src/ ''; }); - tesseract_modded = tesseract.overrideAttrs (attrs: { - prePatch = '' - cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/ - cp ${src}/tesseract_mod/dawg.cpp api/ - cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/ - cp ${src}/tesseract_mod/openclwrapper.h opencl/ - cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/ - cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/ - cp ${src}/tesseract_mod/tesscapi.cpp api/ - cp ${src}/include_mod/{tesseract.h,leptonica.h} api/ - ''; - patches = [ ./tesseract.patch ]; - }); + tesseract_modded = tesseract.override { + tesseractBase = tesseract.tesseractBase.overrideAttrs (_: { + prePatch = '' + cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/ + cp ${src}/tesseract_mod/dawg.cpp api/ + cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/ + cp ${src}/tesseract_mod/openclwrapper.h opencl/ + cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/ + cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/ + cp ${src}/tesseract_mod/tesscapi.cpp api/ + cp ${src}/include_mod/{tesseract.h,leptonica.h} api/ + ''; + patches = [ ./tesseract.patch ]; + }); + }; in [ zlib libpng ] ++ optional enableGSL gsl ++ diff --git a/pkgs/top-level/aliases.nix b/pkgs/top-level/aliases.nix index 8360bf447064..b56640049d87 100644 --- a/pkgs/top-level/aliases.nix +++ b/pkgs/top-level/aliases.nix @@ -306,6 +306,7 @@ mapAliases ({ terraform-provider-ibm = terraform-providers.ibm; # added 2018-09-28 terraform-provider-libvirt = terraform-providers.libvirt; # added 2018-09-28 terraform-provider-nixos = terraform-providers.nixos; # added 2018-09-28 + tesseract_4 = tesseract4; # added 2018-12-19 tex-gyre-bonum-math = tex-gyre-math.bonum; # added 2018-04-03 tex-gyre-pagella-math = tex-gyre-math.pagella; # added 2018-04-03 tex-gyre-schola-math = tex-gyre-math.schola; # added 2018-04-03 diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix index fccd863b9c7a..683a91420713 100644 --- a/pkgs/top-level/all-packages.nix +++ b/pkgs/top-level/all-packages.nix @@ -19523,8 +19523,10 @@ in termtosvg = callPackage ../tools/misc/termtosvg { }; - tesseract = callPackage ../applications/graphics/tesseract { }; - tesseract_4 = lowPrio (callPackage ../applications/graphics/tesseract/4.x.nix { }); + inherit (callPackage ../applications/graphics/tesseract {}) + tesseract3 + tesseract4; + tesseract = tesseract3; tetraproc = callPackage ../applications/audio/tetraproc { };