3
0
Fork 0
forked from mirrors/nixpkgs
nixpkgs/pkgs/development/python-modules/pyocr/paths.patch

349 lines
15 KiB
Diff
Raw Normal View History

2020-03-10 12:00:00 +00:00
diff --git a/src/pyocr/cuneiform.py b/src/pyocr/cuneiform.py
index 2e5b717..35647e2 100644
--- a/src/pyocr/cuneiform.py
+++ b/src/pyocr/cuneiform.py
@@ -25,13 +25,9 @@ from . import builders
from .error import CuneiformError
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
-# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
-CUNEIFORM_CMD = 'cuneiform'
2018-09-10 16:27:56 +01:00
+CUNEIFORM_CMD = '@cuneiform@/bin/cuneiform'
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
-CUNEIFORM_DATA_POSSIBLE_PATHS = [
- "/usr/local/share/cuneiform",
- "/usr/share/cuneiform",
-]
2018-09-10 16:27:56 +01:00
+CUNEIFORM_DATA_POSSIBLE_PATHS = ['@cuneiform@/share/cuneiform']
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
LANGUAGES_LINE_PREFIX = "Supported languages: "
LANGUAGES_SPLIT_RE = re.compile("[^a-z]")
2020-03-10 12:00:00 +00:00
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
index a068e73..9ebea5c 100644
--- a/src/pyocr/libtesseract/tesseract_raw.py
+++ b/src/pyocr/libtesseract/tesseract_raw.py
@@ -2,7 +2,6 @@ import ctypes
import locale
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
import logging
import os
-import sys
from ..error import TesseractError
2020-03-10 12:00:00 +00:00
@@ -10,48 +9,16 @@ from ..error import TesseractError
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
logger = logging.getLogger(__name__)
2020-03-10 12:00:00 +00:00
TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None)
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
-libnames = []
2020-03-10 12:00:00 +00:00
+if TESSDATA_PREFIX is None:
+ TESSDATA_PREFIX = '@tesseract@/share/tessdata'
+ os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
+
+
# 70 is the minimum credible dpi for tesseract and force it to compute an
# estimate of the image dpi
DPI_DEFAULT = 70
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
-
2020-03-10 12:00:00 +00:00
-if getattr(sys, 'frozen', False): # pragma: no cover
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
- # Pyinstaller integration
- libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")]
- libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")]
- tessdata = os.path.join(sys._MEIPASS, "data")
- if not os.path.exists(os.path.join(tessdata, "tessdata")):
- logger.warning(
- "Running from container, but no tessdata ({}) found !".format(
- tessdata
- )
- )
- else:
- TESSDATA_PREFIX = tessdata
-
-
2020-03-10 12:00:00 +00:00
-if sys.platform[:3] == "win": # pragma: no cover
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
- libnames += [
- # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on
- # Windows ?
- "../vs2010/DLL_Release/libtesseract302.dll",
2018-09-10 16:27:56 +01:00
- # prefer the most recent first
- "libtesseract305.dll",
- "libtesseract304.dll",
- "libtesseract303.dll",
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
- "libtesseract302.dll",
2018-09-10 16:27:56 +01:00
- "libtesseract400.dll", # Tesseract 4 is still in alpha stage
- "libtesseract.dll",
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-4.dll",
- "C:\\Program Files (x86)\\Tesseract-OCR\\libtesseract-3.dll",
- ]
-else:
- libnames += [
- "libtesseract.so.4",
- "libtesseract.so.3",
- ]
-
2018-09-10 16:27:56 +01:00
+libnames = [ "@tesseract@/lib/libtesseract.so" ]
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
2018-09-10 16:27:56 +01:00
g_libtesseract = None
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
2020-03-10 12:00:00 +00:00
@@ -364,12 +331,12 @@ def init(lang=None):
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
try:
if lang:
lang = lang.encode("utf-8")
- prefix = None
2020-03-10 12:00:00 +00:00
- if TESSDATA_PREFIX: # pragma: no cover
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
- prefix = TESSDATA_PREFIX.encode("utf-8")
2020-03-10 12:00:00 +00:00
+
+ prefix = TESSDATA_PREFIX
+
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
g_libtesseract.TessBaseAPIInit3(
ctypes.c_void_p(handle),
- ctypes.c_char_p(prefix),
+ ctypes.c_char_p(prefix.encode('utf-8')),
ctypes.c_char_p(lang)
)
g_libtesseract.TessBaseAPISetVariable(
2020-03-10 12:00:00 +00:00
diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
index 7c30852..44e8446 100644
--- a/src/pyocr/tesseract.py
+++ b/src/pyocr/tesseract.py
@@ -28,8 +28,7 @@ from .builders import DigitBuilder # backward compatibility
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
from .error import TesseractError # backward compatibility
from .util import digits_only
-# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
-TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract'
2018-09-10 16:27:56 +01:00
+TESSERACT_CMD = '@tesseract@/bin/tesseract'
python/pyocr: 0.4.6 -> 0.4.7 Upstream changes: * Tesseract 4.00.00alpha: * Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0)) * Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3 if available * Support for Tesseract 3.05.00: * Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf' * Libtesseract: If available, use TessBaseAPIDetectOrientationScript() instead of TessBaseAPIDetectOS * Libtesseract: * Workaround: Prevents possible segfault in image_to_string() when the target language is not available Full upstream change log can be found at: https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog The tesseract.patch for supporting Tesseract version 3.05.00 has been applied upstream and we can safely drop it. We now use substituteInPlace in conjunction with a patch to insert the relevant store paths instead of sed, so it's less fragile whenever we have upstream changes in handling of these paths. I've tested this by reverting 48a941e29faa95e897f and applying a build fix patch of Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an experimental version that can't be fixed on behalf of pyocr (the reason is that pyocr needs to get a list of languages, which doesn't work in that version anymore). In addition to that I've successfully built paperwork-backend which by now is the one package which depends on pyocr. However, I didn't do runtime tests of Paperwork. Signed-off-by: aszlig <aszlig@redmoonstudios.org> Cc: @7c6f434c
2017-09-02 04:18:38 +01:00
TESSDATA_EXTENSION = ".traineddata"
2020-03-10 12:00:00 +00:00
diff --git a/tests/tests_cuneiform.py b/tests/tests_cuneiform.py
index 45b7f6a..95f55c6 100644
--- a/tests/tests_cuneiform.py
+++ b/tests/tests_cuneiform.py
@@ -21,7 +21,7 @@ class TestCuneiform(BaseTest):
# XXX is it useful?
which.return_value = True
self.assertTrue(cuneiform.is_available())
- which.assert_called_once_with("cuneiform")
+ which.assert_called_once_with("@cuneiform@/bin/cuneiform")
@patch("subprocess.Popen")
def test_version(self, popen):
@@ -54,7 +54,7 @@ class TestCuneiform(BaseTest):
self.assertIn("eng", langs)
self.assertIn("fra", langs)
popen.assert_called_once_with(
- ["cuneiform", "-l"],
+ ["@cuneiform@/bin/cuneiform", "-l"],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
@@ -109,7 +109,7 @@ class TestCuneiformTxt(BaseTest):
output = cuneiform.image_to_string(self.image)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -125,7 +125,7 @@ class TestCuneiformTxt(BaseTest):
builder=self.builder)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
+ ["@cuneiform@/bin/cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
"-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
@@ -142,7 +142,7 @@ class TestCuneiformTxt(BaseTest):
builder=self.builder)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -173,7 +173,7 @@ class TestCuneiformTxt(BaseTest):
output = cuneiform.image_to_string(image, builder=self.builder)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -227,7 +227,7 @@ class TestCuneiformWordBox(BaseTest):
output = cuneiform.image_to_string(self.image,
builder=self.builder)
popen.assert_called_once_with(
- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -280,7 +280,7 @@ class TestCuneiformLineBox(BaseTest):
output = cuneiform.image_to_string(self.image,
builder=self.builder)
popen.assert_called_once_with(
- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
index ad7fdc9..57e7a60 100644
--- a/tests/tests_libtesseract.py
+++ b/tests/tests_libtesseract.py
@@ -165,7 +165,8 @@ class TestLibTesseractRaw(BaseTest):
args = libtess.TessBaseAPIInit3.call_args[0]
self.assertEqual(len(args), 3)
self.assertEqual(args[0].value, self.handle)
- self.assertEqual(args[1].value, None)
+ # we hardcode tesseract data, so we don't get None
+ #self.assertEqual(args[1].value, None)
self.assertEqual(args[2].value, lang.encode() if lang else None)
self.assertEqual(
@@ -201,7 +202,8 @@ class TestLibTesseractRaw(BaseTest):
args = libtess.TessBaseAPIInit3.call_args[0]
self.assertEqual(len(args), 3)
self.assertEqual(args[0].value, self.handle)
- self.assertEqual(args[1].value, None)
+ # we hardcode tesseract data, so we don't get None
+ #self.assertEqual(args[1].value, None)
self.assertEqual(args[2].value, lang.encode() if lang else None)
self.assertEqual(
diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
index 1a55567..a24d96f 100644
--- a/tests/tests_tesseract.py
+++ b/tests/tests_tesseract.py
@@ -36,7 +36,7 @@ class TestTesseract(BaseTest):
def test_available(self, which):
which.return_value = True
self.assertTrue(tesseract.is_available())
- which.assert_called_once_with("tesseract")
+ which.assert_called_once_with("@tesseract@/bin/tesseract")
@patch("subprocess.Popen")
def test_version_error(self, popen):
@@ -156,7 +156,7 @@ class TestTesseract(BaseTest):
for lang in ("eng", "fra", "jpn", "osd"):
self.assertIn(lang, langs)
popen.assert_called_once_with(
- ["tesseract", "--list-langs"],
+ ["@tesseract@/bin/tesseract", "--list-langs"],
startupinfo=None, creationflags=0,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
@@ -171,7 +171,7 @@ class TestTesseract(BaseTest):
self.assertEqual(te.exception.status, 1)
self.assertEqual("unable to get languages", te.exception.message)
popen.assert_called_once_with(
- ["tesseract", "--list-langs"],
+ ["@tesseract@/bin/tesseract", "--list-langs"],
startupinfo=None, creationflags=0,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
@@ -248,7 +248,7 @@ class TestTesseract(BaseTest):
self.assertEqual(status, 0)
self.assertEqual(error, message)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "output"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "output"],
cwd=tmpdir,
startupinfo=None,
creationflags=0,
@@ -271,7 +271,7 @@ class TestTesseract(BaseTest):
self.assertEqual(status, 0)
self.assertEqual(error, message)
popen.assert_called_with(
- ["tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
+ ["@tesseract@/bin/tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
cwd=tmpdir,
startupinfo=None,
creationflags=0,
@@ -302,7 +302,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -338,7 +338,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -371,7 +371,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout",
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout",
"--psm", "0", "-l", "osd"],
stdin=subprocess.PIPE,
shell=False,
@@ -399,7 +399,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -433,7 +433,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -467,7 +467,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -500,7 +500,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -527,7 +527,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -561,7 +561,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,