chore: function to map between standard and Tesseract language codes (#…

…1421)

### Summary

In order to convert between incompatible language codes from packages used for OCR, this change adds a function to map between any standard language codes and Tesseract-specific OCR codes. Users can input language information to `languages` in any Tesseract-supported langcode or any ISO 639 standard language code.

### Details

- Introduces the [python-iso639](https://pypi.org/project/python-iso639/) package for matching standard language codes. Recompiles all dependencies.
- If a language is not already supplied by the user as a Tesseract-specific langcode, supplies all possible script/orthography variants of the language to the Tesseract OCR agent.

### Test

Added many unit tests for a variety of language combinations, special cases, and variants. For general testing, call partition functions with any lang codes in the `languages` parameter (Tesseract or standard). For example,

```
from unstructured.partition.auto import partition
elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"])
print("\n\n".join([str(el) for el in elements]))
```

should supply `eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert` to Tesseract.
Unstructured-IO · Sep 18, 2023 · eb8ce89 · eb8ce89
1 parent 3a07d1e
commit eb8ce89
Show file tree

Hide file tree

Showing 14 changed files with 292 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Enhancements
 
+* **Add a function to map between Tesseract and standard language codes.** This allows users to input language information to the `languages` param in any Tesseract-supported langcode or any ISO 639 standard language code.
+
 ### Features
 
 ### Fixes
@@ -62,7 +64,7 @@
 * Update all connectors to use new downstream architecture
   * New click type added to parse comma-delimited string inputs
   * Some CLI options renamed
-
+ 
 ### Features
 
 ### Fixes

diff --git a/example-docs/chi_sim_image.jpeg b/example-docs/chi_sim_image.jpeg
diff --git a/example-docs/jpn-vert.jpeg b/example-docs/jpn-vert.jpeg
diff --git a/requirements/base.in b/requirements/base.in
@@ -9,3 +9,4 @@ requests
 beautifulsoup4
 emoji
 dataclasses-json
+python-iso639
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -36,6 +36,8 @@ nltk==3.8.1
     # via -r requirements/base.in
 packaging==23.1
     # via marshmallow
+python-iso639==2023.6.15
+    # via -r requirements/base.in
 python-magic==0.4.27
     # via -r requirements/base.in
 regex==2023.8.8

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -207,7 +207,7 @@ nbformat==5.9.2
     #   jupyter-server
     #   nbclient
     #   nbconvert
-nest-asyncio==1.5.7
+nest-asyncio==1.5.8
     # via ipykernel
 nodeenv==1.8.0
     # via pre-commit

diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
@@ -27,7 +27,7 @@ click==8.1.7
     # via
     #   -c requirements/base.txt
     #   flask
-contourpy==1.1.0
+contourpy==1.1.1
     # via matplotlib
 cssselect==1.2.0
     # via premailer
@@ -148,7 +148,7 @@ psutil==5.9.5
     # via visualdl
 pyclipper==1.3.0.post5
     # via unstructured-paddleocr
-pycryptodome==3.18.0
+pycryptodome==3.19.0
     # via bce-python-sdk
 pyparsing==3.0.9
     # via

diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -20,7 +20,7 @@ charset-normalizer==3.2.0
     #   requests
 coloredlogs==15.0.1
     # via onnxruntime
-contourpy==1.1.0
+contourpy==1.1.1
     # via matplotlib
 cryptography==41.0.3
     # via pdfminer-six
@@ -124,7 +124,7 @@ pillow==10.0.1
     #   pytesseract
     #   torchvision
     #   unstructured-pytesseract
-portalocker==2.7.0
+portalocker==2.8.2
     # via iopath
 protobuf==4.23.4
     # via

diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile requirements/ingest-azure.in
 #
-adlfs==2023.8.0
+adlfs==2023.9.0
     # via -r requirements/ingest-azure.in
 aiohttp==3.8.5
     # via adlfs
@@ -71,7 +71,7 @@ multidict==6.0.4
     # via
     #   aiohttp
     #   yarl
-portalocker==2.7.0
+portalocker==2.8.2
     # via msal-extensions
 pycparser==2.21
     # via cffi

diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
@@ -430,6 +430,20 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
                 assert point[1] is not math.nan
 
 
+def test_partition_image_formats_languages_for_tesseract():
+    filename = "example-docs/jpn-vert.jpeg"
+    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
+        image.partition_image(filename=filename, strategy="hi_res", languages=["jpn_vert"])
+        mock_process.assert_called_once_with(
+            filename,
+            is_image=True,
+            ocr_languages="jpn_vert",
+            ocr_mode="entire_page",
+            extract_tables=False,
+            model_name=None,
+        )
+
+
 def test_partition_image_warns_with_ocr_languages(caplog):
     filename = "example-docs/layout-parser-paper-fast.jpg"
     image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng")

diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -840,6 +840,20 @@ def test_add_chunking_strategy_on_partition_pdf(
     assert chunk_elements == chunks
 
 
+def test_partition_pdf_formats_languages_for_tesseract():
+    filename = "example-docs/DA-1p.pdf"
+    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
+        pdf.partition_pdf(filename=filename, strategy="hi_res", languages=["en"])
+        mock_process.assert_called_once_with(
+            filename,
+            is_image=False,
+            ocr_languages="eng",
+            ocr_mode="entire_page",
+            extract_tables=False,
+            model_name=None,
+        )
+
+
 def test_partition_pdf_warns_with_ocr_languages(caplog):
     filename = "example-docs/chevron-page.pdf"
     pdf.partition_pdf(filename=filename, strategy="hi_res", ocr_languages="eng")

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -369,6 +369,22 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
     assert elements[1].text.startswith("Zejiang Shen")
 
 
+def test_auto_partition_formats_languages_for_tesseract():
+    filename = "example-docs/chi_sim_image.jpeg"
+    with patch(
+        "unstructured_inference.inference.layout.process_file_with_model",
+    ) as mock_process_file_with_model:
+        partition(filename, strategy="hi_res", languages=["zh"])
+        mock_process_file_with_model.assert_called_once_with(
+            filename,
+            is_image=True,
+            ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert",
+            ocr_mode="entire_page",
+            extract_tables=False,
+            model_name=None,
+        )
+
+
 def test_auto_partition_warns_with_ocr_languages(caplog):
     filename = "example-docs/chevron-page.pdf"
     partition(filename=filename, strategy="hi_res", ocr_languages="eng")

diff --git a/test_unstructured/partition/test_lang.py b/test_unstructured/partition/test_lang.py
@@ -0,0 +1,47 @@
+from unstructured.partition import lang
+
+
+def test_prepare_languages_for_tesseract_with_one_language():
+    languages = ["en"]
+    assert lang.prepare_languages_for_tesseract(languages) == "eng"
+
+
+def test_prepare_languages_for_tesseract_special_case():
+    languages = ["osd"]
+    assert lang.prepare_languages_for_tesseract(languages) == "osd"
+
+    languages = ["equ"]
+    assert lang.prepare_languages_for_tesseract(languages) == "equ"
+
+
+def test_prepare_languages_for_tesseract_removes_empty_inputs():
+    languages = ["kbd", "es"]
+    assert lang.prepare_languages_for_tesseract(languages) == "spa+spa_old"
+
+
+def test_prepare_languages_for_tesseract_includes_variants():
+    languages = ["chi"]
+    assert (
+        lang.prepare_languages_for_tesseract(languages)
+        == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
+    )
+
+
+def test_prepare_languages_for_tesseract_with_multiple_languages():
+    languages = ["ja", "afr", "en", "equ"]
+    assert lang.prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ"
+
+
+def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
+    languages = ["zzz", "chi"]
+    assert (
+        lang.prepare_languages_for_tesseract(languages)
+        == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
+    )
+    assert "not a valid standard language code" in caplog.text
+
+
+def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
+    languages = ["kbd", "eng"]
+    assert lang.prepare_languages_for_tesseract(languages) == "eng"
+    assert "not a language supported by Tesseract" in caplog.text