Skip to content

Commit

Permalink
chore: function to map between standard and Tesseract language codes (#…
Browse files Browse the repository at this point in the history
…1421)

### Summary
To convert between the incompatible language codes used by different
OCR packages, this change adds a function to map between any standard
language codes and Tesseract OCR-specific codes. Users can pass
language information to `languages` as any Tesseract-supported langcode
or any ISO 639 standard language code.

### Details
- Introduces the
[python-iso639](https://pypi.org/project/python-iso639/) package for
matching standard language codes. Recompiles all dependencies.
- If a language is not already supplied by the user as a
Tesseract-specific langcode, the function supplies all possible
script/orthography variants of that language to the Tesseract OCR agent.

### Test
Added many unit tests for a variety of language combinations, special
cases, and variants. For general testing, call partition functions with
any lang codes in the languages parameter (Tesseract or standard).

for example,
```
from unstructured.partition.auto import partition

elements = partition(filename="example-docs/layout-parser-paper.pdf", strategy="hi_res", languages=["en", "chi"])
print("\n\n".join([str(el) for el in elements]))
```
This should supply `eng+chi_sim+chi_sim_vert+chi_tra+chi_tra_vert` to Tesseract.
  • Loading branch information
shreyanid authored Sep 18, 2023
1 parent 3a07d1e commit eb8ce89
Show file tree
Hide file tree
Showing 14 changed files with 292 additions and 12 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

### Enhancements

* **Add a function to map between Tesseract and standard language codes.** This allows users to input language information to the `languages` param in any Tesseract-supported langcode or any ISO 639 standard language code.

### Features

### Fixes
Expand Down Expand Up @@ -62,7 +64,7 @@
* Update all connectors to use new downstream architecture
* New click type added to parse comma-delimited string inputs
* Some CLI options renamed

### Features

### Fixes
Expand Down
Binary file added example-docs/chi_sim_image.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added example-docs/jpn-vert.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions requirements/base.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ requests
beautifulsoup4
emoji
dataclasses-json
python-iso639
2 changes: 2 additions & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ nltk==3.8.1
# via -r requirements/base.in
packaging==23.1
# via marshmallow
python-iso639==2023.6.15
# via -r requirements/base.in
python-magic==0.4.27
# via -r requirements/base.in
regex==2023.8.8
Expand Down
2 changes: 1 addition & 1 deletion requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ nbformat==5.9.2
# jupyter-server
# nbclient
# nbconvert
nest-asyncio==1.5.7
nest-asyncio==1.5.8
# via ipykernel
nodeenv==1.8.0
# via pre-commit
Expand Down
4 changes: 2 additions & 2 deletions requirements/extra-paddleocr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ click==8.1.7
# via
# -c requirements/base.txt
# flask
contourpy==1.1.0
contourpy==1.1.1
# via matplotlib
cssselect==1.2.0
# via premailer
Expand Down Expand Up @@ -148,7 +148,7 @@ psutil==5.9.5
# via visualdl
pyclipper==1.3.0.post5
# via unstructured-paddleocr
pycryptodome==3.18.0
pycryptodome==3.19.0
# via bce-python-sdk
pyparsing==3.0.9
# via
Expand Down
4 changes: 2 additions & 2 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ charset-normalizer==3.2.0
# requests
coloredlogs==15.0.1
# via onnxruntime
contourpy==1.1.0
contourpy==1.1.1
# via matplotlib
cryptography==41.0.3
# via pdfminer-six
Expand Down Expand Up @@ -124,7 +124,7 @@ pillow==10.0.1
# pytesseract
# torchvision
# unstructured-pytesseract
portalocker==2.7.0
portalocker==2.8.2
# via iopath
protobuf==4.23.4
# via
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest-azure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile requirements/ingest-azure.in
#
adlfs==2023.8.0
adlfs==2023.9.0
# via -r requirements/ingest-azure.in
aiohttp==3.8.5
# via adlfs
Expand Down Expand Up @@ -71,7 +71,7 @@ multidict==6.0.4
# via
# aiohttp
# yarl
portalocker==2.7.0
portalocker==2.8.2
# via msal-extensions
pycparser==2.21
# via cffi
Expand Down
14 changes: 14 additions & 0 deletions test_unstructured/partition/pdf-image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,20 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
assert point[1] is not math.nan


def test_partition_image_formats_languages_for_tesseract():
    """A Tesseract-specific code in `languages` is forwarded to the layout model unchanged."""
    filename = "example-docs/jpn-vert.jpeg"
    # Keyword arguments the patched layout entry point is expected to receive.
    expected_kwargs = {
        "is_image": True,
        "ocr_languages": "jpn_vert",
        "ocr_mode": "entire_page",
        "extract_tables": False,
        "model_name": None,
    }
    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
        image.partition_image(filename=filename, strategy="hi_res", languages=["jpn_vert"])
        mock_process.assert_called_once_with(filename, **expected_kwargs)


def test_partition_image_warns_with_ocr_languages(caplog):
filename = "example-docs/layout-parser-paper-fast.jpg"
image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng")
Expand Down
14 changes: 14 additions & 0 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,20 @@ def test_add_chunking_strategy_on_partition_pdf(
assert chunk_elements == chunks


def test_partition_pdf_formats_languages_for_tesseract():
    """The standard code `en` in `languages` reaches the layout model as `eng`."""
    filename = "example-docs/DA-1p.pdf"
    # Keyword arguments the patched layout entry point is expected to receive.
    expected_kwargs = {
        "is_image": False,
        "ocr_languages": "eng",
        "ocr_mode": "entire_page",
        "extract_tables": False,
        "model_name": None,
    }
    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res", languages=["en"])
        mock_process.assert_called_once_with(filename, **expected_kwargs)


def test_partition_pdf_warns_with_ocr_languages(caplog):
filename = "example-docs/chevron-page.pdf"
pdf.partition_pdf(filename=filename, strategy="hi_res", ocr_languages="eng")
Expand Down
16 changes: 16 additions & 0 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,22 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
assert elements[1].text.startswith("Zejiang Shen")


def test_auto_partition_formats_languages_for_tesseract():
    """`zh` in `languages` reaches the layout model as the four `chi_*` Tesseract variants."""
    filename = "example-docs/chi_sim_image.jpeg"
    patch_target = "unstructured_inference.inference.layout.process_file_with_model"
    # Keyword arguments the patched layout entry point is expected to receive.
    expected_kwargs = {
        "is_image": True,
        "ocr_languages": "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert",
        "ocr_mode": "entire_page",
        "extract_tables": False,
        "model_name": None,
    }
    with patch(patch_target) as mock_process_file_with_model:
        partition(filename, strategy="hi_res", languages=["zh"])
        mock_process_file_with_model.assert_called_once_with(filename, **expected_kwargs)


def test_auto_partition_warns_with_ocr_languages(caplog):
filename = "example-docs/chevron-page.pdf"
partition(filename=filename, strategy="hi_res", ocr_languages="eng")
Expand Down
47 changes: 47 additions & 0 deletions test_unstructured/partition/test_lang.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unstructured.partition import lang


def test_prepare_languages_for_tesseract_with_one_language():
    """A single standard code (`en`) is converted to its Tesseract code (`eng`)."""
    result = lang.prepare_languages_for_tesseract(["en"])
    assert result == "eng"


def test_prepare_languages_for_tesseract_special_case():
    """The special codes `osd` and `equ` are returned unchanged."""
    for special_code in ("osd", "equ"):
        assert lang.prepare_languages_for_tesseract([special_code]) == special_code


def test_prepare_languages_for_tesseract_removes_empty_inputs():
    """An input that yields no Tesseract code (`kbd`) leaves no trace in the output string."""
    result = lang.prepare_languages_for_tesseract(["kbd", "es"])
    assert result == "spa+spa_old"


def test_prepare_languages_for_tesseract_includes_variants():
    """The input `chi` expands to every `chi_*` variant, joined with `+`."""
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    result = lang.prepare_languages_for_tesseract(["chi"])
    assert result == expected


def test_prepare_languages_for_tesseract_with_multiple_languages():
    """Several inputs are converted in order and joined into one `+`-separated string."""
    result = lang.prepare_languages_for_tesseract(["ja", "afr", "en", "equ"])
    assert result == "jpn+jpn_vert+afr+eng+equ"


def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
    """An unrecognized code (`zzz`) is omitted from the result and a warning is logged."""
    expected = "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    result = lang.prepare_languages_for_tesseract(["zzz", "chi"])
    assert result == expected
    assert "not a valid standard language code" in caplog.text


def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
    """A code Tesseract does not support (`kbd`) is omitted and a warning is logged."""
    result = lang.prepare_languages_for_tesseract(["kbd", "eng"])
    assert result == "eng"
    assert "not a language supported by Tesseract" in caplog.text
Loading

0 comments on commit eb8ce89

Please sign in to comment.