From e4e25c9febe8e46874cbb91f0201cfaf5330a672 Mon Sep 17 00:00:00 2001
From: Wahab Alshahin
Date: Thu, 7 Sep 2023 17:30:18 -0400
Subject: [PATCH] Add clean_ligatures to core cleaners (#1326)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Background

[Ligatures](https://en.wikipedia.org/wiki/Ligature_(writing)#Ligatures_in_Unicode_(Latin_alphabets)) can sometimes show up during the text extraction process when they should not. Very common examples of this are the Latin `f`-related ligatures, which can be **very subtle** to spot by eye (see the example below) but can wreak havoc later. Each key below is a single ligature character; each value is the plain-letter expansion.

```python
"ﬀ": "ff",    # U+FB00
"ﬁ": "fi",    # U+FB01
"ﬂ": "fl",    # U+FB02
"ﬃ": "ffi",   # U+FB03
"ﬄ": "ffl",   # U+FB04
```

Several libraries already do something like this. Most recently, `pdfplumber` added this sort of capability as part of the text extraction process, see https://github.com/jsvine/pdfplumber/issues/598

Instead of incorporating any sort of breaking change to the PDF text processing in `unstructured`, it is best to add this as another cleaner and allow users to opt in. Accordingly, this PR adds the `clean_ligatures` function, with accompanying tests.

# Example

Here is an example PDF that causes the issue. For example, the extracted text contains `Beneﬁts` (spelled with the single-character `ﬁ` ligature, U+FB01), which should be `Benefits`.

[example.pdf](https://github.com/Unstructured-IO/unstructured/files/12544344/example.pdf)

```bash
curl -X 'POST' \
  'https://api.unstructured.io/general/v0/general' \
  -H 'accept: application/json' \
  -H 'Content-Type: multipart/form-data' \
  -H "unstructured-api-key: ${UNSTRUCTURED_API_KEY}" \
  -F 'files=@example.pdf' \
  -s | jq -C .
```

# Notes

An initial list of mappings was added with the most common ligatures. There is some subjectivity to this, but this should be a relatively safe starting set. It can always be expanded as needed.
---
 CHANGELOG.md                            |  1 +
 test_unstructured/cleaners/test_core.py | 26 +++++++++++++++++
 unstructured/cleaners/core.py           | 38 +++++++++++++++++++++++++
 3 files changed, 65 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd43366509..e8bc56ad25 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 ### Features
 
 * Add Jira Connector to be able to pull issues from a Jira organization
+* Add `clean_ligatures` function to expand ligatures in text
 
 ### Fixes
 
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index 3f2b7a7775..eec8edd2b9 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -62,6 +62,32 @@ def test_clean_ordered_bullets(text, expected):
     assert core.clean_ordered_bullets(text=text) == expected
 
 
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("The æther is a classic element.", "The aether is a classic element."),
+        ("In old texts, Æsop's fables are", "In old texts, AEsop's fables are"),
+        ("The bu\ufb00er zone is there.", "The buffer zone is there."),
+        ("The \ufb01le was found in the system.", "The file was found in the system."),
+        ("She had a \ufb02ower in her hair.", "She had a flower in her hair."),
+        ("The co\ufb03n was placed in the grave.", "The coffin was placed in the grave."),
+        ("The bu\ufb04e zone was clearly marked.", "The buffle zone was clearly marked."),
+        ("The cra\ufb05sman worked with dedication.", "The craftsman worked with dedication."),
+        ("The symbol ʪ is very rare.", "The symbol ls is very rare."),
+        ("The word 'cœur' means 'heart' in French.", "The word 'coeur' means 'heart' in French."),
+        ("The word 'Œuvre' refers to the works", "The word 'OEuvre' refers to the works"),
+        ("The ȹ symbol is used in some contexts.", "The qp symbol is used in some contexts."),
+        ("The po\ufb06man delivers mail daily.", "The postman delivers mail daily."),
+        (
+            "The symbol ʦ can be found in certain alphabets.",
+            "The symbol ts can be found in certain alphabets.",
+        ),
+    ],
+)
+def test_clean_ligatures(text, expected):
+    assert core.clean_ligatures(text=text) == expected
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 49c206844b..70682af42c 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -68,6 +68,44 @@ def clean_ordered_bullets(text) -> str:
     return text_cl
 
 
+def clean_ligatures(text: str) -> str:
+    """Replaces ligatures with their most likely equivalent characters.
+
+    Only the ligatures listed in the mapping below are expanded; every other
+    character is returned unchanged.
+
+    Example
+    -------
+    The bene\ufb01ts -> The benefits
+    High quality \ufb01nancial -> High quality financial
+    """
+    # The Latin f/s ligatures are spelled as escapes on purpose: the glyphs are
+    # nearly indistinguishable from the plain letters, and Unicode compatibility
+    # normalization (NFKC/NFKD) silently rewrites them to those plain letters,
+    # which would turn these entries into no-ops.
+    ligatures_map = {
+        "æ": "ae",
+        "Æ": "AE",
+        "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
+        "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
+        "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
+        "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
+        "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
+        "\ufb05": "ft",  # LATIN SMALL LIGATURE LONG S T
+        "ʪ": "ls",
+        "œ": "oe",
+        "Œ": "OE",
+        "ȹ": "qp",
+        "\ufb06": "st",  # LATIN SMALL LIGATURE ST
+        "ʦ": "ts",
+    }
+    cleaned_text: str = text
+    for k, v in ligatures_map.items():
+        cleaned_text = cleaned_text.replace(k, v)
+
+    return cleaned_text
+
+
 def group_bullet_paragraph(paragraph: str) -> list:
     """Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
     For example: