From e4e25c9febe8e46874cbb91f0201cfaf5330a672 Mon Sep 17 00:00:00 2001
From: Wahab Alshahin <wahab.alshahin@gmail.com>
Date: Thu, 7 Sep 2023 17:30:18 -0400
Subject: [PATCH] Add clean_ligatures to core cleaners (#1326)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Background


[Ligatures](https://en.wikipedia.org/wiki/Ligature_(writing)#Ligatures_in_Unicode_(Latin_alphabets))
can sometimes show up during the text extraction process when they
should not. Very common examples of this are with the Latin `f` related
ligatures which can be **very subtle** to spot by eye (see example
below), but can wreak havoc later.

```python
"ﬀ": "ff",
"ﬁ": "fi",
"ﬂ": "fl",
"ﬃ": "ffi",
"ﬄ": "ffl",
```

Several libraries already do something like this. Most recently,
`pdfplumber` added this sort of capability as part of the text
extraction process, see https://github.com/jsvine/pdfplumber/issues/598

Instead of incorporating any sort of breaking change to the PDF text
processing in `unstructured`, it is best to add this as another cleaner
and allow users to opt in. In turn, the `clean_ligatures` method has
been added in this PR - with accompanying tests.

# Example

Here is an example PDF that causes the issue. For example: `Beneﬁts`,
which should be `Benefits`.


[example.pdf](https://github.com/Unstructured-IO/unstructured/files/12544344/example.pdf)

```bash
curl -X 'POST' \
    'https://api.unstructured.io/general/v0/general' \
    -H 'accept: application/json' \
    -H 'Content-Type: multipart/form-data' \
    -H 'unstructured-api-key: ${UNSTRUCTURED_API_KEY}' \
    -F 'files=@example.pdf' \
    -s | jq -C .
```

# Notes

An initial list of mappings was added with the most common ligatures.
There is some subjectivity to this, but this should be a relatively safe
starting set. Can always be expanded as needed.
---
 CHANGELOG.md                            |  1 +
 test_unstructured/cleaners/test_core.py | 26 +++++++++++++++++++++
 unstructured/cleaners/core.py           | 31 +++++++++++++++++++++++++
 3 files changed, 58 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd43366509..e8bc56ad25 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 ### Features
 
 * Add Jira Connector to be able to pull issues from a Jira organization
+* Add `clean_ligatures` function to expand ligatures in text
 
 ### Fixes
 
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index 3f2b7a7775..eec8edd2b9 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -62,6 +62,32 @@ def test_clean_ordered_bullets(text, expected):
     assert core.clean_ordered_bullets(text=text) == expected
 
 
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("The æther is a classic element.", "The aether is a classic element."),
+        ("In old texts, Æsop's fables are", "In old texts, AEsop's fables are"),
+        ("The buﬀer zone is there.", "The buffer zone is there."),
+        ("The ﬁle was found in the system.", "The file was found in the system."),
+        ("She had a ﬂower in her hair.", "She had a flower in her hair."),
+        ("The coﬃn was placed in the grave.", "The coffin was placed in the grave."),
+        ("The buﬄe zone was clearly marked.", "The buffle zone was clearly marked."),
+        ("The craﬅsman worked with dedication.", "The craftsman worked with dedication."),
+        ("The symbol ʪ is very rare.", "The symbol ls is very rare."),
+        ("The word 'cœur' means 'heart' in French.", "The word 'coeur' means 'heart' in French."),
+        ("The word 'Œuvre' refers to the works", "The word 'OEuvre' refers to the works"),
+        ("The ȹ symbol is used in some contexts.", "The qp symbol is used in some contexts."),
+        ("The poﬆman delivers mail daily.", "The postman delivers mail daily."),
+        (
+            "The symbol ʦ can be found in certain alphabets.",
+            "The symbol ts can be found in certain alphabets.",
+        ),
+    ],
+)
+def test_clean_ligatures(text, expected):
+    assert core.clean_ligatures(text=text) == expected
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 49c206844b..70682af42c 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -68,6 +68,37 @@ def clean_ordered_bullets(text) -> str:
     return text_cl
 
 
+def clean_ligatures(text) -> str:
+    """Replaces ligatures with their most likely equivalent characters.
+
+    Example
+    -------
+    The beneﬁts -> The benefits
+    High quality ﬁnancial -> High quality financial
+    """
+    ligatures_map = {
+        "æ": "ae",
+        "Æ": "AE",
+        "ﬀ": "ff",
+        "ﬁ": "fi",
+        "ﬂ": "fl",
+        "ﬃ": "ffi",
+        "ﬄ": "ffl",
+        "ﬅ": "ft",
+        "ʪ": "ls",
+        "œ": "oe",
+        "Œ": "OE",
+        "ȹ": "qp",
+        "ﬆ": "st",
+        "ʦ": "ts",
+    }
+    cleaned_text: str = text
+    for k, v in ligatures_map.items():
+        cleaned_text = cleaned_text.replace(k, v)
+
+    return cleaned_text
+
+
 def group_bullet_paragraph(paragraph: str) -> list:
     """Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
     For example: