enhancement: improve json detection by detect_filetype (#971)

* update regex pattern * improve json regex pattern checks and add test file * update file name * update tests and formatting * update changelog and version
Unstructured-IO · Jul 25, 2023 · f282a10 · f282a10
1 parent f7def03
commit f282a10
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.8.2-dev5
+## 0.8.2-dev6
 
 ### Enhancements
 
@@ -9,6 +9,7 @@
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
 * Add `--encoding` directive to ingest
+* Improve json detection by `detect_filetype`
 
 ### Features
 

diff --git a/example-docs/fake-incomplete-json.txt b/example-docs/fake-incomplete-json.txt
@@ -0,0 +1,10 @@
+{
+  "name": "John Doe",
+  "age": 30,
+  "email": "johndoe@example.com",
+  "is_student": true,
+  "address": {
+    "city": "New York",
+    "zipcode": "10001"
+  },
+  "hobbies": ["reading", "running", "cooking"]
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -77,6 +77,7 @@ def pages(self):
         ("README.rst", FileType.RST),
         ("README.md", FileType.MD),
         ("fake.odt", FileType.ODT),
+        ("fake-incomplete-json.txt", FileType.JSON),
     ],
 )
 def test_detect_filetype_from_filename(file, expected):
@@ -103,6 +104,7 @@ def test_detect_filetype_from_filename(file, expected):
         ("fake-doc.rtf", FileType.RTF),
         ("spring-weather.html.json", FileType.JSON),
         ("fake.odt", FileType.ODT),
+        ("fake-incomplete-json.txt", FileType.TXT),
     ],
 )
 def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
@@ -139,6 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
         ("stanley-cups.tsv", FileType.TSV),
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
+        ("fake-incomplete-json.txt", FileType.JSON),
     ],
 )
 def test_detect_filetype_from_file(file, expected):

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.8.2-dev5"  # pragma: no cover
+__version__ = "0.8.2-dev6"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -11,7 +11,7 @@
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import Element, PageBreak
 from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
-from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
+from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
 from unstructured.partition.common import (
     _add_element_metadata,
     _remove_element_metadata,
@@ -300,9 +300,6 @@ def detect_filetype(
             encoding = "utf-8"
         formatted_encoding = format_encoding_str(encoding)
 
-        if extension in PLAIN_TEXT_EXTENSIONS:
-            return EXT_TO_FILETYPE.get(extension)
-
         # NOTE(crag): for older versions of the OS libmagic package, such as is currently
         # installed on the Unstructured docker image, .json files resolve to "text/plain"
         # rather than "application/json". this corrects for that case.
@@ -315,6 +312,9 @@ def detect_filetype(
         if file and _check_eml_from_buffer(file=file) is True:
             return FileType.EML
 
+        if extension in PLAIN_TEXT_EXTENSIONS:
+            return EXT_TO_FILETYPE.get(extension)
+
         # Safety catch
         if mime_type in STR_TO_FILETYPE:
             return STR_TO_FILETYPE[mime_type]
@@ -417,7 +417,15 @@ def _is_text_file_a_json(
 ):
     """Detects if a file that has a text/plain MIME type is a JSON file."""
     file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
-    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
+    text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)
+
+    if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
+        return False
+
+    if not re.match(JSON_PATTERN, file_text):
+        return False
+
+    return True
 
 
 def _count_commas(text: str):

diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
@@ -108,4 +108,17 @@
 # NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
 # format for document elements
 LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
-JSON_PATTERN = r"^(?:\{.*\}|\[.*\])$"
+
+# (?s) DOTALL flag: "." also matches newline characters
+# \{(?=.*:) opening brace followed somewhere later by at least one colon
+# .*? any characters (non-greedy)
+# (?:\}|$) non-capturing group that matches either the closing brace } or the end of
+# the string, so an object that is cut off before its closing brace still matches
+# | or
+# \[(?s:.*?)\] opening bracket [, any characters (non-greedy), then a closing bracket ]
+# (?:$|,|\]) non-capturing group requiring the closing bracket to be followed by the
+# end of the string, a comma, or another closing bracket
+JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
+
+# taken from https://stackoverflow.com/a/3845829/12406158
+VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"