Skip to content

Commit

Permalink
refactor: simplify JSON detection and add tests (#975)
Browse files Browse the repository at this point in the history
* refactor json detection

* version and changelog

* fix mock in test
  • Loading branch information
MthwRobinson authored Jul 25, 2023
1 parent f282a10 commit d694cd5
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 19 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.8.2-dev6
## 0.8.2-dev7

### Enhancements

* Additional tests and refactor of JSON detection.
* Update functionality to retrieve image metadata from a page for `document_to_element_list`
* Links are now tracked in `partition_html` output.
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add min_partition kwarg to that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx
* Add `--encoding` directive to ingest
Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def pages(self):
("README.rst", FileType.RST),
("README.md", FileType.MD),
("fake.odt", FileType.ODT),
("fake-incomplete-json.txt", FileType.JSON),
("fake-incomplete-json.txt", FileType.TXT),
],
)
def test_detect_filetype_from_filename(file, expected):
Expand Down Expand Up @@ -141,7 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("fake-incomplete-json.txt", FileType.JSON),
("fake-incomplete-json.txt", FileType.TXT),
],
)
def test_detect_filetype_from_file(file, expected):
Expand Down
15 changes: 14 additions & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,19 @@ def test_auto_partition_json_from_filename():
assert json_data == json_elems


def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
    """A valid JSON file that is not a list of dicts (the Unstructured ISD
    format) cannot be partitioned and should raise a ValueError."""
    # NOTE(robinson) - '{"hi": "there"}' is well-formed JSON, but not ISD
    unprocessable_path = os.path.join(tmpdir, "unprocessable.json")
    with open(unprocessable_path, "w") as json_file:
        json_file.write('{"hi": "there"}')

    with pytest.raises(ValueError):
        partition(filename=unprocessable_path)


@pytest.mark.xfail(
reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
)
Expand Down Expand Up @@ -525,7 +538,7 @@ def test_auto_partition_odt_from_file():
@pytest.mark.parametrize(
("content_type", "routing_func", "expected"),
[
("application/json", "json", "application/json"),
("text/csv", "csv", "text/csv"),
("text/html", "html", "text/html"),
("jdsfjdfsjkds", "pdf", None),
],
Expand Down
14 changes: 14 additions & 0 deletions test_unstructured/partition/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,17 @@ def test_partition_json_from_text_exclude_metadata(filename: str):

for i in range(len(test_elements)):
assert any(test_elements[i].metadata.to_dict()) is False


def test_partition_json_raises_with_unprocessable_json():
    """Valid JSON that is not a list of dicts (the Unstructured ISD format)
    should be rejected with a ValueError."""
    # NOTE(robinson) - well-formed JSON, but a bare object rather than ISD
    non_isd_text = '{"hi": "there"}'
    with pytest.raises(ValueError):
        partition_json(text=non_isd_text)


def test_partition_json_raises_with_invalid_json():
    """Malformed JSON (trailing extra bracket) should raise a ValueError."""
    malformed_text = '[{"hi": "there"}]]'
    with pytest.raises(ValueError):
        partition_json(text=malformed_text)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.2-dev6" # pragma: no cover
__version__ = "0.8.2-dev7" # pragma: no cover
23 changes: 16 additions & 7 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import inspect
import json
import os
import re
import zipfile
Expand All @@ -11,7 +12,7 @@
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import Element, PageBreak
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.partition.common import (
_add_element_metadata,
_remove_element_metadata,
Expand Down Expand Up @@ -417,15 +418,23 @@ def _is_text_file_a_json(
):
"""Detects if a file that has a text/plain MIME type is a JSON file."""
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)

if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
try:
json.loads(file_text)
return True
except json.JSONDecodeError:
return False

if not re.match(JSON_PATTERN, file_text):
return False

return True
def is_json_processable(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    file_text: Optional[str] = None,
    encoding: Optional[str] = "utf-8",
) -> bool:
    """Return True when the JSON content appears to be a list of dicts.

    Exactly one of ``filename``, ``file``, or ``file_text`` must be provided.
    When the text is not supplied directly, only the start of the file is
    read for the check (via ``_read_file_start_for_type_check``), and the
    match is a regex test against ``LIST_OF_DICTS_PATTERN`` — not a full
    JSON parse.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)
    text_to_check = file_text
    if text_to_check is None:
        text_to_check = _read_file_start_for_type_check(
            file=file,
            filename=filename,
            encoding=encoding,
        )
    return bool(re.match(LIST_OF_DICTS_PATTERN, text_to_check))


def _count_commas(text: str):
Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
STR_TO_FILETYPE,
FileType,
detect_filetype,
is_json_processable,
)
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
Expand Down Expand Up @@ -227,6 +228,11 @@ def partition(
**kwargs,
)
elif filetype == FileType.JSON:
if not is_json_processable(filename=filename, file=file):
raise ValueError(
"Detected a JSON file that does not conform to the Unstructured schema. "
"partition_json currently only processes serialized Unstructured output.",
)
elements = partition_json(filename=filename, file=file, **kwargs)
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
elements = partition_xlsx(filename=filename, file=file, **kwargs)
Expand Down
15 changes: 9 additions & 6 deletions unstructured/partition/json.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json
import re
from typing import IO, List, Optional

from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.file_utils.filetype import (
FileType,
add_metadata_with_filetype,
is_json_processable,
)
from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements

Expand Down Expand Up @@ -48,9 +50,10 @@ def partition_json(
elif text is not None:
file_text = str(text)

# NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
if not re.match(LIST_OF_DICTS_PATTERN, file_text):
raise ValueError("Json schema does not match the Unstructured schema")
if not is_json_processable(file_text=file_text):
raise ValueError(
"JSON cannot be partitioned. Schema does not match the Unstructured schema.",
)

try:
dict = json.loads(file_text)
Expand Down

0 comments on commit d694cd5

Please sign in to comment.