diff --git a/CHANGELOG.md b/CHANGELOG.md index b3c2d28752..3eb3a1c45a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.8.2-dev5 +## 0.8.2-dev6 ### Enhancements @@ -9,6 +9,7 @@ * set the file's current position to the beginning after reading the file in `convert_to_bytes` * Add slide notes to pptx * Add `--encoding` directive to ingest +* Improve json detection by `detect_filetype` ### Features diff --git a/example-docs/fake-incomplete-json.txt b/example-docs/fake-incomplete-json.txt new file mode 100644 index 0000000000..3d487461b2 --- /dev/null +++ b/example-docs/fake-incomplete-json.txt @@ -0,0 +1,10 @@ +{ + "name": "John Doe", + "age": 30, + "email": "johndoe@example.com", + "is_student": true, + "address": { + "city": "New York", + "zipcode": "10001" + }, + "hobbies": ["reading", "running", "cooking"] diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 105bcfe029..4a0ac9424e 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -77,6 +77,7 @@ def pages(self): ("README.rst", FileType.RST), ("README.md", FileType.MD), ("fake.odt", FileType.ODT), + ("fake-incomplete-json.txt", FileType.JSON), ], ) def test_detect_filetype_from_filename(file, expected): @@ -103,6 +104,7 @@ def test_detect_filetype_from_filename(file, expected): ("fake-doc.rtf", FileType.RTF), ("spring-weather.html.json", FileType.JSON), ("fake.odt", FileType.ODT), + ("fake-incomplete-json.txt", FileType.TXT), ], ) def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected): @@ -139,6 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte ("stanley-cups.tsv", FileType.TSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), + ("fake-incomplete-json.txt", FileType.JSON), ], ) def test_detect_filetype_from_file(file, expected): diff --git a/unstructured/__version__.py 
b/unstructured/__version__.py index 4ba261c4ad..1041e80727 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.2-dev5" # pragma: no cover +__version__ = "0.8.2-dev6" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 80cf56cc65..78372af831 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -11,7 +11,7 @@ from unstructured.documents.coordinates import PixelSpace from unstructured.documents.elements import Element, PageBreak from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str -from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN +from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS from unstructured.partition.common import ( _add_element_metadata, _remove_element_metadata, @@ -300,9 +300,6 @@ def detect_filetype( encoding = "utf-8" formatted_encoding = format_encoding_str(encoding) - if extension in PLAIN_TEXT_EXTENSIONS: - return EXT_TO_FILETYPE.get(extension) - # NOTE(crag): for older versions of the OS libmagic package, such as is currently # installed on the Unstructured docker image, .json files resolve to "text/plain" # rather than "application/json". this corrects for that case. 
@@ -315,6 +312,9 @@ def detect_filetype( if file and _check_eml_from_buffer(file=file) is True: return FileType.EML + if extension in PLAIN_TEXT_EXTENSIONS: + return EXT_TO_FILETYPE.get(extension) + # Safety catch if mime_type in STR_TO_FILETYPE: return STR_TO_FILETYPE[mime_type] @@ -417,7 +417,15 @@ def _is_text_file_a_json( ): """Detects if a file that has a text/plain MIME type is a JSON file.""" file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding) - return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None + text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text) + + if not re.match(VALID_JSON_CHARACTERS, text_without_strings): + return False + + if not re.match(JSON_PATTERN, file_text): + return False + + return True def _count_commas(text: str): diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index 8fbba1234f..d582312c3b 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -108,4 +108,17 @@ # NOTE(robinson) - Used to detect if text is in the expected "list of dicts" # format for document elements LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?" -JSON_PATTERN = r"^(?:\{.*\}|\[.*\])$" + +# (?s) DOTALL flag: "." matches any character, including newlines +# \{(?=.*:) opening brace with at least one colon somewhere after it (lookahead) +# .*? any characters (non-greedy) +# (?:\}|$) non-capturing group that matches either the closing brace } or the end of +# the string to handle cases where the JSON is cut off +# | or +# \[(?s:.*?)\] an opening bracket [, any characters (non-greedy), and the first closing bracket ] +# (?:$|,|\]) non-capturing group that matches either the end of the string, a comma, +# or the closing bracket to handle cases where the JSON array is cut off +JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])" + +# valid JSON characters outside of strings, from https://stackoverflow.com/a/3845829/12406158 +VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"