Skip to content

Commit

Permalink
enhancement: improve json detection by detect_filetype (#971)
Browse files Browse the repository at this point in the history
* update regex pattern

* improve json regex pattern checks and add test file

* update file name

* update tests and formatting

* update changelog and version
  • Loading branch information
Coniferish authored Jul 25, 2023
1 parent f7def03 commit f282a10
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 8 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.8.2-dev5
## 0.8.2-dev6

### Enhancements

Expand All @@ -9,6 +9,7 @@
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx
* Add `--encoding` directive to ingest
* Improve json detection by `detect_filetype`

### Features

Expand Down
10 changes: 10 additions & 0 deletions example-docs/fake-incomplete-json.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"name": "John Doe",
"age": 30,
"email": "johndoe@example.com",
"is_student": true,
"address": {
"city": "New York",
"zipcode": "10001"
},
"hobbies": ["reading", "running", "cooking"]
3 changes: 3 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def pages(self):
("README.rst", FileType.RST),
("README.md", FileType.MD),
("fake.odt", FileType.ODT),
("fake-incomplete-json.txt", FileType.JSON),
],
)
def test_detect_filetype_from_filename(file, expected):
Expand All @@ -103,6 +104,7 @@ def test_detect_filetype_from_filename(file, expected):
("fake-doc.rtf", FileType.RTF),
("spring-weather.html.json", FileType.JSON),
("fake.odt", FileType.ODT),
("fake-incomplete-json.txt", FileType.TXT),
],
)
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
Expand Down Expand Up @@ -139,6 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("fake-incomplete-json.txt", FileType.JSON),
],
)
def test_detect_filetype_from_file(file, expected):
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.2-dev5" # pragma: no cover
__version__ = "0.8.2-dev6" # pragma: no cover
18 changes: 13 additions & 5 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import Element, PageBreak
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
from unstructured.partition.common import (
_add_element_metadata,
_remove_element_metadata,
Expand Down Expand Up @@ -300,9 +300,6 @@ def detect_filetype(
encoding = "utf-8"
formatted_encoding = format_encoding_str(encoding)

if extension in PLAIN_TEXT_EXTENSIONS:
return EXT_TO_FILETYPE.get(extension)

# NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain"
# rather than "application/json". this corrects for that case.
Expand All @@ -315,6 +312,9 @@ def detect_filetype(
if file and _check_eml_from_buffer(file=file) is True:
return FileType.EML

if extension in PLAIN_TEXT_EXTENSIONS:
return EXT_TO_FILETYPE.get(extension)

# Safety catch
if mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type]
Expand Down Expand Up @@ -417,7 +417,15 @@ def _is_text_file_a_json(
):
"""Detects if a file that has a text/plain MIME type is a JSON file."""
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)

if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
return False

if not re.match(JSON_PATTERN, file_text):
return False

return True


def _count_commas(text: str):
Expand Down
15 changes: 14 additions & 1 deletion unstructured/nlp/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,17 @@
# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
# format for document elements
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
JSON_PATTERN = r"^(?:\{.*\}|\[.*\])$"

# Loose check for text that looks like a JSON object or array, including text that
# has been cut off before the end of the document. Pattern breakdown:
# (?s)          DOTALL flag: "." also matches newline characters
# \{(?=.*:)     an opening brace that is followed, somewhere later in the text, by at
#               least one colon
# .*?           any characters (non-greedy)
# (?:\}|$)      non-capturing group that matches either the closing brace } or the end of
#               the string, to handle cases where the JSON object is cut off
# |             or
# \[(?s:.*?)\]  an opening bracket [, any characters (non-greedy), and a closing bracket ]
# (?:$|,|\])    non-capturing group that requires the closing bracket to be followed by
#               the end of the string, a comma, or another closing bracket
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"

# Character class for one character that can appear in JSON outside of string literals:
# structural punctuation, number characters, the letters of true/false/null, and
# whitespace.
# NOTE(review): as written this matches a single character, so `re.match` with this
# pattern only validates the first character of its input -- confirm that is intended.
# taken from https://stackoverflow.com/a/3845829/12406158
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"

0 comments on commit f282a10

Please sign in to comment.