Skip to content

Commit

Permalink
refactor: simplify JSON detection and add tests (#975)
Browse files Browse the repository at this point in the history
* refactor json detection

* version and changelog

* fix mock in test
  • Loading branch information
MthwRobinson authored Jul 25, 2023
1 parent f282a10 commit d694cd5
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 19 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.8.2-dev6
## 0.8.2-dev7

### Enhancements

* Additional tests and refactor of JSON detection.
* Update functionality to retrieve image metadata from a page for `document_to_element_list`
* Links are now tracked in `partition_html` output.
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add min_partition kwarg to that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx
* Add `--encoding` directive to ingest
Expand Down
4 changes: 2 additions & 2 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def pages(self):
("README.rst", FileType.RST),
("README.md", FileType.MD),
("fake.odt", FileType.ODT),
("fake-incomplete-json.txt", FileType.JSON),
("fake-incomplete-json.txt", FileType.TXT),
],
)
def test_detect_filetype_from_filename(file, expected):
Expand Down Expand Up @@ -141,7 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("fake-incomplete-json.txt", FileType.JSON),
("fake-incomplete-json.txt", FileType.TXT),
],
)
def test_detect_filetype_from_file(file, expected):
Expand Down
15 changes: 14 additions & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,19 @@ def test_auto_partition_json_from_filename():
assert json_data == json_elems


def test_auto_partition_json_raises_with_unprocessable_json(tmpdir):
    """A valid JSON file that is not a list of dicts (the Unstructured ISD
    format) cannot be partitioned and should raise a ValueError."""
    # NOTE(robinson) - '{"hi": "there"}' is well-formed JSON, but not ISD
    unprocessable_path = os.path.join(tmpdir, "unprocessable.json")
    with open(unprocessable_path, "w") as json_file:
        json_file.write('{"hi": "there"}')

    with pytest.raises(ValueError):
        partition(filename=unprocessable_path)


@pytest.mark.xfail(
reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
)
Expand Down Expand Up @@ -525,7 +538,7 @@ def test_auto_partition_odt_from_file():
@pytest.mark.parametrize(
("content_type", "routing_func", "expected"),
[
("application/json", "json", "application/json"),
("text/csv", "csv", "text/csv"),
("text/html", "html", "text/html"),
("jdsfjdfsjkds", "pdf", None),
],
Expand Down
14 changes: 14 additions & 0 deletions test_unstructured/partition/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,17 @@ def test_partition_json_from_text_exclude_metadata(filename: str):

for i in range(len(test_elements)):
assert any(test_elements[i].metadata.to_dict()) is False


def test_partition_json_raises_with_unprocessable_json():
    """Valid JSON that is not a list of dicts (the Unstructured ISD format)
    should be rejected with a ValueError."""
    # NOTE(robinson) - well-formed JSON, but a bare object rather than ISD
    non_isd_text = '{"hi": "there"}'
    with pytest.raises(ValueError):
        partition_json(text=non_isd_text)


def test_partition_json_raises_with_invalid_json():
    """Malformed JSON (trailing extra bracket) should raise a ValueError."""
    malformed_text = '[{"hi": "there"}]]'
    with pytest.raises(ValueError):
        partition_json(text=malformed_text)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.2-dev6" # pragma: no cover
__version__ = "0.8.2-dev7" # pragma: no cover
23 changes: 16 additions & 7 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import inspect
import json
import os
import re
import zipfile
Expand All @@ -11,7 +12,7 @@
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import Element, PageBreak
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.partition.common import (
_add_element_metadata,
_remove_element_metadata,
Expand Down Expand Up @@ -417,15 +418,23 @@ def _is_text_file_a_json(
):
"""Detects if a file that has a text/plain MIME type is a JSON file."""
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)

if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
try:
json.loads(file_text)
return True
except json.JSONDecodeError:
return False

if not re.match(JSON_PATTERN, file_text):
return False

return True
def is_json_processable(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    file_text: Optional[str] = None,
    encoding: Optional[str] = "utf-8",
) -> bool:
    """Return True when the JSON content appears to be a list of dicts.

    Exactly one of ``filename``, ``file``, or ``file_text`` must be provided.
    When the text is not supplied directly, only the start of the file is
    read for the check (via ``_read_file_start_for_type_check``), and the
    match is a regex test against ``LIST_OF_DICTS_PATTERN`` — not a full
    JSON parse.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)
    text_to_check = file_text
    if text_to_check is None:
        text_to_check = _read_file_start_for_type_check(
            file=file,
            filename=filename,
            encoding=encoding,
        )
    return bool(re.match(LIST_OF_DICTS_PATTERN, text_to_check))


def _count_commas(text: str):
Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
STR_TO_FILETYPE,
FileType,
detect_filetype,
is_json_processable,
)
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
Expand Down Expand Up @@ -227,6 +228,11 @@ def partition(
**kwargs,
)
elif filetype == FileType.JSON:
if not is_json_processable(filename=filename, file=file):
raise ValueError(
"Detected a JSON file that does not conform to the Unstructured schema. "
"partition_json currently only processes serialized Unstructured output.",
)
elements = partition_json(filename=filename, file=file, **kwargs)
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
elements = partition_xlsx(filename=filename, file=file, **kwargs)
Expand Down
15 changes: 9 additions & 6 deletions unstructured/partition/json.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import json
import re
from typing import IO, List, Optional

from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.file_utils.filetype import (
FileType,
add_metadata_with_filetype,
is_json_processable,
)
from unstructured.partition.common import exactly_one
from unstructured.staging.base import dict_to_elements

Expand Down Expand Up @@ -48,9 +50,10 @@ def partition_json(
elif text is not None:
file_text = str(text)

# NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
if not re.match(LIST_OF_DICTS_PATTERN, file_text):
raise ValueError("Json schema does not match the Unstructured schema")
if not is_json_processable(file_text=file_text):
raise ValueError(
"JSON cannot be partitioned. Schema does not match the Unstructured schema.",
)

try:
dict = json.loads(file_text)
Expand Down

0 comments on commit d694cd5

Please sign in to comment.