Unstructured-IO · benjats07 · Oct 5, 2023 · Sep 21, 2023 · Sep 22, 2023 · Sep 26, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -147,7 +147,7 @@ jobs:
         tesseract --version
         # FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again
         make install-ci
-        make test CI=true
+        make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
         make check-coverage
 
   test_unit_no_extras:
@@ -419,4 +419,4 @@ jobs:
           source .venv/bin/activate
           echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
           make docker-build
-          make docker-test CI=true
+          make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -51,6 +51,7 @@
 
 * **Adds `links` metadata in `partition_pdf` for `fast` strategy.** Problem: PDF files contain rich information and hyperlinks that Unstructured did not capture earlier. Feature: `partition_pdf` can now capture embedded links within the file along with their associated text and page number. Importance: Providing depth in extracted elements gives users a better understanding and richer context of documents. This also enables users to map to other elements within the document if the hyperlink is referred to internally.
 * **Adds the embedding module to be able to embed Elements** Problem: Many NLP applications require the ability to represent parts of documents in a semantic way. Until now, Unstructured did not have text embedding ability within the core library. Feature: This embedding module is able to track embeddings related data with a class, embed a list of elements, and return an updated list of Elements with the *embeddings* property. The module is also able to embed query strings. Importance: Ability to embed documents or parts of documents will enable users to make use of these semantic representations in different NLP applications, such as search, retrieval, and retrieval augmented generation.
+* **Adds `detection_origin` field to metadata** Problem: Currently there isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information, developers and users are now able to know how an element was created and can make decisions on how to use it.
 
 ### Fixes
 
@@ -67,6 +68,10 @@ allowing the document to be loaded. Fix: Change parent class for Formula to Text
 * **Fixes occasional `SIGABRT` when writing table with `deltalake` on Linux** Problem: occasionally on Linux ingest can throw a `SIGABRT` when writing a `deltalake` table even though the table was written correctly. Fix: put the writing function into a `Process` to ensure it runs to completion before returning to the main process. Importance: Improves stability of connectors using `deltalake`
 
 
+* **Fix badly initialized Formula** Problem: YoloX contains new types of elements; when loading a document that contains formulas, a new element of that class
+should be generated. However, the Formula class inherits from Element instead of Text. After this change the element is correctly created with the correct class,
+allowing the document to be loaded. Fix: Change parent class for Formula to Text. Importance: Crucial to be able to load documents that contain formulas.
+
 ## 0.10.16
 
 ### Enhancements

diff --git a/Makefile b/Makefile
@@ -241,11 +241,13 @@ uninstall-project-local:
 #################
 
 export CI ?= false
+export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
 
 ## test:                    runs all unittests
 .PHONY: test
 test:
-	PYTHONPATH=. CI=$(CI) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
+	PYTHONPATH=. CI=$(CI) \
+	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
 
 .PHONY: test-unstructured-api-unit
 test-unstructured-api-unit:
@@ -254,7 +256,8 @@ test-unstructured-api-unit:
 .PHONY: test-no-extras
 # TODO(newelh) Add json test when fixed
 test-no-extras:
-	PYTHONPATH=. CI=$(CI) pytest \
+	PYTHONPATH=. CI=$(CI) \
+		UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) pytest \
 		test_${PACKAGE_NAME}/partition/test_text.py \
 		test_${PACKAGE_NAME}/partition/test_email.py \
 		test_${PACKAGE_NAME}/partition/test_html_partition.py \
@@ -394,7 +397,9 @@ docker-test:
 	-v ${CURRENT_DIR}/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
 	$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
 	$(DOCKER_IMAGE) \
-	bash -c "CI=$(CI) pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured"
+	bash -c "CI=$(CI) \
+	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
+	pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured"
 
 .PHONY: docker-smoke-test
 docker-smoke-test:

diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/csv/test_csv.py
@@ -13,6 +13,7 @@
 from unstructured.documents.elements import Table
 from unstructured.partition.csv import partition_csv
 from unstructured.partition.json import partition_json
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 EXPECTED_FILETYPE = "text/csv"
@@ -55,12 +56,13 @@ def test_partition_csv_from_file(filename, expected_text, expected_table):
     f_path = f"example-docs/{filename}"
     with open(f_path, "rb") as f:
         elements = partition_csv(file=f)
-
     assert clean_extra_whitespace(elements[0].text) == expected_text
     assert isinstance(elements[0], Table)
     assert elements[0].metadata.text_as_html == expected_table
     assert elements[0].metadata.filetype == EXPECTED_FILETYPE
     assert elements[0].metadata.filename is None
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"csv"}
 
 
 def test_partition_csv_from_file_with_metadata_filename(filename="example-docs/stanley-cups.csv"):

diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -25,6 +25,7 @@
 from unstructured.partition.doc import partition_doc
 from unstructured.partition.docx import _DocxPartitioner, partition_docx
 from unstructured.partition.json import partition_json
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 
@@ -107,6 +108,8 @@ def test_partition_docx_from_filename(
     assert elements[0].metadata.page_number is None
     for element in elements:
         assert element.metadata.filename == "mock_document.docx"
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"docx"}
 
 
 def test_partition_docx_from_filename_with_metadata_filename(mock_document, tmpdir):

diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py
@@ -5,6 +5,7 @@
 from unstructured.documents.elements import Table, Text
 from unstructured.partition.epub import partition_epub
 from unstructured.partition.json import partition_json
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -33,6 +34,8 @@ def test_partition_epub_from_filename():
         assert element.metadata.section is not None
         all_sections.add(element.metadata.section)
     assert all_sections == expected_sections
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"epub"}
 
 
 def test_partition_epub_from_filename_returns_table_in_elements():

diff --git a/test_unstructured/partition/markdown/test_md.py b/test_unstructured/partition/markdown/test_md.py
@@ -9,6 +9,7 @@
 from unstructured.documents.elements import Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -21,6 +22,8 @@ def test_partition_md_from_filename():
     assert len(elements) > 0
     for element in elements:
         assert element.metadata.filename == "README.md"
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"md"}
 
 
 def test_partition_md_from_filename_returns_uns_elements():

diff --git a/test_unstructured/partition/msg/test_msg.py b/test_unstructured/partition/msg/test_msg.py
@@ -14,6 +14,7 @@
 from unstructured.partition.json import partition_json
 from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
 from unstructured.partition.text import partition_text
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -59,6 +60,8 @@ def test_partition_msg_from_filename():
     )
     for element in elements:
         assert element.metadata.filename == "fake-email.msg"
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"msg"}
 
 
 def test_partition_msg_from_filename_returns_uns_elements():

diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/odt/test_odt.py
@@ -5,6 +5,7 @@
 from unstructured.documents.elements import Table, TableChunk, Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.odt import partition_odt
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -26,6 +27,10 @@ def test_partition_odt_from_filename():
     ]
     for element in elements:
         assert element.metadata.filename == "fake.odt"
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {
+            "docx",
+        }  # this file is processed by docx backend
 
 
 def test_partition_odt_from_filename_with_metadata_filename():

diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
@@ -10,6 +10,7 @@
 from unstructured.chunking.title import chunk_by_title
 from unstructured.partition import image, pdf
 from unstructured.partition.json import partition_json
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -245,6 +246,8 @@ def test_partition_image_default_strategy_hi_res():
     assert elements[0].metadata.coordinates is not None
     assert elements[0].metadata.detection_class_prob is not None
     assert isinstance(elements[0].metadata.detection_class_prob, float)
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"image"}
 
 
 def test_partition_image_metadata_date(

diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -18,6 +18,7 @@
 )
 from unstructured.partition import pdf, strategies
 from unstructured.partition.json import partition_json
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 
@@ -114,15 +115,16 @@ def test_partition_pdf_local_raises_with_no_filename():
 
 @pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
 @pytest.mark.parametrize(
-    ("strategy", "expected"),
+    ("strategy", "expected", "origin"),
     # fast: can't capture the "intentionally left blank page" page
     # others: will ignore the actual blank page
-    [("fast", {1, 4}), ("hi_res", {1, 3, 4}), ("ocr_only", {1, 3, 4})],
+    [("fast", {1, 4}, "pdfminer"), ("hi_res", {1, 3, 4}, "pdf"), ("ocr_only", {1, 3, 4}, "OCR")],
 )
 def test_partition_pdf(
     file_mode,
     strategy,
     expected,
+    origin,
     filename="example-docs/layout-parser-paper-with-empty-pages.pdf",
 ):
     # Test that the partition_pdf function can handle filename
@@ -131,6 +133,8 @@ def _test(result):
         assert len(result) > 10
         # check that the pdf has multiple different page numbers
         assert {element.metadata.page_number for element in result} == expected
+        if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+            assert {element.metadata.detection_origin for element in result} == {origin}
 
     if file_mode == "filename":
         result = pdf.partition_pdf(filename=filename, strategy=strategy)

diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/pptx/test_ppt.py
@@ -7,6 +7,7 @@
 from unstructured.documents.elements import ListItem, NarrativeText, Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.ppt import partition_ppt
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -28,6 +29,8 @@ def test_partition_ppt_from_filename():
     assert elements == EXPECTED_PPT_OUTPUT
     for element in elements:
         assert element.metadata.filename == "fake-power-point.ppt"
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"pptx"}
 
 
 def test_partition_ppt_from_filename_with_metadata_filename():

diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -13,6 +13,7 @@
     partition_text,
     split_content_to_fit_max,
 )
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -67,6 +68,8 @@ def test_partition_text_from_filename(filename, encoding):
     assert elements == EXPECTED_OUTPUT
     for element in elements:
         assert element.metadata.filename == filename
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"text"}
 
 
 def test_partition_text_from_filename_with_metadata_filename():

diff --git a/test_unstructured/partition/test_xml_partition.py b/test_unstructured/partition/test_xml_partition.py
@@ -6,6 +6,7 @@
 from unstructured.chunking.title import chunk_by_title
 from unstructured.documents.elements import NarrativeText, Title
 from unstructured.partition.json import partition_json
+from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
 from unstructured.partition.xml import partition_xml
 from unstructured.staging.base import elements_to_json
 
@@ -22,6 +23,8 @@ def test_partition_xml_from_filename(filename):
 
     assert elements[0].text == "United States"
     assert elements[0].metadata.filename == filename
+    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+        assert {element.metadata.detection_origin for element in elements} == {"xml"}
 
 
 def test_partition_xml_from_filename_with_metadata_filename():

diff --git a/...ctured_ingest/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json b/...ctured_ingest/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json