build(deps): bump unstructured-inference==0.5.13 (#1141)

Bump to unstructured-inference==0.5.13, which includes a fix for extracted image elements being included in the layout merge. This addresses the issue where an entire-page image in a PDF was not passed to the layout model when using hi_res.
Unstructured-IO · Aug 17, 2023 · dd0f582
1 parent 9f7bd61
commit dd0f582
Show file tree

Hide file tree

Showing 12 changed files with 865 additions and 534 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## 0.10.2
+
+### Enhancements
+* Bump unstructured-inference==0.5.13:
+  - Fix extracted image elements being included in layout merge, addresses the issue
+    where an entire-page image in a PDF was not passed to the layout model when using hi_res.
+
+### Features
+
+### Fixes
+
 ## 0.10.1
 
 ### Enhancements

diff --git a/requirements/constraints.in b/requirements/constraints.in
@@ -26,4 +26,4 @@ Pillow<10.0.0
 # AttributeError: 'ResourcePath' object has no attribute 'collection'
 Office365-REST-Python-Client<2.4.3
 # NOTE(christine) Pinned to set the `unstructured-inference` version
-unstructured-inference==0.5.12
+unstructured-inference==0.5.13
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -205,7 +205,7 @@ typing-extensions==4.7.1
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.5.12
+unstructured-inference==0.5.13
     # via
     #   -c requirements/constraints.in
     #   -r requirements/extra-pdf-image.in

diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile requirements/ingest-confluence.in
 #
-atlassian-python-api==3.40.1
+atlassian-python-api==3.41.0
     # via -r requirements/ingest-confluence.in
 certifi==2023.7.22
     # via

diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json
diff --git a/..._unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/..._unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
@@ -10,34 +10,34 @@
     "text": "Data in Brief 22 (2019) 451–457"
   },
   {
-    "type": "Image",
-    "element_id": "70d50409ea726a2789ebbd004bec31f4",
+    "type": "UncategorizedText",
+    "element_id": "869adddb184177031536477262e0dde0",
     "metadata": {
       "data_source": {},
       "filetype": "application/pdf",
       "page_number": 1
     },
-    "text": "Contents lists available at ScienceDirect Data in Brief journal homepage: www.elsevier.com/locate/dib"
+    "text": "Contents lists available at ScienceDirect"
   },
   {
     "type": "UncategorizedText",
-    "element_id": "869adddb184177031536477262e0dde0",
+    "element_id": "e6fa42b5b4d85001b900e47c050b645b",
     "metadata": {
       "data_source": {},
       "filetype": "application/pdf",
       "page_number": 1
     },
-    "text": "Contents lists available at ScienceDirect"
+    "text": "Data in Brief"
   },
   {
-    "type": "UncategorizedText",
-    "element_id": "e6fa42b5b4d85001b900e47c050b645b",
+    "type": "NarrativeText",
+    "element_id": "9234133787d0a6b3976b16569c0b5cf3",
     "metadata": {
       "data_source": {},
       "filetype": "application/pdf",
       "page_number": 1
     },
-    "text": "Data in Brief"
+    "text": "journal homepage: www.elsevier.com/locate/dib"
   },
   {
     "type": "UncategorizedText",

diff --git a/..._unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/..._unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
@@ -10,34 +10,34 @@
     "text": "Data in Brief 22 (2019) 484–487"
   },
   {
-    "type": "Image",
-    "element_id": "70d50409ea726a2789ebbd004bec31f4",
+    "type": "UncategorizedText",
+    "element_id": "869adddb184177031536477262e0dde0",
     "metadata": {
       "data_source": {},
       "filetype": "application/pdf",
       "page_number": 1
     },
-    "text": "Contents lists available at ScienceDirect Data in Brief journal homepage: www.elsevier.com/locate/dib"
+    "text": "Contents lists available at ScienceDirect"
   },
   {
     "type": "UncategorizedText",
-    "element_id": "869adddb184177031536477262e0dde0",
+    "element_id": "e6fa42b5b4d85001b900e47c050b645b",
     "metadata": {
       "data_source": {},
       "filetype": "application/pdf",
       "page_number": 1
     },
-    "text": "Contents lists available at ScienceDirect"
+    "text": "Data in Brief"
   },
   {
-    "type": "UncategorizedText",
-    "element_id": "e6fa42b5b4d85001b900e47c050b645b",
+    "type": "NarrativeText",
+    "element_id": "9234133787d0a6b3976b16569c0b5cf3",
     "metadata": {
       "data_source": {},
       "filetype": "application/pdf",
       "page_number": 1
     },
-    "text": "Data in Brief"
+    "text": "journal homepage: www.elsevier.com/locate/dib"
   },
   {
     "type": "UncategorizedText",

diff --git a/...ured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/...ured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -852,7 +852,7 @@
   },
   {
     "type": "FigureCaption",
-    "element_id": "185e67615d123b35d38ea72e0cdb6d99",
+    "element_id": "d21661161ae2c8dc39e96ee5c660704b",
     "metadata": {
       "data_source": {},
       "filetype": "application/pdf",
@@ -960,16 +960,6 @@
     },
     "text": "LayoutParser provides a uniﬁed interface for existing OCR tools. Though there are many OCR tools available, they are usually conﬁgured diﬀerently with distinct APIs or protocols for using them. It can be ineﬃcient to add new OCR tools into an existing pipeline, and diﬃcult to make direct comparisons among the available tools to ﬁnd the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it eﬀortless to switch, evaluate, and compare diﬀerent OCR modules:"
   },
-  {
-    "type": "Image",
-    "element_id": "65ac0f9ae348b12ed9484b8af7296617",
-    "metadata": {
-      "data_source": {},
-      "filetype": "application/pdf",
-      "page_number": 7
-    },
-    "text": "ocr_agent = lp.TesseractAgent ()pOi"
-  },
   {
     "type": "ListItem",
     "element_id": "bebbb4e94f1f97edeb5b96e252720a93",
@@ -1351,6 +1341,26 @@
     },
     "text": "x09 Burpunog uayor Aeydsiq 1 vondo 10g Guypunog usyoy apir:z uondo Mode I: Showing Layout on the Original Image Mode Il: Drawing OCR'd Text at the Correspoding Position"
   },
+  {
+    "type": "NarrativeText",
+    "element_id": "aed1b21a388cefaa841f20f48d19ca98",
+    "metadata": {
+      "data_source": {},
+      "filetype": "application/pdf",
+      "page_number": 9
+    },
+    "text": "Mode I: Showing Layout on the Original Image"
+  },
+  {
+    "type": "NarrativeText",
+    "element_id": "915bc5f1403e01b56e77300d9354fded",
+    "metadata": {
+      "data_source": {},
+      "filetype": "application/pdf",
+      "page_number": 9
+    },
+    "text": "Mode Il: Drawing OCR'd Text at the Correspoding Position"
+  },
   {
     "type": "NarrativeText",
     "element_id": "cc8ad6e0f933633a37b82200e6724f9e",