Merge branch 'main' into fix/1209-tweak-xycut-ordering-output

Unstructured-IO · Oct 4, 2023 · a90b441 · a90b441
2 parents ae25d42 + 19d8bff
commit a90b441
Show file tree

Hide file tree

Showing 18 changed files with 163 additions and 16,909 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.19-dev9
+## 0.10.19-dev10
 
 ### Enhancements
 
@@ -8,8 +8,7 @@
 * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
 * **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=<n>` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length <n> characters. This means partitioned Table results are ready for use in downstream applications without any post processing.
 * **Expose endpoint url for s3 connectors** Allowing the endpoint url to be explicitly overridden makes it possible to use any non-AWS data provider that supports the s3 protocol (e.g. minio).
-
-### Features 
+* **change default `hi_res` model for pdf/image partition to `yolox`** Now partitioning pdf/image using the `hi_res` strategy utilizes the `yolox_quantized` model instead of the `detectron2_onnx` model. This new default model has better recall for tables and produces more detailed categories for elements.
 
 ### Features
 

diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -108,7 +108,7 @@ When elements are extracted from PDFs or images, it may be useful to get their b
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
       
       file_data['files'].close()
 
@@ -155,7 +155,7 @@ You can specify the encoding to use to decode the text input. If no value is pro
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -204,7 +204,7 @@ You can also specify what languages to use for OCR with the ``ocr_languages`` kw
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -250,7 +250,7 @@ By default the result will be in ``json``, but it can be set to ``text/csv`` to
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -296,7 +296,7 @@ Pass the `include_page_breaks` parameter to `true` to include `PageBreak` elemen
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -345,7 +345,7 @@ On the other hand, ``hi_res`` is the better choice for PDFs that may have text w
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -398,7 +398,7 @@ To use the ``hi_res`` strategy with **Chipper** model, pass the argument for ``h
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -451,7 +451,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -499,7 +499,7 @@ We also provide support for enabling and disabling table extraction for file typ
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 
@@ -545,7 +545,7 @@ When processing XML documents, set the ``xml_keep_tags`` parameter to ``true`` t
       file_path = "/Path/To/File"
       file_data = {'files': open(file_path, 'rb')}
 
-      response = requests.post(url, headers=headers, files=files, data=data)
+      response = requests.post(url, headers=headers, files=file_data, data=data)
 
       file_data['files'].close()
 

diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
@@ -442,7 +442,7 @@ def test_partition_image_formats_languages_for_tesseract():
             ocr_languages="jpn_vert",
             ocr_mode="entire_page",
             extract_tables=False,
-            model_name="detectron2_onnx",
+            model_name=pdf.default_hi_res_model(),
         )
 
 

diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -406,7 +406,7 @@ def test_partition_pdf_with_dpi():
             ocr_languages="eng",
             ocr_mode="entire_page",
             extract_tables=False,
-            model_name="detectron2_onnx",
+            model_name=pdf.default_hi_res_model(),
             pdf_image_dpi=100,
         )
 
@@ -857,7 +857,7 @@ def test_partition_pdf_formats_languages_for_tesseract():
             ocr_languages="eng",
             ocr_mode="entire_page",
             extract_tables=False,
-            model_name="detectron2_onnx",
+            model_name=pdf.default_hi_res_model(),
         )
 
 

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -25,6 +25,7 @@
 from unstructured.partition import auto
 from unstructured.partition.auto import _get_partition_with_extras, partition
 from unstructured.partition.common import convert_office_doc
+from unstructured.partition.pdf import default_hi_res_model
 from unstructured.staging.base import elements_to_json
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -386,7 +387,7 @@ def test_auto_partition_formats_languages_for_tesseract():
             ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert",
             ocr_mode="entire_page",
             extract_tables=False,
-            model_name="detectron2_onnx",
+            model_name=default_hi_res_model(),
         )
 
 

diff --git a/...ected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/...ected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
@@ -57,7 +57,7 @@
     "text": "Lisa Federer, MLIS, Data Science Training Coordinator"
   },
   {
-    "type": "Title",
+    "type": "NarrativeText",
     "element_id": "d9644fb4b85468d186b132c91ca64f31",
     "metadata": {
       "data_source": {
@@ -77,7 +77,7 @@
   },
   {
     "type": "Title",
-    "element_id": "53d548aa01fc3eb72da15a5be7f235e2",
+    "element_id": "c8e51fdc53c202393adad77f7f93ee5a",
     "metadata": {
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -115,7 +115,7 @@
   },
   {
     "type": "ListItem",
-    "element_id": "d94c6241299e6eff20ee6499cb9f64de",
+    "element_id": "04ff84b51fab69c07381ac794b740243",
     "metadata": {
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -130,11 +130,87 @@
       "filetype": "application/pdf",
       "page_number": 1
     },
-    "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science; 2. Programming language expertise: biomedical data scientists should be fluent in at least one programming language (typically R and/or Python); 3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science; 4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science. 5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy."
+    "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;"
   },
   {
-    "type": "UncategorizedText",
-    "element_id": "34b28172088bba51c6764df6d4e87674",
+    "type": "ListItem",
+    "element_id": "0b2857001b1a9eba5e46e26cba08e2ac",
+    "metadata": {
+      "data_source": {
+        "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+        "version": 167189396509615428390709838081557906335,
+        "record_locator": {
+          "protocol": "abfs",
+          "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf"
+        },
+        "date_created": "2023-03-10T09:32:44+00:00",
+        "date_modified": "2023-03-10T09:32:44+00:00"
+      },
+      "filetype": "application/pdf",
+      "page_number": 1
+    },
+    "text": "2. Programming language expertise: biomedical data scientists should be fluent in at least one programming language (typically R and/or Python);"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "8b02f539eb8ccee5b3fc24f66858188c",
+    "metadata": {
+      "data_source": {
+        "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+        "version": 167189396509615428390709838081557906335,
+        "record_locator": {
+          "protocol": "abfs",
+          "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf"
+        },
+        "date_created": "2023-03-10T09:32:44+00:00",
+        "date_modified": "2023-03-10T09:32:44+00:00"
+      },
+      "filetype": "application/pdf",
+      "page_number": 1
+    },
+    "text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;"
+  },
+  {
+    "type": "ListItem",
+    "element_id": "469e981f34d1e6f2b420574ed8e932d2",
+    "metadata": {
+      "data_source": {
+        "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+        "version": 167189396509615428390709838081557906335,
+        "record_locator": {
+          "protocol": "abfs",
+          "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf"
+        },
+        "date_created": "2023-03-10T09:32:44+00:00",
+        "date_modified": "2023-03-10T09:32:44+00:00"
+      },
+      "filetype": "application/pdf",
+      "page_number": 1
+    },
+    "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science."
+  },
+  {
+    "type": "ListItem",
+    "element_id": "4b8fc76cbba0e2fef79ff8bc668b1401",
+    "metadata": {
+      "data_source": {
+        "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+        "version": 167189396509615428390709838081557906335,
+        "record_locator": {
+          "protocol": "abfs",
+          "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf"
+        },
+        "date_created": "2023-03-10T09:32:44+00:00",
+        "date_modified": "2023-03-10T09:32:44+00:00"
+      },
+      "filetype": "application/pdf",
+      "page_number": 1
+    },
+    "text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy."
+  },
+  {
+    "type": "NarrativeText",
+    "element_id": "69da7754428f154ee3b2906214d31ad9",
     "metadata": {
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -153,7 +229,7 @@
   },
   {
     "type": "Title",
-    "element_id": "89b1f4c3df983454e25b233320781610",
+    "element_id": "37486ef32cbf05082d5dbff0581db762",
     "metadata": {
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -209,7 +285,7 @@
     "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________"
   },
   {
-    "type": "Title",
+    "type": "NarrativeText",
     "element_id": "edd5f2f5a60a83c8899e533ac8bcd03c",
     "metadata": {
       "data_source": {
@@ -247,7 +323,7 @@
     "text": "Methodology"
   },
   {
-    "type": "Title",
+    "type": "NarrativeText",
     "element_id": "987542acede56f098db655f02fb814a7",
     "metadata": {
       "data_source": {
@@ -267,7 +343,7 @@
   },
   {
     "type": "ListItem",
-    "element_id": "fdd38e2d80cc964e9bf3c7e09a760e21",
+    "element_id": "2e3cec7bff1e8c8d8e0087f0bcfa89f0",
     "metadata": {
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -282,10 +358,29 @@
       "filetype": "application/pdf",
       "page_number": 2
     },
-    "text": "a) Responses to a 2017 Kaggle' survey’ of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use. b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A. c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (83.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The"
+    "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use."
   },
   {
-    "type": "NarrativeText",
+    "type": "ListItem",
+    "element_id": "c6865d507571ccb14d37791134f27f61",
+    "metadata": {
+      "data_source": {
+        "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+        "version": 167189396509615428390709838081557906335,
+        "record_locator": {
+          "protocol": "abfs",
+          "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf"
+        },
+        "date_created": "2023-03-10T09:32:44+00:00",
+        "date_modified": "2023-03-10T09:32:44+00:00"
+      },
+      "filetype": "application/pdf",
+      "page_number": 2
+    },
+    "text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A."
+  },
+  {
+    "type": "ListItem",
     "element_id": "3f14cc0782485365bad0539f7b1bbb22",
     "metadata": {
       "data_source": {
@@ -324,7 +419,7 @@
   },
   {
     "type": "NarrativeText",
-    "element_id": "8e6dc8d9bc74e032451cc1a6a0da4d10",
+    "element_id": "f39ddfa6365e505947527153b0ea60d8",
     "metadata": {
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -341,6 +436,25 @@
     },
     "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017"
   },
+  {
+    "type": "Footer",
+    "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+    "metadata": {
+      "data_source": {
+        "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
+        "version": 167189396509615428390709838081557906335,
+        "record_locator": {
+          "protocol": "abfs",
+          "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf"
+        },
+        "date_created": "2023-03-10T09:32:44+00:00",
+        "date_modified": "2023-03-10T09:32:44+00:00"
+      },
+      "filetype": "application/pdf",
+      "page_number": 2
+    },
+    "text": "2"
+  },
   {
     "type": "UncategorizedText",
     "element_id": "d4735e3a265e16eee03f59718b9b5d03",