diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe3c444ef8..c5b030b315 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
### Enhancements
+* **Better detection of natural reading order in images and PDFs** The elements returned by partition better reflect natural reading order in some cases, particularly in complicated multi-column layouts, leading to better chunking and retrieval for downstream applications. Achieved by improving the `xy-cut` sorting to preprocess bboxes, shrinking each bounding box to 90% of its original size along the x and y axes (still centered on the same center point), which allows projection lines to be drawn where overlapping layout bboxes previously made that impossible.
* **Improves `partition_xml` to be faster and more memory efficient when partitioning large XML files** The new behavior is to partition iteratively to prevent loading the entire XML tree into memory at once in most use cases.
* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **Add functionality to save embedded images in PDF's separately as images** This allows users to save embedded images in PDF's separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits."
diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
index d9540344ef..d5dfcb8189 100644
--- a/test_unstructured/partition/pdf-image/test_pdf.py
+++ b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -479,7 +479,7 @@ def test_partition_pdf_fast_groups_text_in_text_box():
system=expected_coordinate_system_3,
),
)
- assert elements[3] == Text("2.5", metadata=expected_elem_metadata_3)
+ assert elements[2] == Text("2.5", metadata=expected_elem_metadata_3)
def test_partition_pdf_with_metadata_filename(
diff --git a/test_unstructured/partition/utils/test_sorting.py b/test_unstructured/partition/utils/test_sorting.py
index 7bcf7a25d0..2000b4e3a3 100644
--- a/test_unstructured/partition/utils/test_sorting.py
+++ b/test_unstructured/partition/utils/test_sorting.py
@@ -5,10 +5,19 @@
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.sorting import (
coord_has_valid_points,
+ coordinates_to_bbox,
+ shrink_bbox,
sort_page_elements,
)
+class MockCoordinatesMetadata(CoordinatesMetadata):
+ def __init__(self, points):
+ system = PixelSpace(width=300, height=500)
+
+ super().__init__(points, system)
+
+
def test_coord_valid_coordinates():
coordinates = CoordinatesMetadata([(1, 2), (3, 4), (5, 6), (7, 8)], PixelSpace)
assert coord_has_valid_points(coordinates) is True
@@ -98,3 +107,21 @@ def test_sort_basic_pos_coordinates():
sorted_elem_text = " ".join([str(elem.text) for elem in sorted_page_elements])
assert sorted_elem_text == "7 8 9"
+
+
+def test_coordinates_to_bbox():
+ coordinates_data = MockCoordinatesMetadata([(10, 20), (10, 200), (100, 200), (100, 20)])
+ expected_result = (10, 20, 100, 200)
+ assert coordinates_to_bbox(coordinates_data) == expected_result
+
+
+def test_shrink_bbox():
+ bbox = (0, 0, 100, 100)
+ shrink_factor = 0.5
+ expected_result = (25, 25, 75, 75)
+ assert shrink_bbox(bbox, shrink_factor) == expected_result
+
+ bbox = (0, 0, 200, 100)
+ shrink_factor = 0.9
+ expected_result = (10, 5, 190, 95)
+ assert shrink_bbox(bbox, shrink_factor) == expected_result
diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
index f3eba38c14..7cbf4decf9 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
@@ -266,8 +266,8 @@
"text": "Executive Summary"
},
{
- "type": "NarrativeText",
- "element_id": "2364a6d2f9a3858d51d91b817732e6c9",
+ "type": "Title",
+ "element_id": "6712d87f1d156abf6171f700e2875889",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -282,11 +282,11 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "This report provides recommendations for a scientists based on analysis that draws on opinions of data scientists, curricula for existing science requirements science jobs."
+ "text": "biomedical"
},
{
- "type": "Title",
- "element_id": "6712d87f1d156abf6171f700e2875889",
+ "type": "NarrativeText",
+ "element_id": "2364a6d2f9a3858d51d91b817732e6c9",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -301,7 +301,7 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "biomedical"
+ "text": "This report provides recommendations for a scientists based on analysis that draws on opinions of data scientists, curricula for existing science requirements science jobs."
},
{
"type": "Title",
@@ -836,8 +836,8 @@
"text": "The"
},
{
- "type": "NarrativeText",
- "element_id": "cdc3773cb12cf99d302b9f00c48ae1e8",
+ "type": "Title",
+ "element_id": "aa3b88196a6407c3866c85acdcc8c981",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -852,11 +852,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "required of"
+ "text": "Workforce"
},
{
- "type": "Title",
- "element_id": "aa3b88196a6407c3866c85acdcc8c981",
+ "type": "NarrativeText",
+ "element_id": "cdc3773cb12cf99d302b9f00c48ae1e8",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -871,7 +871,7 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Workforce"
+ "text": "required of"
},
{
"type": "NarrativeText",
@@ -1083,8 +1083,8 @@
"text": "b)"
},
{
- "type": "NarrativeText",
- "element_id": "1117af46b0a22dd02d3869ab9738a8a8",
+ "type": "Title",
+ "element_id": "6b847a0ed0b2c484c73f2749e29b4db5",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -1099,11 +1099,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis applied to the descriptions of required offered under the BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A."
+ "text": "into"
},
{
- "type": "Title",
- "element_id": "6b847a0ed0b2c484c73f2749e29b4db5",
+ "type": "NarrativeText",
+ "element_id": "1117af46b0a22dd02d3869ab9738a8a8",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -1118,7 +1118,7 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "into"
+ "text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis applied to the descriptions of required offered under the BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A."
},
{
"type": "NarrativeText",
@@ -1197,8 +1197,8 @@
"text": "c)"
},
{
- "type": "NarrativeText",
- "element_id": "961a38da2886c3cc25091d912769aa0d",
+ "type": "Title",
+ "element_id": "6d0607a7a2ac9823f9fb2a62ea2b7385",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -1213,7 +1213,7 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "job job government (8.5%), (42.4%), industry (83.9%), and nonprofit (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting single ad."
+ "text": "Desired"
},
{
"type": "NarrativeText",
@@ -1235,8 +1235,8 @@
"text": "important skills that were mentioned multiple times in"
},
{
- "type": "Title",
- "element_id": "6d0607a7a2ac9823f9fb2a62ea2b7385",
+ "type": "NarrativeText",
+ "element_id": "961a38da2886c3cc25091d912769aa0d",
"metadata": {
"data_source": {
"url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
@@ -1251,7 +1251,7 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Desired"
+ "text": "job job government (8.5%), (42.4%), industry (83.9%), and nonprofit (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting single ad."
},
{
"type": "Title",
diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json
index 4355d36569..9a30d93103 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json
@@ -229,7 +229,7 @@
},
{
"type": "NarrativeText",
- "element_id": "eb076cfd3d47e546c28611750afedc49",
+ "element_id": "0b320308ba52d4a9625d29cadfc941a9",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -244,11 +244,11 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and"
+ "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required. Disregard the instructions under Time and"
},
{
"type": "NarrativeText",
- "element_id": "0b320308ba52d4a9625d29cadfc941a9",
+ "element_id": "eb076cfd3d47e546c28611750afedc49",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -263,7 +263,7 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required. Disregard the instructions under Time and"
+ "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and"
},
{
"type": "NarrativeText",
@@ -305,7 +305,7 @@
},
{
"type": "Title",
- "element_id": "af8bdf713f162b09567c8d1a3a2d4de7",
+ "element_id": "5756fb398995bb6518a87637f24f426e",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -320,11 +320,11 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change."
+ "text": "Time and Place for Filing"
},
{
"type": "Title",
- "element_id": "5756fb398995bb6518a87637f24f426e",
+ "element_id": "af8bdf713f162b09567c8d1a3a2d4de7",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -339,7 +339,7 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Time and Place for Filing"
+ "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change."
},
{
"type": "NarrativeText",
@@ -494,8 +494,8 @@
"text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block."
},
{
- "type": "ListItem",
- "element_id": "ede9004eceddf828c2c928f62d0687a0",
+ "type": "Title",
+ "element_id": "f1a73e2204a114077f988c9da98d7f8b",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -510,11 +510,11 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Signature Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. If the individual or firm is also authorized to"
+ "text": "Signature"
},
{
- "type": "Title",
- "element_id": "f1a73e2204a114077f988c9da98d7f8b",
+ "type": "ListItem",
+ "element_id": "ede9004eceddf828c2c928f62d0687a0",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -529,7 +529,7 @@
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Signature"
+ "text": "Signature Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. If the individual or firm is also authorized to"
},
{
"type": "Title",
@@ -704,7 +704,7 @@
},
{
"type": "NarrativeText",
- "element_id": "751abc8c6a0fa412c3e8c18345f57f95",
+ "element_id": "678ecc0340dc8848f891bf12a555a3fd",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -719,11 +719,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable."
+ "text": "If IRS later examines your return for the year of the change or for later years, it has the right to verify your statement at that time."
},
{
"type": "NarrativeText",
- "element_id": "678ecc0340dc8848f891bf12a555a3fd",
+ "element_id": "751abc8c6a0fa412c3e8c18345f57f95",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -738,11 +738,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "If IRS later examines your return for the year of the change or for later years, it has the right to verify your statement at that time."
+ "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable."
},
{
- "type": "Title",
- "element_id": "136a59b0c53731bc299206fda46e0888",
+ "type": "NarrativeText",
+ "element_id": "64758ada28beed36481b14ce8dc67472",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -757,11 +757,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Section B-1"
+ "text": "substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death. (3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period. For more information, see section 448 and Temporary Regulations section 1.448-1T."
},
{
- "type": "NarrativeText",
- "element_id": "e4a695ea83818204438fe08add6d1554",
+ "type": "Title",
+ "element_id": "53e33d10c9df4a570490182ccef0cd95",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -776,11 +776,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application."
+ "text": "Section C"
},
{
"type": "Title",
- "element_id": "f63f53aab435b8c9789ab7d6b982db3f",
+ "element_id": "8d6743276d5bc8e32d0b05ba0b232db8",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -795,11 +795,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Sections B-2 and B-3"
+ "text": "Section E"
},
{
- "type": "Title",
- "element_id": "4688916bf1d6b205af02a0e954156688",
+ "type": "ListItem",
+ "element_id": "86fab9f7b35d56a2d48baf0782b7c53d",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -814,11 +814,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Limitation on the Use of the Cash Method of Accounting. —Except as provided below, C"
+ "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete."
},
{
- "type": "NarrativeText",
- "element_id": "aaf93c2be8f4f2db87bd760783fedfa5",
+ "type": "ListItem",
+ "element_id": "84cea2af17bb3760234b42f4ea78e175",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -833,11 +833,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities."
+ "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts."
},
{
- "type": "NarrativeText",
- "element_id": "e5bed7fe04dd22cabe5e5c0362d37743",
+ "type": "Title",
+ "element_id": "136a59b0c53731bc299206fda46e0888",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -852,11 +852,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—"
+ "text": "Section B-1"
},
{
- "type": "ListItem",
- "element_id": "69bd87b2ad5873c030748e62adf61b89",
+ "type": "NarrativeText",
+ "element_id": "e4a695ea83818204438fe08add6d1554",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -871,11 +871,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method."
+ "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application."
},
{
- "type": "NarrativeText",
- "element_id": "0607edfa2419dd0cdc80f457872fe238",
+ "type": "Title",
+ "element_id": "f63f53aab435b8c9789ab7d6b982db3f",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -890,11 +890,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(2) Qualified personal service corporations. — A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law,"
+ "text": "Sections B-2 and B-3"
},
{
- "type": "NarrativeText",
- "element_id": "50d16fd6b40a428c3befaf6dd19c2dcd",
+ "type": "Title",
+ "element_id": "4688916bf1d6b205af02a0e954156688",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -909,11 +909,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)"
+ "text": "Limitation on the Use of the Cash Method of Accounting. —Except as provided below, C"
},
{
"type": "NarrativeText",
- "element_id": "64758ada28beed36481b14ce8dc67472",
+ "element_id": "aaf93c2be8f4f2db87bd760783fedfa5",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -928,11 +928,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death. (3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period. For more information, see section 448 and Temporary Regulations section 1.448-1T."
+ "text": "corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities."
},
{
- "type": "Title",
- "element_id": "53e33d10c9df4a570490182ccef0cd95",
+ "type": "NarrativeText",
+ "element_id": "e5bed7fe04dd22cabe5e5c0362d37743",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -947,11 +947,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Section C"
+ "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—"
},
{
- "type": "NarrativeText",
- "element_id": "6d2d2cfa00e5a8caec71ba799f60f8c6",
+ "type": "ListItem",
+ "element_id": "69bd87b2ad5873c030748e62adf61b89",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -966,11 +966,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change. Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8."
+ "text": "(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method."
},
{
"type": "NarrativeText",
- "element_id": "357d52f500b965abc29ea60039de4fd8",
+ "element_id": "6d2d2cfa00e5a8caec71ba799f60f8c6",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -985,11 +985,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:"
+ "text": "Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change. Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8."
},
{
"type": "NarrativeText",
- "element_id": "1ac3e7aa5a6139bd80f05a7ac1f63ddf",
+ "element_id": "357d52f500b965abc29ea60039de4fd8",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1004,11 +1004,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market). (2) State whether the proposed identification and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726."
+ "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:"
},
{
"type": "NarrativeText",
- "element_id": "6028c579dc843bb5aa2c704f46085914",
+ "element_id": "1ac3e7aa5a6139bd80f05a7ac1f63ddf",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1023,11 +1023,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(3) The termination event statement required by section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event."
+ "text": "(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market). (2) State whether the proposed identification and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726."
},
{
- "type": "Title",
- "element_id": "92e21a61e1d872dbbe3e3221a920b409",
+ "type": "NarrativeText",
+ "element_id": "6028c579dc843bb5aa2c704f46085914",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1042,11 +1042,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Section D"
+ "text": "(3) The termination event statement required by section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event."
},
{
- "type": "NarrativeText",
- "element_id": "a8e72799229bc2d754f44ea167a6e7d6",
+ "type": "Title",
+ "element_id": "92e21a61e1d872dbbe3e3221a920b409",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1061,11 +1061,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Applicants requesting to change their method of valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods."
+ "text": "Section D"
},
{
"type": "Title",
- "element_id": "8d6743276d5bc8e32d0b05ba0b232db8",
+ "element_id": "32786e68a6fd82dc356d2d58bf283dc4",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1080,11 +1080,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Section E"
+ "text": "Section G"
},
{
- "type": "ListItem",
- "element_id": "86fab9f7b35d56a2d48baf0782b7c53d",
+ "type": "NarrativeText",
+ "element_id": "fa41a857716f30d6bbee384eada72a90",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1099,11 +1099,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete."
+ "text": "This section Is to be used only to request a change in a method of accounting for depreciation under section 167. Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change. Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually)."
},
{
- "type": "ListItem",
- "element_id": "84cea2af17bb3760234b42f4ea78e175",
+ "type": "Title",
+ "element_id": "a8155ab3bed92cc259ab58331619e0e1",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1118,11 +1118,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts."
+ "text": "Section H"
},
{
- "type": "Title",
- "element_id": "32786e68a6fd82dc356d2d58bf283dc4",
+ "type": "NarrativeText",
+ "element_id": "cb1f664a186a87f6560cde136d70b558",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1137,11 +1137,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Section G"
+ "text": "Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested."
},
{
"type": "NarrativeText",
- "element_id": "fa41a857716f30d6bbee384eada72a90",
+ "element_id": "86d11953bb813a770ecd242ff97d4e43",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1156,11 +1156,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "This section Is to be used only to request a change in a method of accounting for depreciation under section 167. Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change. Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually)."
+ "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10."
},
{
- "type": "Title",
- "element_id": "a8155ab3bed92cc259ab58331619e0e1",
+ "type": "NarrativeText",
+ "element_id": "0607edfa2419dd0cdc80f457872fe238",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1175,11 +1175,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Section H"
+ "text": "(2) Qualified personal service corporations. — A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law,"
},
{
"type": "NarrativeText",
- "element_id": "cb1f664a186a87f6560cde136d70b558",
+ "element_id": "50d16fd6b40a428c3befaf6dd19c2dcd",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1194,11 +1194,11 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested."
+ "text": "engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)"
},
{
"type": "NarrativeText",
- "element_id": "86d11953bb813a770ecd242ff97d4e43",
+ "element_id": "a8e72799229bc2d754f44ea167a6e7d6",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.pdf",
@@ -1213,7 +1213,7 @@
"filetype": "application/pdf",
"page_number": 2
},
- "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10."
+ "text": "Applicants requesting to change their method of valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods."
},
{
"type": "UncategorizedText",
diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json
index f89aa759ad..5afaa3fefc 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json
@@ -266,8 +266,8 @@
"text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986"
},
{
- "type": "NarrativeText",
- "element_id": "b07efea243933525e9ec04a90622508d",
+ "type": "Title",
+ "element_id": "11c98a9cbd6a200fbc5b93fed15007ac",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
@@ -282,11 +282,11 @@
"filetype": "image/png",
"page_number": 1
},
- "text": "cash method.—If you are required to change your method of accounting under section, 263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (imiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\"Act\"), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to cchange from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required"
+ "text": "Uniform capitalization rules and limitation on"
},
{
- "type": "Title",
- "element_id": "11c98a9cbd6a200fbc5b93fed15007ac",
+ "type": "NarrativeText",
+ "element_id": "b07efea243933525e9ec04a90622508d",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
@@ -301,7 +301,7 @@
"filetype": "image/png",
"page_number": 1
},
- "text": "Uniform capitalization rules and limitation on"
+ "text": "cash method.—If you are required to change your method of accounting under section, 263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (imiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\"Act\"), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to cchange from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required"
},
{
"type": "NarrativeText",
@@ -475,8 +475,8 @@
"text": "If your application is filed after the 180-day period, itis late. The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev. Proc. 79-63."
},
{
- "type": "NarrativeText",
- "element_id": "ec3c2d03b846d2a186fc9a8f318f688b",
+ "type": "Title",
+ "element_id": "025a65465b6fd9635316e92633b24c7e",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
@@ -491,11 +491,11 @@
"filetype": "image/png",
"page_number": 1
},
- "text": "Individuals. —An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both."
+ "text": "Identifying Number"
},
{
- "type": "Title",
- "element_id": "025a65465b6fd9635316e92633b24c7e",
+ "type": "NarrativeText",
+ "element_id": "ec3c2d03b846d2a186fc9a8f318f688b",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
@@ -510,7 +510,7 @@
"filetype": "image/png",
"page_number": 1
},
- "text": "Identifying Number"
+ "text": "Individuals. —An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both."
},
{
"type": "NarrativeText",
@@ -532,8 +532,8 @@
"text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
},
{
- "type": "ListItem",
- "element_id": "f8e8c87d2e958a23153d7f25b159f0ee",
+ "type": "Title",
+ "element_id": "55d4f33b09f24dd3b27865a5f34bfeb9",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
@@ -548,11 +548,11 @@
"filetype": "image/png",
"page_number": 1
},
- "text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6."
+ "text": "Signature tea"
},
{
- "type": "Title",
- "element_id": "55d4f33b09f24dd3b27865a5f34bfeb9",
+ "type": "ListItem",
+ "element_id": "f8e8c87d2e958a23153d7f25b159f0ee",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
@@ -567,7 +567,7 @@
"filetype": "image/png",
"page_number": 1
},
- "text": "Signature tea"
+ "text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6."
},
{
"type": "NarrativeText",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
index af074dbe60..9c98b4af47 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
@@ -580,14 +580,14 @@
"text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES."
},
{
- "type": "UncategorizedText",
- "element_id": "bbf3f11cb5b43e700273a78d12de55e4",
+ "type": "Title",
+ "element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "%"
+ "text": "i"
},
{
"type": "NarrativeText",
@@ -600,24 +600,24 @@
"text": ") r a e y / m m"
},
{
- "type": "UncategorizedText",
- "element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
+ "type": "NarrativeText",
+ "element_id": "49e7364ce1027887460959b2a757b184",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "("
+ "text": "( e t a r n o s o r r o C"
},
{
"type": "NarrativeText",
- "element_id": "49e7364ce1027887460959b2a757b184",
+ "element_id": "74599fca46202613cccb12e97774b306",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "( e t a r n o s o r r o C"
+ "text": "E n o i t i b h n I"
},
{
"type": "Title",
@@ -639,55 +639,45 @@
},
"text": "i"
},
- {
- "type": "UncategorizedText",
- "element_id": "ba5ec51d07a4ac0e951608704431d59a",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 3
- },
- "text": ")"
- },
{
"type": "NarrativeText",
- "element_id": "74599fca46202613cccb12e97774b306",
+ "element_id": "bbe120714b80df07396e808f98b3f354",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "E n o i t i b h n I"
+ "text": "y c n e c i f f"
},
{
- "type": "Title",
- "element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
+ "type": "UncategorizedText",
+ "element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "i"
+ "text": "("
},
{
- "type": "NarrativeText",
- "element_id": "bbe120714b80df07396e808f98b3f354",
+ "type": "UncategorizedText",
+ "element_id": "bbf3f11cb5b43e700273a78d12de55e4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "y c n e c i f f"
+ "text": "%"
},
{
"type": "UncategorizedText",
- "element_id": "525fbe4b6760bd759bfeeae2ee487f12",
+ "element_id": "ba5ec51d07a4ac0e951608704431d59a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1"
+ "text": ")"
},
{
"type": "UncategorizedText",
@@ -731,23 +721,23 @@
},
{
"type": "UncategorizedText",
- "element_id": "4a44dc15364204a80fe80e9039455cc1",
+ "element_id": "525fbe4b6760bd759bfeeae2ee487f12",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "10"
+ "text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1"
},
{
"type": "UncategorizedText",
- "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
+ "element_id": "4a44dc15364204a80fe80e9039455cc1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "20"
+ "text": "10"
},
{
"type": "UncategorizedText",
@@ -761,73 +751,73 @@
},
{
"type": "UncategorizedText",
- "element_id": "d59eced1ded07f84c145592f65bdf854",
+ "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "40"
+ "text": "20"
},
{
"type": "UncategorizedText",
- "element_id": "1a6562590ef19d1045d06c4055742d38",
+ "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "50"
+ "text": "90"
},
{
"type": "UncategorizedText",
- "element_id": "ff5a1ae012afa5d4c889c50ad427aaf5",
+ "element_id": "d59eced1ded07f84c145592f65bdf854",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "70"
+ "text": "40"
},
{
"type": "UncategorizedText",
- "element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
+ "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "60"
+ "text": "80"
},
{
"type": "UncategorizedText",
- "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
+ "element_id": "1a6562590ef19d1045d06c4055742d38",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "80"
+ "text": "50"
},
{
"type": "UncategorizedText",
- "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63",
+ "element_id": "ff5a1ae012afa5d4c889c50ad427aaf5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "90"
+ "text": "70"
},
{
"type": "UncategorizedText",
- "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "0"
+ "text": "60"
},
{
"type": "UncategorizedText",
@@ -939,6 +929,16 @@
},
"text": "2g 4g 6g 8g 10g"
},
+ {
+ "type": "UncategorizedText",
+ "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 3
+ },
+ "text": "0"
+ },
{
"type": "UncategorizedText",
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
@@ -1110,14 +1110,14 @@
"text": "Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution."
},
{
- "type": "Table",
- "element_id": "9270ab0a1b3ba26a16991abcd0b45dfe",
+ "type": "UncategorizedText",
+ "element_id": "9492908fadeab22ca81f18f2ba4f4f35",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919"
+ "text": "0 2 4 6 8 10"
},
{
"type": "Title",
@@ -1129,15 +1129,25 @@
},
"text": "Inhibitor concentration (g)"
},
+ {
+ "type": "Table",
+ "element_id": "9270ab0a1b3ba26a16991abcd0b45dfe",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 4
+ },
+ "text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919"
+ },
{
"type": "UncategorizedText",
- "element_id": "9492908fadeab22ca81f18f2ba4f4f35",
+ "element_id": "12751f842ba5664e7ad255016dbe371b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "0 2 4 6 8 10"
+ "text": "0.0335 1.9460 0.0163 0.3233 0.1240 0.0382"
},
{
"type": "Title",
@@ -1151,13 +1161,13 @@
},
{
"type": "UncategorizedText",
- "element_id": "12751f842ba5664e7ad255016dbe371b",
+ "element_id": "727d4758bcfadaaf5156b8682cd39810",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "0.0335 1.9460 0.0163 0.3233 0.1240 0.0382"
+ "text": "0.0409 0.0596 0.2369 0.0540 0.0556 0.0086"
},
{
"type": "Title",
@@ -1170,34 +1180,34 @@
"text": "ba (V/dec)"
},
{
- "type": "UncategorizedText",
- "element_id": "727d4758bcfadaaf5156b8682cd39810",
+ "type": "Title",
+ "element_id": "7bc31ed7ab5a625735657499f636c1f2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "0.0409 0.0596 0.2369 0.0540 0.0556 0.0086"
+ "text": "Ecorr (V)"
},
{
- "type": "Title",
- "element_id": "7bc31ed7ab5a625735657499f636c1f2",
+ "type": "UncategorizedText",
+ "element_id": "2a789110c863b30156d63234c8a51477",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Ecorr (V)"
+ "text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356"
},
{
"type": "UncategorizedText",
- "element_id": "2a789110c863b30156d63234c8a51477",
+ "element_id": "d71f426079cb8c2bb3d960ce1e23d290",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356"
+ "text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05"
},
{
"type": "Title",
@@ -1211,13 +1221,13 @@
},
{
"type": "UncategorizedText",
- "element_id": "d71f426079cb8c2bb3d960ce1e23d290",
+ "element_id": "1695e2ad2c62a337b135afbfc79ef69d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05"
+ "text": "24.0910 121.440 42.121 373.180 305.650 246.080"
},
{
"type": "Title",
@@ -1231,13 +1241,13 @@
},
{
"type": "UncategorizedText",
- "element_id": "1695e2ad2c62a337b135afbfc79ef69d",
+ "element_id": "48bbf8e8b874e0e1f32be15f6c07c11c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "24.0910 121.440 42.121 373.180 305.650 246.080"
+ "text": "2.8163 1.5054 0.9476 0.4318 0.3772 0.0919"
},
{
"type": "Title",
@@ -1249,16 +1259,6 @@
},
"text": "Corrosion rate (mm/year)"
},
- {
- "type": "UncategorizedText",
- "element_id": "48bbf8e8b874e0e1f32be15f6c07c11c",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 4
- },
- "text": "2.8163 1.5054 0.9476 0.4318 0.3772 0.0919"
- },
{
"type": "NarrativeText",
"element_id": "ef5851c1e7629b7329ac014d7fb9e9e1",
@@ -1331,33 +1331,33 @@
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "a0dfa682f99b0794f40f195f9a7adfcd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2"
+ "text": "—=—Cc/0 2+ T T T 1"
},
{
"type": "UncategorizedText",
- "element_id": "a0dfa682f99b0794f40f195f9a7adfcd",
+ "element_id": "1797d9b8b07f302836186c20a19ebd0b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "—=—Cc/0 2+ T T T 1"
+ "text": "C/0"
},
{
"type": "UncategorizedText",
- "element_id": "1797d9b8b07f302836186c20a19ebd0b",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "C/0"
+ "text": "2"
},
{
"type": "UncategorizedText",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
index 59ec34c634..d7bdce8ec2 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
@@ -200,34 +200,34 @@
"text": "Specifications table"
},
{
- "type": "NarrativeText",
- "element_id": "5c3978ebc42ea4f11240c221ac3be1cf",
+ "type": "Title",
+ "element_id": "41e0fa358cefcadbb2633ec45ff2d129",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired"
+ "text": "Data format Experimental factors"
},
{
"type": "Title",
- "element_id": "41e0fa358cefcadbb2633ec45ff2d129",
+ "element_id": "27d70c97431a2bec06d0a89368489dfb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Data format Experimental factors"
+ "text": "Experimental features Data source location Data accessibility Related research article"
},
{
- "type": "Title",
- "element_id": "27d70c97431a2bec06d0a89368489dfb",
+ "type": "NarrativeText",
+ "element_id": "5c3978ebc42ea4f11240c221ac3be1cf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Experimental features Data source location Data accessibility Related research article"
+ "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired"
},
{
"type": "ListItem",
@@ -259,16 +259,6 @@
},
"text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the"
},
- {
- "type": "NarrativeText",
- "element_id": "7c8bc2811f71480b433eb6fee2a3bb33",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 2
- },
- "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing"
- },
{
"type": "Title",
"element_id": "bd7d750cb9f652c80c17a264072b8858",
@@ -281,13 +271,13 @@
},
{
"type": "NarrativeText",
- "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb",
+ "element_id": "7c8bc2811f71480b433eb6fee2a3bb33",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can"
+ "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing"
},
{
"type": "Title",
@@ -301,13 +291,13 @@
},
{
"type": "NarrativeText",
- "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f",
+ "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes."
+ "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can"
},
{
"type": "NarrativeText",
@@ -319,6 +309,16 @@
},
"text": "be used for the comparison."
},
+ {
+ "type": "NarrativeText",
+ "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 2
+ },
+ "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes."
+ },
{
"type": "ListItem",
"element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa",
@@ -340,24 +340,24 @@
"text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number"
},
{
- "type": "NarrativeText",
- "element_id": "a18c70d23b71c51ddfe33311232c241c",
+ "type": "Title",
+ "element_id": "10c22bcf4c768b515be4e94bcafc71bf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, (m,n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net."
+ "text": "for"
},
{
- "type": "Title",
- "element_id": "10c22bcf4c768b515be4e94bcafc71bf",
+ "type": "NarrativeText",
+ "element_id": "a18c70d23b71c51ddfe33311232c241c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "for"
+ "text": "‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, (m,n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net."
},
{
"type": "UncategorizedText",
@@ -621,23 +621,23 @@
},
{
"type": "NarrativeText",
- "element_id": "e731dc92fddc0512e142bfb2bed62bbf",
+ "element_id": "1c59f2a7ce8a3fa55810df93d58e636e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ."
+ "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule."
},
{
- "type": "NarrativeText",
- "element_id": "1c59f2a7ce8a3fa55810df93d58e636e",
+ "type": "Title",
+ "element_id": "252f10c83610ebca1a059c0bae8255eb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule."
+ "text": "f"
},
{
"type": "NarrativeText",
@@ -649,16 +649,6 @@
},
"text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l"
},
- {
- "type": "Title",
- "element_id": "252f10c83610ebca1a059c0bae8255eb",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 3
- },
- "text": "f"
- },
{
"type": "UncategorizedText",
"element_id": "89507815c6b4a6f31e6d3da7fca6b561",
@@ -689,6 +679,16 @@
},
"text": "."
},
+ {
+ "type": "NarrativeText",
+ "element_id": "e731dc92fddc0512e142bfb2bed62bbf",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 3
+ },
+ "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ."
+ },
{
"type": "UncategorizedText",
"element_id": "d8e33a2b60213fb3cebaf5c3a36b0b63",
@@ -700,14 +700,14 @@
"text": "Table 1 Average number of locations, times, vehicles and empty travels for each instance size."
},
{
- "type": "Table",
- "element_id": "1d8fd023cd0978f7a6500815d2ad0ef6",
+ "type": "UncategorizedText",
+ "element_id": "6d1f07a97479928ee102d525dd11d2d7",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "Instance size (m, n) Average number of Locations Times Vehicles Possible empty travels (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 ) (16, 3000 1087.20 1101.60 1284.60 2,684,983.60"
+ "text": "(8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000)"
},
{
"type": "Title",
@@ -720,24 +720,24 @@
"text": "Instance size (m, n)"
},
{
- "type": "UncategorizedText",
- "element_id": "6d1f07a97479928ee102d525dd11d2d7",
+ "type": "Table",
+ "element_id": "1d8fd023cd0978f7a6500815d2ad0ef6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "(8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000)"
+ "text": "Instance size (m, n) Average number of Locations Times Vehicles Possible empty travels (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 ) (16, 3000 1087.20 1101.60 1284.60 2,684,983.60"
},
{
- "type": "Title",
- "element_id": "47a68d3aa70030f2e7886e3f1cb07c69",
+ "type": "UncategorizedText",
+ "element_id": "1cb85e5f94671526c0cf38dc533f87e0",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "Average number of"
+ "text": "568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20"
},
{
"type": "Title",
@@ -750,14 +750,14 @@
"text": "Locations"
},
{
- "type": "UncategorizedText",
- "element_id": "1cb85e5f94671526c0cf38dc533f87e0",
+ "type": "Title",
+ "element_id": "47a68d3aa70030f2e7886e3f1cb07c69",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20"
+ "text": "Average number of"
},
{
"type": "Title",
@@ -800,24 +800,24 @@
"text": "652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60"
},
{
- "type": "Title",
- "element_id": "68ec9a56bde1cd8ea67340bf9cb829cb",
+ "type": "UncategorizedText",
+ "element_id": "4a30645cb68832ec26e551345d9cff0a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "Possible empty travels"
+ "text": "668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60"
},
{
- "type": "UncategorizedText",
- "element_id": "4a30645cb68832ec26e551345d9cff0a",
+ "type": "Title",
+ "element_id": "68ec9a56bde1cd8ea67340bf9cb829cb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60"
+ "text": "Possible empty travels"
},
{
"type": "NarrativeText",
@@ -920,24 +920,24 @@
"text": "l"
},
{
- "type": "NarrativeText",
- "element_id": "78f6ff03dfac8dfb7f319de1e369590d",
+ "type": "Title",
+ "element_id": "336074805fc853987abe6f7fe3ad97a6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rg at each depot d. One line for each trip, i= 1,2, ...,n. Each line provides the start location and the end time ¢¢ for the corresponding trip. Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j."
+ "text": "time"
},
{
- "type": "Title",
- "element_id": "336074805fc853987abe6f7fe3ad97a6",
+ "type": "NarrativeText",
+ "element_id": "78f6ff03dfac8dfb7f319de1e369590d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "time"
+ "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rg at each depot d. One line for each trip, i= 1,2, ...,n. Each line provides the start location and the end time ¢¢ for the corresponding trip. Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j."
},
{
"type": "Title",
@@ -1081,23 +1081,23 @@
},
{
"type": "NarrativeText",
- "element_id": "16c341408703257ff517dcc76140e2c0",
+ "element_id": "c4f2c64b5f38feaa921647abceebaec8",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling"
+ "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487."
},
{
"type": "NarrativeText",
- "element_id": "c4f2c64b5f38feaa921647abceebaec8",
+ "element_id": "16c341408703257ff517dcc76140e2c0",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487."
+ "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling"
},
{
"type": "UncategorizedText",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
index c96928b601..24ce361e7b 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
@@ -60,34 +60,34 @@
"text": "S6. SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NAÏVE, FIRST EPISODE PSYCHOSIS PATIENTS"
},
{
- "type": "UncategorizedText",
- "element_id": "e97f1cf1c49f397732e68cf1efb2355e",
+ "type": "NarrativeText",
+ "element_id": "d981d6dfaa8794c0bb733db0965b2831",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy"
+ "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford"
},
{
- "type": "NarrativeText",
- "element_id": "1252f8d8921acac5f706e4402e504a75",
+ "type": "UncategorizedText",
+ "element_id": "e97f1cf1c49f397732e68cf1efb2355e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC."
+ "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy"
},
{
"type": "NarrativeText",
- "element_id": "d981d6dfaa8794c0bb733db0965b2831",
+ "element_id": "1252f8d8921acac5f706e4402e504a75",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford"
+ "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC."
},
{
"type": "NarrativeText",
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
index 6f3354a254..30302a3ffa 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -49,6 +49,16 @@
},
"text": "1 2"
},
+ {
+ "type": "ListItem",
+ "element_id": "4fcc5b6364213b1efa9272bdce4f9fcd",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 1
+ },
+ "text": "1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca"
+ },
{
"type": "UncategorizedText",
"element_id": "cfae0d4248f7142f7b17f826cd7a5192",
@@ -79,16 +89,6 @@
},
"text": "2 v 8 4 3 5 1 . 3 0 1 2 : v i X r a"
},
- {
- "type": "ListItem",
- "element_id": "4fcc5b6364213b1efa9272bdce4f9fcd",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 1
- },
- "text": "1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca"
- },
{
"type": "NarrativeText",
"element_id": "be90d2640470e975e3402d19ba2c66cf",
@@ -241,23 +241,23 @@
},
{
"type": "Title",
- "element_id": "c7f4b9a2c7b93fdcc32112de7d9563ba",
+ "element_id": "50f59772d4134ececeaf37069d480784",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "recognition, and other DIA tasks (Section 3)"
+ "text": "underlies the off-the-shelf usage"
},
{
"type": "Title",
- "element_id": "50f59772d4134ececeaf37069d480784",
+ "element_id": "c7f4b9a2c7b93fdcc32112de7d9563ba",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "underlies the off-the-shelf usage"
+ "text": "recognition, and other DIA tasks (Section 3)"
},
{
"type": "NarrativeText",
@@ -301,23 +301,23 @@
},
{
"type": "NarrativeText",
- "element_id": "9b8fc4816306f4f1b31874d53134979b",
+ "element_id": "74a7758f83612467af8eea9d20e4a6f7",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "The rest of the paper is organized as follows. Section 2 provides an overview of related work. The core LayoutParser library, DL Model Zoo, and customized model training are described in Section 3, and the DL model hub and commu- nity platform are detailed in Section 4. Section 5 shows two examples of how LayoutParser can be used in practical DIA projects, and Section 6 concludes."
+ "text": "that require precision, efficiency, and robustness, as well as simple and light- weight document processing tasks focusing on efficacy and flexibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned."
},
{
"type": "NarrativeText",
- "element_id": "74a7758f83612467af8eea9d20e4a6f7",
+ "element_id": "9b8fc4816306f4f1b31874d53134979b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "that require precision, efficiency, and robustness, as well as simple and light- weight document processing tasks focusing on efficacy and flexibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned."
+ "text": "The rest of the paper is organized as follows. Section 2 provides an overview of related work. The core LayoutParser library, DL Model Zoo, and customized model training are described in Section 3, and the DL model hub and commu- nity platform are detailed in Section 4. Section 5 shows two examples of how LayoutParser can be used in practical DIA projects, and Section 6 concludes."
},
{
"type": "Title",
@@ -470,15 +470,14 @@
"text": "5"
},
{
- "type": "Table",
- "element_id": "34923b77ca76e1808956ade5e766f7c2",
+ "type": "NarrativeText",
+ "element_id": "b51f99cb953082a922ba43c09d4492b3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
- "page_number": 5,
- "text_as_html": "
Dataset | | Base Model'| | Large Model | Notes | PubLayNet B8]| | F/M | M | Layouts of modern scientific documents |
| M | - | Layouts of scanned modern magazines and scientific reports |
| F | - | Layouts of scanned US newspapers from the 20th century |
TableBank | F | F | nd business document. Table region on modern scientific |
HJDataset | F/M | - | Layouts of history Japanese documents |
"
+ "page_number": 5
},
- "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents"
+ "text": "Table 1: Current layout detection models in the LayoutParser model zoo"
},
{
"type": "NarrativeText",
@@ -491,14 +490,15 @@
"text": "PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]"
},
{
- "type": "NarrativeText",
- "element_id": "b51f99cb953082a922ba43c09d4492b3",
+ "type": "Table",
+ "element_id": "34923b77ca76e1808956ade5e766f7c2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
- "page_number": 5
+ "page_number": 5,
+ "text_as_html": "Dataset | | Base Model'| | Large Model | Notes | PubLayNet B8]| | F/M | M | Layouts of modern scientific documents |
| M | - | Layouts of scanned modern magazines and scientific reports |
| F | - | Layouts of scanned US newspapers from the 20th century |
TableBank | F | F | nd business document. Table region on modern scientific |
HJDataset | F/M | - | Layouts of history Japanese documents |
"
},
- "text": "Table 1: Current layout detection models in the LayoutParser model zoo"
+ "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents"
},
{
"type": "Title",
@@ -561,34 +561,34 @@
"text": "1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months."
},
{
- "type": "NarrativeText",
- "element_id": "11dff8778699e76422be6b86c9eaa62a",
+ "type": "Title",
+ "element_id": "9f26ca353a2c130a2e32f457d71c1350",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
- "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:"
+ "text": "3.1 Layout Detection Models"
},
{
"type": "NarrativeText",
- "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9",
+ "element_id": "11dff8778699e76422be6b86c9eaa62a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
- "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component."
+ "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:"
},
{
- "type": "Title",
- "element_id": "9f26ca353a2c130a2e32f457d71c1350",
+ "type": "NarrativeText",
+ "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
- "text": "3.1 Layout Detection Models"
+ "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component."
},
{
"type": "NarrativeText",
@@ -661,34 +661,34 @@
"text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff"
},
{
- "type": "NarrativeText",
- "element_id": "cafae07120d714f0822e89865adf62da",
+ "type": "Title",
+ "element_id": "acd4f4584a990134d927e19b6d7e5f88",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
- "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility."
+ "text": "3.2 Layout Data Structures"
},
{
"type": "NarrativeText",
- "element_id": "7461d30ee7c51c91bca8003792d43bfe",
+ "element_id": "cafae07120d714f0822e89865adf62da",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
- "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 different datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5)."
+ "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility."
},
{
- "type": "Title",
- "element_id": "acd4f4584a990134d927e19b6d7e5f88",
+ "type": "NarrativeText",
+ "element_id": "7461d30ee7c51c91bca8003792d43bfe",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
- "text": "3.2 Layout Data Structures"
+ "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 different datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5)."
},
{
"type": "NarrativeText",
@@ -721,34 +721,34 @@
"text": "7"
},
{
- "type": "NarrativeText",
- "element_id": "e284bd66511cfa064681253e7ac57a9a",
+ "type": "Title",
+ "element_id": "89c6cd1d893f782ea68d75737e3393fd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 7
},
- "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:"
+ "text": "3.3 OCR"
},
{
"type": "NarrativeText",
- "element_id": "eec800eef6e395c21feacd729868dd18",
+ "element_id": "e284bd66511cfa064681253e7ac57a9a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort."
+ "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:"
},
{
- "type": "Title",
- "element_id": "89c6cd1d893f782ea68d75737e3393fd",
+ "type": "NarrativeText",
+ "element_id": "eec800eef6e395c21feacd729868dd18",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 7
},
- "text": "3.3 OCR"
+ "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort."
},
{
"type": "NarrativeText",
@@ -831,115 +831,115 @@
"text": "Table 2: All operations supported by the layout elements. The same APIs are supported across different layout element classes including Coordinate types, TextBlock and Layout."
},
{
- "type": "Table",
- "element_id": "f81d4915b54758e0d4d52af3566bb813",
+ "type": "Title",
+ "element_id": "abf4059c5c98ff5bbd0dde9f8c2b7c75",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
- "page_number": 8,
- "text_as_html": "Operation Name | | | Description | block.pad(top, bottom, | right, | left) | | Enlarge the current block according to the input |
block.scale(fx, fy) | | | Scale the current block given the ratio ; in x and y direction |
. block.shift(dx, dy) | | | Move the current block with the shift : : a distances in x and y direction |
block1.is_in(block2) | | | Whether block] is inside of block2 |
. block1. intersect (block2) | | | Return the intersection region of block1 and block2. . . . Coordinate type to be determined based on the inputs. |
. block1.union(block2) | | | Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs. |
. block1.relative_to(block2) | | | Convert the absolute coordinates of block to ' ' relative coordinates to block2 |
. block1.condition_on(block2) | | | Calculate the absolute coordinates of blockl given . the canvas block2’s absolute coordinates |
block. crop_image (image) | | | Obtain the image segments in the block region |
"
+ "page_number": 8
},
- "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. crop_image (image) Obtain the image segments in the block region"
+ "text": "Operation Name"
},
{
"type": "Title",
- "element_id": "2092f29df87c3cfd32244b325faaba33",
+ "element_id": "505791f52a5741b58f5dd02836da7b31",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block1.condition on(block2)"
+ "text": "block1.union(block2)"
},
{
"type": "Title",
- "element_id": "aac9bbf1c375a005651b5d2929778d3b",
+ "element_id": "acfa5090fbb8986000a92d84d41d8140",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block1.relative to(block2)"
+ "text": "block1.is in(block2)"
},
{
"type": "Title",
- "element_id": "505791f52a5741b58f5dd02836da7b31",
+ "element_id": "8dcb74f5ee2eabd0d8e966d46bcdf3be",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block1.union(block2)"
+ "text": "block.scale(fx, fy)"
},
{
"type": "Title",
- "element_id": "39fca1b21a889218bd84127a4d7f27c5",
+ "element_id": "1c1464d6a8f85d78202f67293ee7ac42",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block1.intersect(block2)"
+ "text": "block.shift(dx, dy)"
},
{
"type": "Title",
- "element_id": "1c1464d6a8f85d78202f67293ee7ac42",
+ "element_id": "39fca1b21a889218bd84127a4d7f27c5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block.shift(dx, dy)"
+ "text": "block1.intersect(block2)"
},
{
"type": "Title",
- "element_id": "acfa5090fbb8986000a92d84d41d8140",
+ "element_id": "aac9bbf1c375a005651b5d2929778d3b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block1.is in(block2)"
+ "text": "block1.relative to(block2)"
},
{
"type": "Title",
- "element_id": "8dcb74f5ee2eabd0d8e966d46bcdf3be",
+ "element_id": "2092f29df87c3cfd32244b325faaba33",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block.scale(fx, fy)"
+ "text": "block1.condition on(block2)"
},
{
- "type": "NarrativeText",
- "element_id": "f60c4482bfe6a1b0eb9095bb8cf21e64",
+ "type": "Table",
+ "element_id": "f81d4915b54758e0d4d52af3566bb813",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
- "page_number": 8
+ "page_number": 8,
+ "text_as_html": "Operation Name | | | Description | block.pad(top, bottom, | right, | left) | | Enlarge the current block according to the input |
block.scale(fx, fy) | | | Scale the current block given the ratio ; in x and y direction |
. block.shift(dx, dy) | | | Move the current block with the shift : : a distances in x and y direction |
block1.is_in(block2) | | | Whether block] is inside of block2 |
. block1. intersect (block2) | | | Return the intersection region of block1 and block2. . . . Coordinate type to be determined based on the inputs. |
. block1.union(block2) | | | Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs. |
. block1.relative_to(block2) | | | Convert the absolute coordinates of block to ' ' relative coordinates to block2 |
. block1.condition_on(block2) | | | Calculate the absolute coordinates of blockl given . the canvas block2’s absolute coordinates |
block. crop_image (image) | | | Obtain the image segments in the block region |
"
},
- "text": "block.pad(top, bottom, right, left) Enlarge the current block according to the input"
+ "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. crop_image (image) Obtain the image segments in the block region"
},
{
- "type": "Title",
- "element_id": "abf4059c5c98ff5bbd0dde9f8c2b7c75",
+ "type": "NarrativeText",
+ "element_id": "f60c4482bfe6a1b0eb9095bb8cf21e64",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Operation Name"
+ "text": "block.pad(top, bottom, right, left) Enlarge the current block according to the input"
},
{
"type": "Title",
- "element_id": "7d52bf6c2abc8aebeda26c2400f00ddd",
+ "element_id": "526e0087cc3f254d9f86f6c7d8e23d95",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "block.crop image(image)"
+ "text": "Description"
},
{
"type": "NarrativeText",
@@ -952,84 +952,84 @@
"text": "Whether block1 is inside of block2"
},
{
- "type": "Title",
- "element_id": "fdf3d6c91387c02a0cdaa1ff6b3c67c5",
+ "type": "UncategorizedText",
+ "element_id": "a270fb0a45b9ed73f992f73dbf0b9a3f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Obtain the image segments in the block region"
+ "text": "Move the current block with the shift distances in x and y direction"
},
{
"type": "NarrativeText",
- "element_id": "401c342fc214105b4a45dba74c62cae0",
+ "element_id": "494d23eb529015f662df16e6da39f810",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs."
+ "text": "Scale the current block given the ratio in x and y direction"
},
{
"type": "NarrativeText",
- "element_id": "494d23eb529015f662df16e6da39f810",
+ "element_id": "d3b069f9dcc24bfac92a6de9e26f2501",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Scale the current block given the ratio in x and y direction"
+ "text": "Convert the absolute coordinates of block1 to relative coordinates to block2"
},
{
"type": "NarrativeText",
- "element_id": "ec0a5482fa70f4d98212b6b3a748003a",
+ "element_id": "bb15ecc186d598c93a1cffa30e9e1b6e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Return the union region of block1 and block2. Coordinate type to be determined based on the inputs."
+ "text": "Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates"
},
{
"type": "NarrativeText",
- "element_id": "d3b069f9dcc24bfac92a6de9e26f2501",
+ "element_id": "401c342fc214105b4a45dba74c62cae0",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Convert the absolute coordinates of block1 to relative coordinates to block2"
+ "text": "Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs."
},
{
- "type": "Title",
- "element_id": "526e0087cc3f254d9f86f6c7d8e23d95",
+ "type": "NarrativeText",
+ "element_id": "ec0a5482fa70f4d98212b6b3a748003a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Description"
+ "text": "Return the union region of block1 and block2. Coordinate type to be determined based on the inputs."
},
{
- "type": "NarrativeText",
- "element_id": "bb15ecc186d598c93a1cffa30e9e1b6e",
+ "type": "Title",
+ "element_id": "7d52bf6c2abc8aebeda26c2400f00ddd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates"
+ "text": "block.crop image(image)"
},
{
- "type": "UncategorizedText",
- "element_id": "a270fb0a45b9ed73f992f73dbf0b9a3f",
+ "type": "Title",
+ "element_id": "fdf3d6c91387c02a0cdaa1ff6b3c67c5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Move the current block with the shift distances in x and y direction"
+ "text": "Obtain the image segments in the block region"
},
{
"type": "Title",
@@ -1152,34 +1152,34 @@
"text": "After the training dataset is curated, LayoutParser supports different modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are significantly different and a large training set is available. However, as suggested in Studer et al.’s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally different domains, can still boost model performance. Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets."
},
{
- "type": "FigureCaption",
- "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b",
+ "type": "UncategorizedText",
+ "element_id": "4a44dc15364204a80fe80e9039455cc1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 10
},
- "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance"
+ "text": "10"
},
{
- "type": "UncategorizedText",
- "element_id": "4a44dc15364204a80fe80e9039455cc1",
+ "type": "NarrativeText",
+ "element_id": "3993b330c2b3b86513c3edbcd33afc91",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 10
},
- "text": "10"
+ "text": "Z. Shen et al."
},
{
- "type": "NarrativeText",
- "element_id": "3993b330c2b3b86513c3edbcd33afc91",
+ "type": "FigureCaption",
+ "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 10
},
- "text": "Z. Shen et al."
+ "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance"
},
{
"type": "NarrativeText",
@@ -1363,23 +1363,23 @@
},
{
"type": "NarrativeText",
- "element_id": "9b51c55d2dd4ffd289138fc4f66e11e6",
+ "element_id": "164904dc2ff256763b3e64f1b56a784e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 12
},
- "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identified and rectified via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model."
+ "text": "To decipher the complicated layout"
},
{
"type": "NarrativeText",
- "element_id": "164904dc2ff256763b3e64f1b56a784e",
+ "element_id": "9b51c55d2dd4ffd289138fc4f66e11e6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 12
},
- "text": "To decipher the complicated layout"
+ "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identified and rectified via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model."
},
{
"type": "NarrativeText",
@@ -1393,33 +1393,33 @@
},
{
"type": "NarrativeText",
- "element_id": "069379b2abcf2bed44f13bdaea90ec2d",
+ "element_id": "07be9fda679b805e67cf5e563eada033",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 12
},
- "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR."
+ "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set."
},
{
"type": "NarrativeText",
- "element_id": "d11adbfd88959ce24fbfdc7f8155e777",
+ "element_id": "069379b2abcf2bed44f13bdaea90ec2d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 12
},
- "text": "16 This measures the overlap between the detected and ground-truth characters, and"
+ "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR."
},
{
"type": "NarrativeText",
- "element_id": "07be9fda679b805e67cf5e563eada033",
+ "element_id": "d11adbfd88959ce24fbfdc7f8155e777",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 12
},
- "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set."
+ "text": "16 This measures the overlap between the detected and ground-truth characters, and"
},
{
"type": "NarrativeText",
@@ -1643,33 +1643,33 @@
},
{
"type": "NarrativeText",
- "element_id": "ad1bf75fc53d123c878f8254f9304c9f",
+ "element_id": "44c5093519506610b07942b24d966d77",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 14
},
- "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)"
+ "text": "Hierarchical Image Database. In: CVPR09 (2009)"
},
{
"type": "NarrativeText",
- "element_id": "c6e835fe03323406543926cc0f5a94de",
+ "element_id": "ad1bf75fc53d123c878f8254f9304c9f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 14
},
- "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)"
+ "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)"
},
{
"type": "NarrativeText",
- "element_id": "44c5093519506610b07942b24d966d77",
+ "element_id": "c6e835fe03323406543926cc0f5a94de",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 14
},
- "text": "Hierarchical Image Database. In: CVPR09 (2009)"
+ "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)"
},
{
"type": "Title",
@@ -1692,164 +1692,164 @@
"text": "15"
},
{
- "type": "Title",
- "element_id": "9b9688203e9cdea89ded788342be4032",
+ "type": "UncategorizedText",
+ "element_id": "16390873ae6b6a173fc894a873bab022",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J."
+ "text": "[9]"
},
{
"type": "NarrativeText",
- "element_id": "62b12089ccbd0d2dd2f6c292cfa6a6fb",
+ "element_id": "068bf90a7743f50c4a00d4827035e42f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)"
+ "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the"
},
{
"type": "NarrativeText",
- "element_id": "890eb2d0b6b7dbf00a5e0a4ad2f82107",
+ "element_id": "813cac1316043d454f3c928740435736",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)"
+ "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)"
},
{
"type": "NarrativeText",
- "element_id": "be647bda3f1ca1b63554ef22d1313a43",
+ "element_id": "2f103adde52e35a8853cbb476720a6ef",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)"
+ "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)"
},
{
- "type": "NarrativeText",
- "element_id": "09cfad31b28b1315b0bc7bd219136057",
+ "type": "Title",
+ "element_id": "4d54eb351d8fc3bfbbf7286aa15eabe3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767"
+ "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)"
},
{
"type": "NarrativeText",
- "element_id": "80498c312fd32cb744e5953dfef18604",
+ "element_id": "124b6b55da69fccc1c06568bda34f63c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143"
+ "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)"
},
{
- "type": "NarrativeText",
- "element_id": "3e0b97d540b7b43ad61292a89a58137f",
+ "type": "Title",
+ "element_id": "9b9688203e9cdea89ded788342be4032",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)"
+ "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J."
},
{
- "type": "NarrativeText",
- "element_id": "f7cfa7ca2e7175d8bdba9c0cb26a7c98",
+ "type": "UncategorizedText",
+ "element_id": "e90f44c0e10f9acb4d8f4c5895846d1e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)"
+ "text": "2007(159), 2 (Jul 2007)"
},
{
"type": "NarrativeText",
- "element_id": "aae12b8f70e03a3e35015ebda5974ebe",
+ "element_id": "3e0b97d540b7b43ad61292a89a58137f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)"
+ "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)"
},
{
"type": "NarrativeText",
- "element_id": "068bf90a7743f50c4a00d4827035e42f",
+ "element_id": "80498c312fd32cb744e5953dfef18604",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the"
+ "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143"
},
{
"type": "NarrativeText",
- "element_id": "813cac1316043d454f3c928740435736",
+ "element_id": "09cfad31b28b1315b0bc7bd219136057",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)"
+ "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767"
},
{
"type": "NarrativeText",
- "element_id": "124b6b55da69fccc1c06568bda34f63c",
+ "element_id": "be647bda3f1ca1b63554ef22d1313a43",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)"
+ "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)"
},
{
- "type": "UncategorizedText",
- "element_id": "16390873ae6b6a173fc894a873bab022",
+ "type": "NarrativeText",
+ "element_id": "890eb2d0b6b7dbf00a5e0a4ad2f82107",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[9]"
+ "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)"
},
{
"type": "NarrativeText",
- "element_id": "2f103adde52e35a8853cbb476720a6ef",
+ "element_id": "62b12089ccbd0d2dd2f6c292cfa6a6fb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)"
+ "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)"
},
{
- "type": "UncategorizedText",
- "element_id": "e90f44c0e10f9acb4d8f4c5895846d1e",
+ "type": "NarrativeText",
+ "element_id": "f7cfa7ca2e7175d8bdba9c0cb26a7c98",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "2007(159), 2 (Jul 2007)"
+ "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)"
},
{
- "type": "Title",
- "element_id": "4d54eb351d8fc3bfbbf7286aa15eabe3",
+ "type": "NarrativeText",
+ "element_id": "aae12b8f70e03a3e35015ebda5974ebe",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 15
},
- "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)"
+ "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)"
},
{
"type": "UncategorizedText",
@@ -1862,84 +1862,94 @@
"text": "16"
},
{
- "type": "Title",
- "element_id": "21d399ba787aabbf69a8ca861cbcc4a3",
+ "type": "NarrativeText",
+ "element_id": "3993b330c2b3b86513c3edbcd33afc91",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:"
+ "text": "Z. Shen et al."
},
{
"type": "NarrativeText",
- "element_id": "219033258f3fff3de33bed379610c8f3",
+ "element_id": "1abcfa28cce9b0f5194dec0d534f28e5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)"
+ "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)"
},
{
"type": "NarrativeText",
- "element_id": "285ce5849d6fd9036e5d16724c024ab9",
+ "element_id": "f7c67eae65521c3a753337d08c5a7cc3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572–573 (2020)"
+ "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91–99 (2015)"
},
{
"type": "NarrativeText",
- "element_id": "1abcfa28cce9b0f5194dec0d534f28e5",
+ "element_id": "4f43b2e563a35ae0208a8626f7e3280e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)"
+ "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)"
},
{
- "type": "NarrativeText",
- "element_id": "f7c67eae65521c3a753337d08c5a7cc3",
+ "type": "UncategorizedText",
+ "element_id": "b66713d3f2d1689f9174e1cb87429eed",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91–99 (2015)"
+ "text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning"
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "10a3ff59f6157f21733e659a41031f83",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 16
+ },
+ "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of"
},
{
"type": "NarrativeText",
- "element_id": "a18dcb504d62cb9f8ed4641014b6eeb2",
+ "element_id": "219033258f3fff3de33bed379610c8f3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)"
+ "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)"
},
{
"type": "NarrativeText",
- "element_id": "4f43b2e563a35ae0208a8626f7e3280e",
+ "element_id": "285ce5849d6fd9036e5d16724c024ab9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)"
+ "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572–573 (2020)"
},
{
- "type": "UncategorizedText",
- "element_id": "b66713d3f2d1689f9174e1cb87429eed",
+ "type": "NarrativeText",
+ "element_id": "a18dcb504d62cb9f8ed4641014b6eeb2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning"
+ "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)"
},
{
"type": "NarrativeText",
@@ -1972,24 +1982,24 @@
"text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)"
},
{
- "type": "UncategorizedText",
- "element_id": "10a3ff59f6157f21733e659a41031f83",
+ "type": "Title",
+ "element_id": "93d261a89a8422fb8d166e6cdf95d8f6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of"
+ "text": "github.com/facebookresearch/detectron2 (2019)"
},
{
- "type": "Title",
- "element_id": "462753569cb801c6f858759742a93793",
+ "type": "NarrativeText",
+ "element_id": "9dce913bddaa63724f5de64e539b7016",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166"
+ "text": "based layout annotation. arXiv preprint arXiv:2010.01762 (2020)"
},
{
"type": "Title",
@@ -2001,35 +2011,25 @@
},
"text": "text and layout for document image understanding (2019)"
},
- {
- "type": "NarrativeText",
- "element_id": "9dce913bddaa63724f5de64e539b7016",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 16
- },
- "text": "based layout annotation. arXiv preprint arXiv:2010.01762 (2020)"
- },
{
"type": "Title",
- "element_id": "93d261a89a8422fb8d166e6cdf95d8f6",
+ "element_id": "21d399ba787aabbf69a8ca861cbcc4a3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "github.com/facebookresearch/detectron2 (2019)"
+ "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:"
},
{
- "type": "NarrativeText",
- "element_id": "3993b330c2b3b86513c3edbcd33afc91",
+ "type": "Title",
+ "element_id": "462753569cb801c6f858759742a93793",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 16
},
- "text": "Z. Shen et al."
+ "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166"
},
{
"type": "Title",
diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json
index 0f1f15711f..06c384a72c 100644
--- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json
@@ -195,25 +195,25 @@
},
{
"type": "Title",
- "element_id": "b27e559f6c00d2bde61efba5db252e31",
+ "element_id": "1064dcef42380cfdb90c668aa3a670a3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1,
"links": []
},
- "text": "Materials engineering"
+ "text": "Table and figure"
},
{
"type": "Title",
- "element_id": "1064dcef42380cfdb90c668aa3a670a3",
+ "element_id": "b27e559f6c00d2bde61efba5db252e31",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1,
"links": []
},
- "text": "Table and figure"
+ "text": "Materials engineering"
},
{
"type": "Title",
@@ -424,6 +424,17 @@
},
"text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1–3 respectively. It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule"
},
+ {
+ "type": "UncategorizedText",
+ "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 2,
+ "links": []
+ },
+ "text": "30"
+ },
{
"type": "Title",
"element_id": "e28e0dc941accc8694040c63091b580c",
@@ -490,17 +501,6 @@
},
"text": "i"
},
- {
- "type": "UncategorizedText",
- "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 2,
- "links": []
- },
- "text": "30"
- },
{
"type": "UncategorizedText",
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json
index bf9e4bf189..abcea312b5 100644
--- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json
@@ -265,37 +265,37 @@
"text": "Specifications table"
},
{
- "type": "NarrativeText",
- "element_id": "5c3978ebc42ea4f11240c221ac3be1cf",
+ "type": "Title",
+ "element_id": "41e0fa358cefcadbb2633ec45ff2d129",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2,
"links": []
},
- "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired"
+ "text": "Data format Experimental factors"
},
{
"type": "Title",
- "element_id": "41e0fa358cefcadbb2633ec45ff2d129",
+ "element_id": "27d70c97431a2bec06d0a89368489dfb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2,
"links": []
},
- "text": "Data format Experimental factors"
+ "text": "Experimental features Data source location Data accessibility Related research article"
},
{
- "type": "Title",
- "element_id": "27d70c97431a2bec06d0a89368489dfb",
+ "type": "NarrativeText",
+ "element_id": "5c3978ebc42ea4f11240c221ac3be1cf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2,
"links": []
},
- "text": "Experimental features Data source location Data accessibility Related research article"
+ "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired"
},
{
"type": "NarrativeText",
@@ -336,17 +336,6 @@
},
"text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the"
},
- {
- "type": "NarrativeText",
- "element_id": "7c8bc2811f71480b433eb6fee2a3bb33",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 2,
- "links": []
- },
- "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing"
- },
{
"type": "Title",
"element_id": "bd7d750cb9f652c80c17a264072b8858",
@@ -360,14 +349,14 @@
},
{
"type": "NarrativeText",
- "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb",
+ "element_id": "7c8bc2811f71480b433eb6fee2a3bb33",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2,
"links": []
},
- "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can"
+ "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing"
},
{
"type": "Title",
@@ -382,14 +371,14 @@
},
{
"type": "NarrativeText",
- "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f",
+ "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2,
"links": []
},
- "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes."
+ "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can"
},
{
"type": "NarrativeText",
@@ -402,6 +391,17 @@
},
"text": "be used for the comparison."
},
+ {
+ "type": "NarrativeText",
+ "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 2,
+ "links": []
+ },
+ "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes."
+ },
{
"type": "ListItem",
"element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa",
@@ -706,26 +706,26 @@
"text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule."
},
{
- "type": "NarrativeText",
- "element_id": "928fa0dcad70f173bc989ee5715375c5",
+ "type": "Title",
+ "element_id": "252f10c83610ebca1a059c0bae8255eb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l"
+ "text": "f"
},
{
- "type": "Title",
- "element_id": "252f10c83610ebca1a059c0bae8255eb",
+ "type": "NarrativeText",
+ "element_id": "928fa0dcad70f173bc989ee5715375c5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "f"
+ "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l"
},
{
"type": "UncategorizedText",
@@ -1152,23 +1152,6 @@
},
"text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling"
},
- {
- "type": "NarrativeText",
- "element_id": "19dee0a4e8fd073350e234b4352b8af6",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 4,
- "links": [
- {
- "text": "N . Kliewer , T . Mellouli , L . Suhl , Atime – spacenetworkbasedexactoptimizationmodelformulti - depotbusscheduling , Eur",
- "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2",
- "start_index": 4
- }
- ]
- },
- "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur."
- },
{
"type": "UncategorizedText",
"element_id": "bec40b25a277a08de3415e33284fc76d",
@@ -1191,6 +1174,23 @@
},
"text": "problem, Networks 19 (5) (1989) 531–548."
},
+ {
+ "type": "NarrativeText",
+ "element_id": "19dee0a4e8fd073350e234b4352b8af6",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 4,
+ "links": [
+ {
+ "text": "N . Kliewer , T . Mellouli , L . Suhl , Atime – spacenetworkbasedexactoptimizationmodelformulti - depotbusscheduling , Eur",
+ "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2",
+ "start_index": 4
+ }
+ ]
+ },
+ "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur."
+ },
{
"type": "UncategorizedText",
"element_id": "5f5ca82752a3220998c06ea0c44eb80e",
diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
index 5844d4e791..29af05f0b2 100644
--- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json
@@ -67,36 +67,36 @@
},
{
"type": "UncategorizedText",
- "element_id": "e97f1cf1c49f397732e68cf1efb2355e",
+ "element_id": "5ce0f6dc16582eaf81312c412e99ebb9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1,
"links": []
},
- "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy"
+ "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford"
},
{
- "type": "NarrativeText",
- "element_id": "1252f8d8921acac5f706e4402e504a75",
+ "type": "UncategorizedText",
+ "element_id": "e97f1cf1c49f397732e68cf1efb2355e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1,
"links": []
},
- "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC."
+ "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy"
},
{
- "type": "UncategorizedText",
- "element_id": "5ce0f6dc16582eaf81312c412e99ebb9",
+ "type": "NarrativeText",
+ "element_id": "1252f8d8921acac5f706e4402e504a75",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1,
"links": []
},
- "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford"
+ "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC."
},
{
"type": "NarrativeText",
diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
index 8a2764011f..b9c9aa49b9 100644
--- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
@@ -132,81 +132,81 @@
"text": "Monetary policy starts to bite. Signs are apparent that monetary policy tightening is starting to cool demand and inflation, but the full impact is unlikely to be realized before 2024. Global headline inflation appears to have peaked in the third quarter of 2022 (Figure 1). Prices of fuel and nonfuel commodities have declined, lowering headline inflation, notably in the United States, the euro area, and Latin America. But underlying (core) inflation has not yet peaked in most economies and remains well above pre-pandemic levels. It has persisted amid second-round effects from earlier cost shocks and tight labor markets with robust wage growth as consumer demand has remained resilient. Medium-term inflation expectations generally remain anchored, but some gauges are up. These developments have caused central banks to raise rates faster than expected, especially in the United States and the euro area, and to signal that rates will stay elevated for longer. Core inflation is declining in some economies that have completed their tightening cycle—such as Brazil. Financial markets are displaying high sensitivity to inflation news, with equity markets rising following recent releases of lower inflation data in anticipation of interest rate cuts (Box 1), despite central banks’ communicating their resolve to tighten policy further. With the peak in US headline inflation and an acceleration in rate hikes by several non-US central banks, the dollar has weakened since September but remains significantly stronger than a year ago."
},
{
- "type": "Title",
- "element_id": "0cce65035ca66e9be782c845ddd606e2",
+ "type": "UncategorizedText",
+ "element_id": "808caaef5b114d874a25b7fec21b5516",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "Figure 1. Twin Peaks? Headline and Core Inflation (Percent, year over year)"
+ "text": "18 16 14 12 10 8 6 4 2 0 –2"
},
{
"type": "UncategorizedText",
- "element_id": "808caaef5b114d874a25b7fec21b5516",
+ "element_id": "28a5aa3897d66de6c31caba99a4c337e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "18 16 14 12 10 8 6 4 2 0 –2"
+ "text": "–2"
},
{
"type": "UncategorizedText",
- "element_id": "28a5aa3897d66de6c31caba99a4c337e",
+ "element_id": "c2c7be4534a60790d1d18451c91dc138",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "–2"
+ "text": "16 14 12 10 8 6 4 2 0"
},
{
- "type": "NarrativeText",
- "element_id": "e26dceaba57a5f670d91ac170e8706d1",
+ "type": "UncategorizedText",
+ "element_id": "c7c72889cb49cf43d9bd1f892db1be2c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "Sources: Haver Analytics; and IMF staff calculations. Note: The figure shows the developments in headline and core inflation across 18 advanced economies and 17 emerging market and developing economies. Core inflation is the change in prices for goods and services, but excluding those for food and energy (or the closest available measure). For the euro area (and other European countries for which the data are available), energy, food, alcohol, and tobacco are excluded. The gray bands depict the 10th to 90th percentiles of inflation across economies."
+ "text": "Jan. 2019"
},
{
"type": "UncategorizedText",
- "element_id": "c2c7be4534a60790d1d18451c91dc138",
+ "element_id": "c7c72889cb49cf43d9bd1f892db1be2c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "16 14 12 10 8 6 4 2 0"
+ "text": "Jan. 2019"
},
{
- "type": "UncategorizedText",
- "element_id": "c7c72889cb49cf43d9bd1f892db1be2c",
+ "type": "Title",
+ "element_id": "0cce65035ca66e9be782c845ddd606e2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "Jan. 2019"
+ "text": "Figure 1. Twin Peaks? Headline and Core Inflation (Percent, year over year)"
},
{
- "type": "UncategorizedText",
- "element_id": "c7c72889cb49cf43d9bd1f892db1be2c",
+ "type": "NarrativeText",
+ "element_id": "e26dceaba57a5f670d91ac170e8706d1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3,
"links": []
},
- "text": "Jan. 2019"
+ "text": "Sources: Haver Analytics; and IMF staff calculations. Note: The figure shows the developments in headline and core inflation across 18 advanced economies and 17 emerging market and developing economies. Core inflation is the change in prices for goods and services, but excluding those for food and energy (or the closest available measure). For the euro area (and other European countries for which the data are available), energy, food, alcohol, and tobacco are excluded. The gray bands depict the 10th to 90th percentiles of inflation across economies."
},
{
"type": "ListItem",
@@ -595,25 +595,25 @@
},
{
"type": "ListItem",
- "element_id": "afde979c99a73646915fe253c85c5a9c",
+ "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"links": []
},
- "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in"
+ "text": ""
},
{
"type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+ "element_id": "afde979c99a73646915fe253c85c5a9c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"links": []
},
- "text": ""
+ "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in"
},
{
"type": "NarrativeText",
@@ -628,25 +628,25 @@
},
{
"type": "ListItem",
- "element_id": "25e2f1dc031b5421b8a234945098e58b",
+ "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6,
"links": []
},
- "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints."
+ "text": ""
},
{
"type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+ "element_id": "25e2f1dc031b5421b8a234945098e58b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6,
"links": []
},
- "text": ""
+ "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints."
},
{
"type": "NarrativeText",
@@ -1123,25 +1123,25 @@
},
{
"type": "Title",
- "element_id": "24af2841400373443d80b6c91180918b",
+ "element_id": "e30a554d7d1cbf308651f8c267ad6872",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 7,
"links": []
},
- "text": "Middle East and Central Asia"
+ "text": "Brazil Mexico"
},
{
"type": "Title",
- "element_id": "e30a554d7d1cbf308651f8c267ad6872",
+ "element_id": "24af2841400373443d80b6c91180918b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 7,
"links": []
},
- "text": "Brazil Mexico"
+ "text": "Middle East and Central Asia"
},
{
"type": "Title",
@@ -1794,25 +1794,25 @@
},
{
"type": "ListItem",
- "element_id": "2d14934d52ff357c52e9ae1c38f7390e",
+ "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy."
+ "text": ""
},
{
"type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+ "element_id": "2d14934d52ff357c52e9ae1c38f7390e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": ""
+ "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy."
},
{
"type": "ListItem",
@@ -2012,17 +2012,6 @@
},
"text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies."
},
- {
- "type": "NarrativeText",
- "element_id": "e118be83abfed92b8969eca98bb4d53b",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 11,
- "links": []
- },
- "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives."
- },
{
"type": "Title",
"element_id": "57de33ba9eaa9e5980d4cf6da83abf46",
@@ -2288,180 +2277,191 @@
"text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report."
},
{
- "type": "Title",
- "element_id": "6ef230728534d871e5126e2a55e12b26",
+ "type": "NarrativeText",
+ "element_id": "e118be83abfed92b8969eca98bb4d53b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)"
+ "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives."
},
{
- "type": "Title",
- "element_id": "8730d3c2022abf1f9665e4ca1da43e4d",
+ "type": "UncategorizedText",
+ "element_id": "e7f6c011776e8db7cd330b54174fd76f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "Latest"
+ "text": "6"
},
{
- "type": "Title",
- "element_id": "53d79cec96694df67ce3baff95d8a2e3",
+ "type": "UncategorizedText",
+ "element_id": "ef2d127de37b942baad06145e54b0c61",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "October 2022 GFSR"
+ "text": "5"
},
{
"type": "UncategorizedText",
- "element_id": "e7f6c011776e8db7cd330b54174fd76f",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "6"
+ "text": "4"
},
{
- "type": "ListItem",
- "element_id": "7d4f55875c970d850a152ba1d5ba02a5",
+ "type": "UncategorizedText",
+ "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "1. United States"
+ "text": "3"
},
{
- "type": "ListItem",
- "element_id": "8e655408cf212df5f74df13e05cdf02c",
+ "type": "UncategorizedText",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "2. Euro area"
+ "text": "2"
},
{
"type": "UncategorizedText",
- "element_id": "ef2d127de37b942baad06145e54b0c61",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "5"
+ "text": "1"
},
{
- "type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "type": "Title",
+ "element_id": "6ef230728534d871e5126e2a55e12b26",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "4"
+ "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)"
},
{
- "type": "UncategorizedText",
- "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
+ "type": "Title",
+ "element_id": "8730d3c2022abf1f9665e4ca1da43e4d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "3"
+ "text": "Latest"
},
{
- "type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "type": "Title",
+ "element_id": "53d79cec96694df67ce3baff95d8a2e3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "2"
+ "text": "October 2022 GFSR"
},
{
- "type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "type": "ListItem",
+ "element_id": "7d4f55875c970d850a152ba1d5ba02a5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "1"
+ "text": "1. United States"
},
{
- "type": "Title",
- "element_id": "49cf8421218222b21a0fc54ffce584c9",
+ "type": "ListItem",
+ "element_id": "8e655408cf212df5f74df13e05cdf02c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "Oct. 22"
+ "text": "2. Euro area"
},
{
- "type": "Title",
- "element_id": "24a234895630131d612fc1b4605a256e",
+ "type": "UncategorizedText",
+ "element_id": "ef2d127de37b942baad06145e54b0c61",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "Apr. 23"
+ "text": "5"
},
{
- "type": "Title",
- "element_id": "914e31edcbd035dbe9f1cfb7b29089a9",
+ "type": "UncategorizedText",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "Oct. 23"
+ "text": "4"
},
{
- "type": "Title",
- "element_id": "d8478f45b9790d52201238244d0e9698",
+ "type": "UncategorizedText",
+ "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "Dec. 24"
+ "text": "3"
},
{
- "type": "Title",
- "element_id": "fe1cc1c654c8a4fde402cfe2426326ef",
+ "type": "UncategorizedText",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "Dec. 26"
+ "text": "2"
+ },
+ {
+ "type": "UncategorizedText",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 11,
+ "links": []
+ },
+ "text": "1"
},
{
"type": "Title",
@@ -2519,59 +2519,59 @@
"text": "Dec. 26"
},
{
- "type": "UncategorizedText",
- "element_id": "ef2d127de37b942baad06145e54b0c61",
+ "type": "Title",
+ "element_id": "49cf8421218222b21a0fc54ffce584c9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "5"
+ "text": "Oct. 22"
},
{
- "type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "type": "Title",
+ "element_id": "24a234895630131d612fc1b4605a256e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "4"
+ "text": "Apr. 23"
},
{
- "type": "UncategorizedText",
- "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
+ "type": "Title",
+ "element_id": "914e31edcbd035dbe9f1cfb7b29089a9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "3"
+ "text": "Oct. 23"
},
{
- "type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "type": "Title",
+ "element_id": "d8478f45b9790d52201238244d0e9698",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "2"
+ "text": "Dec. 24"
},
{
- "type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "type": "Title",
+ "element_id": "fe1cc1c654c8a4fde402cfe2426326ef",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11,
"links": []
},
- "text": "1"
+ "text": "Dec. 26"
},
{
"type": "NarrativeText",
diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json
index 18c9c5ac9e..2751529948 100644
--- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json
@@ -1079,179 +1079,179 @@
},
{
"type": "Title",
- "element_id": "f83714d89302473e0e4f5399bd50e7a9",
+ "element_id": "3f79bb7b435b05321651daefd374cdc6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "W T"
+ "text": "e"
},
{
- "type": "Title",
- "element_id": "3f79bb7b435b05321651daefd374cdc6",
+ "type": "UncategorizedText",
+ "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "e"
+ "text": "120"
},
{
- "type": "NarrativeText",
- "element_id": "f9bb49945b60897227abdd75b5f8d39b",
+ "type": "UncategorizedText",
+ "element_id": "ad57366865126e55649ecb23ae1d4888",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "r e p s e i t i l"
+ "text": "100"
},
{
- "type": "Title",
- "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
+ "type": "UncategorizedText",
+ "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "a t a F"
+ "text": "120"
},
{
"type": "UncategorizedText",
- "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
+ "element_id": "b725d20650649a5221675144bab5946e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "120"
+ "text": "99.5"
},
{
- "type": "UncategorizedText",
- "element_id": "ad57366865126e55649ecb23ae1d4888",
+ "type": "Title",
+ "element_id": "f83714d89302473e0e4f5399bd50e7a9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "100"
+ "text": "W T"
},
{
- "type": "UncategorizedText",
- "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
+ "type": "NarrativeText",
+ "element_id": "f9bb49945b60897227abdd75b5f8d39b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "80"
+ "text": "r e p s e i t i l"
},
{
- "type": "UncategorizedText",
- "element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
+ "type": "Title",
+ "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "60"
+ "text": "a t a F"
},
{
"type": "UncategorizedText",
- "element_id": "d59eced1ded07f84c145592f65bdf854",
+ "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "40"
+ "text": "80"
},
{
"type": "UncategorizedText",
- "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
+ "element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "20"
+ "text": "60"
},
{
"type": "UncategorizedText",
- "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "element_id": "d59eced1ded07f84c145592f65bdf854",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "0"
+ "text": "40"
},
{
"type": "UncategorizedText",
- "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
+ "element_id": "ce3201efc2e495241a85e4fc84575f50",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "120"
+ "text": "71.9"
},
{
- "type": "Title",
- "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9",
+ "type": "UncategorizedText",
+ "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "C oal"
+ "text": "20"
},
{
"type": "UncategorizedText",
- "element_id": "b725d20650649a5221675144bab5946e",
+ "element_id": "5feceb66ffc86f38d952786c6d696c79",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "99.5"
+ "text": "0"
},
{
"type": "Title",
- "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
+ "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "Oil"
+ "text": "C oal"
},
{
- "type": "UncategorizedText",
- "element_id": "ce3201efc2e495241a85e4fc84575f50",
+ "type": "Title",
+ "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"links": []
},
- "text": "71.9"
+ "text": "Oil"
},
{
"type": "Title",
@@ -1694,59 +1694,59 @@
"text": "ren. & waste"
},
{
- "type": "Title",
- "element_id": "563a2980d46c81119e1d7d952b375a41",
+ "type": "UncategorizedText",
+ "element_id": "26d228663f13a88592a12d16cf9587ca",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9,
"links": []
},
- "text": "h W T"
+ "text": "400"
},
{
- "type": "UncategorizedText",
- "element_id": "26d228663f13a88592a12d16cf9587ca",
+ "type": "Title",
+ "element_id": "f35457739b3bd74c61625c986c844726",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9,
"links": []
},
- "text": "400"
+ "text": " Nuclear"
},
{
- "type": "UncategorizedText",
- "element_id": "983bd614bb5afece5ab3b6023f71147c",
+ "type": "Title",
+ "element_id": "f6e172956a9472fa43f9a895f99c2836",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9,
"links": []
},
- "text": "300"
+ "text": " Natural gas"
},
{
"type": "Title",
- "element_id": "f35457739b3bd74c61625c986c844726",
+ "element_id": "563a2980d46c81119e1d7d952b375a41",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9,
"links": []
},
- "text": " Nuclear"
+ "text": "h W T"
},
{
- "type": "Title",
- "element_id": "f6e172956a9472fa43f9a895f99c2836",
+ "type": "UncategorizedText",
+ "element_id": "983bd614bb5afece5ab3b6023f71147c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9,
"links": []
},
- "text": " Natural gas"
+ "text": "300"
},
{
"type": "Title",
diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json
index 2eb819dbb6..a30000b49a 100644
--- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json
@@ -155,36 +155,36 @@
},
{
"type": "Title",
- "element_id": "4d7c9c95f808a09f6b0bcfe8b255e537",
+ "element_id": "d977fff4c69c437aa4a44a5c5f4bf02e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Figure 1. Ordering of perceived risks for 30 activities and technologies1,iii"
+ "text": "Rank Order Laypersons"
},
{
"type": "Title",
- "element_id": "d977fff4c69c437aa4a44a5c5f4bf02e",
+ "element_id": "4d7c9c95f808a09f6b0bcfe8b255e537",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Rank Order Laypersons"
+ "text": "Figure 1. Ordering of perceived risks for 30 activities and technologies1,iii"
},
{
"type": "UncategorizedText",
- "element_id": "4523540f1504cd17100c4835e85b7eef",
+ "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "17"
+ "text": "30"
},
{
"type": "UncategorizedText",
@@ -199,36 +199,36 @@
},
{
"type": "UncategorizedText",
- "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
+ "element_id": "4523540f1504cd17100c4835e85b7eef",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "30"
+ "text": "17"
},
{
"type": "UncategorizedText",
- "element_id": "d1429f8178a04f7fc73a66edf10ab8b5",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": ""
+ "text": "1"
},
{
"type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "element_id": "d1429f8178a04f7fc73a66edf10ab8b5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "4"
+ "text": ""
},
{
"type": "UncategorizedText",
@@ -243,36 +243,36 @@
},
{
"type": "UncategorizedText",
- "element_id": "d1429f8178a04f7fc73a66edf10ab8b5",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": ""
+ "text": "4"
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "d1429f8178a04f7fc73a66edf10ab8b5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "2"
+ "text": ""
},
{
"type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "1"
+ "text": "2"
},
{
"type": "UncategorizedText",
@@ -287,80 +287,80 @@
},
{
"type": "Title",
- "element_id": "1656c455012b016fbac5eac0a38397bd",
+ "element_id": "eda8f72476c539920d2c0e3515ba4b07",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Electric power (non-nuclear)"
+ "text": "Smoking"
},
{
"type": "Title",
- "element_id": "602d25f25cca4ebb709f8b48f54d99d9",
+ "element_id": "2f3122790ccc9e095abe1b5ceedddf88",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Motor vehicles"
+ "text": "X-rays"
},
{
"type": "Title",
- "element_id": "eda8f72476c539920d2c0e3515ba4b07",
+ "element_id": "ed3861e631428b9b77e2bdc0384d2cbe",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Smoking"
+ "text": "Vaccinations"
},
{
"type": "Title",
- "element_id": "2f3122790ccc9e095abe1b5ceedddf88",
+ "element_id": "602d25f25cca4ebb709f8b48f54d99d9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "X-rays"
+ "text": "Motor vehicles"
},
{
"type": "Title",
- "element_id": "ed3861e631428b9b77e2bdc0384d2cbe",
+ "element_id": "82a60569029ed9032f1b08891e8524c2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Vaccinations"
+ "text": "Nuclear power"
},
{
"type": "Title",
- "element_id": "82a60569029ed9032f1b08891e8524c2",
+ "element_id": "f8e3740e358309bd0570d4f3ca141793",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Nuclear power"
+ "text": "Handguns"
},
{
"type": "Title",
- "element_id": "f8e3740e358309bd0570d4f3ca141793",
+ "element_id": "1656c455012b016fbac5eac0a38397bd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "Handguns"
+ "text": "Electric power (non-nuclear)"
},
{
"type": "Title",
@@ -408,25 +408,25 @@
},
{
"type": "UncategorizedText",
- "element_id": "7902699be42c8a8e46fbbb4501726517",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "7"
+ "text": "4"
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "7902699be42c8a8e46fbbb4501726517",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "2"
+ "text": "7"
},
{
"type": "UncategorizedText",
@@ -441,36 +441,36 @@
},
{
"type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "element_id": "19581e27de7ced00ff1ce50b2047e7a5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "4"
+ "text": "9"
},
{
"type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "1"
+ "text": "2"
},
{
"type": "UncategorizedText",
- "element_id": "19581e27de7ced00ff1ce50b2047e7a5",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4,
"links": []
},
- "text": "9"
+ "text": "1"
},
{
"type": "UncategorizedText",
@@ -672,36 +672,36 @@
},
{
"type": "Title",
- "element_id": "f83714d89302473e0e4f5399bd50e7a9",
+ "element_id": "3f79bb7b435b05321651daefd374cdc6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"links": []
},
- "text": "W T"
+ "text": "e"
},
{
- "type": "Title",
- "element_id": "3f79bb7b435b05321651daefd374cdc6",
+ "type": "UncategorizedText",
+ "element_id": "e629fa6598d732768f7c726b4b621285",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"links": []
},
- "text": "e"
+ "text": "15"
},
{
- "type": "UncategorizedText",
- "element_id": "e629fa6598d732768f7c726b4b621285",
+ "type": "Title",
+ "element_id": "f83714d89302473e0e4f5399bd50e7a9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"links": []
},
- "text": "15"
+ "text": "W T"
},
{
"type": "NarrativeText",
@@ -715,26 +715,26 @@
"text": "r e p s e i t i l"
},
{
- "type": "Title",
- "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
+ "type": "UncategorizedText",
+ "element_id": "4a44dc15364204a80fe80e9039455cc1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"links": []
},
- "text": "a t a F"
+ "text": "10"
},
{
- "type": "UncategorizedText",
- "element_id": "4a44dc15364204a80fe80e9039455cc1",
+ "type": "Title",
+ "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"links": []
},
- "text": "10"
+ "text": "a t a F"
},
{
"type": "UncategorizedText",
diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
index 6bd7f4d877..4baf9be5a6 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
@@ -685,7 +685,7 @@
},
{
"type": "Title",
- "element_id": "007b2203e9e86a49c3108e9ffd16fbbc",
+ "element_id": "babfe67b3ecc6b32db9adb9da08274bf",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -699,11 +699,11 @@
"filetype": "application/pdf",
"page_number": 3
},
- "text": "Euro area"
+ "text": "Jan. 22"
},
{
"type": "Title",
- "element_id": "babfe67b3ecc6b32db9adb9da08274bf",
+ "element_id": "007b2203e9e86a49c3108e9ffd16fbbc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -717,7 +717,7 @@
"filetype": "application/pdf",
"page_number": 3
},
- "text": "Jan. 22"
+ "text": "Euro area"
},
{
"type": "Title",
@@ -1279,7 +1279,7 @@
},
{
"type": "ListItem",
- "element_id": "afde979c99a73646915fe253c85c5a9c",
+ "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1293,11 +1293,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in"
+ "text": ""
},
{
"type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+ "element_id": "afde979c99a73646915fe253c85c5a9c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1311,7 +1311,7 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": ""
+ "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in"
},
{
"type": "UncategorizedText",
@@ -1459,7 +1459,7 @@
},
{
"type": "NarrativeText",
- "element_id": "72d289ea524eebcd8f195a8afda1c223",
+ "element_id": "d24af8f44bd419665bb4ab6efef34fed",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1473,11 +1473,11 @@
"filetype": "application/pdf",
"page_number": 6
},
- "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024––above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017–19) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024––still high, but close to the pre-pandemic average."
+ "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024––above pre-pandemic (2017–19) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies."
},
{
"type": "NarrativeText",
- "element_id": "d24af8f44bd419665bb4ab6efef34fed",
+ "element_id": "72d289ea524eebcd8f195a8afda1c223",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1491,7 +1491,7 @@
"filetype": "application/pdf",
"page_number": 6
},
- "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024––above pre-pandemic (2017–19) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies."
+ "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024––above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017–19) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024––still high, but close to the pre-pandemic average."
},
{
"type": "Title",
@@ -1602,8 +1602,8 @@
"text": "Table 1. Overview of the World Economic Outlook Projections (Percent change, unless noted otherwise)"
},
{
- "type": "Table",
- "element_id": "63bdc79def2500227001ac95d78727ab",
+ "type": "Title",
+ "element_id": "d11a1c04bd3a9891350b4bd94104df58",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1617,7 +1617,43 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Difference from October 2022 Q4 over Q4 2/ Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . 
Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices Oil 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2 World Consumer Prices 8/ 47 88 6.6 43 04 0.2 9.2 5.0 3.5 Advanced Economies 9/ 34 73 46 26 0.2 02 78 31 23 Emerging Market and Developing Economies 8/ 5.9 99 84 5.5 0.0 02 10.4 66 45,"
+ "text": "Year over Year"
+ },
+ {
+ "type": "Title",
+ "element_id": "aa22eb2e58c7cf45c528550d68e15c51",
+ "metadata": {
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": 265756457651539296174748931590365722430,
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf"
+ },
+ "date_modified": "2023-02-14T07:31:28"
+ },
+ "filetype": "application/pdf",
+ "page_number": 7
+ },
+ "text": "Difference from October 2022"
+ },
+ {
+ "type": "Title",
+ "element_id": "8c327a62ae0e925498f5c68b819b32b4",
+ "metadata": {
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": 265756457651539296174748931590365722430,
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf"
+ },
+ "date_modified": "2023-02-14T07:31:28"
+ },
+ "filetype": "application/pdf",
+ "page_number": 7
+ },
+ "text": "Q4 over Q4 2/"
},
{
"type": "Title",
@@ -1693,7 +1729,7 @@
},
{
"type": "Title",
- "element_id": "ad1094978303f5aa32665083ee1ed934",
+ "element_id": "b2800ff802361713acee893ebae272f6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1707,11 +1743,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Latin America and the Caribbean"
+ "text": "Saudi Arabia Sub-Saharan Africa"
},
{
"type": "Title",
- "element_id": "24af2841400373443d80b6c91180918b",
+ "element_id": "6185fd66a4e106814e65c047c15dfb1f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1725,11 +1761,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Middle East and Central Asia"
+ "text": "Advanced Economies United States Euro Area"
},
{
"type": "Title",
- "element_id": "b2800ff802361713acee893ebae272f6",
+ "element_id": "24af2841400373443d80b6c91180918b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1743,11 +1779,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Saudi Arabia Sub-Saharan Africa"
+ "text": "Middle East and Central Asia"
},
{
"type": "Title",
- "element_id": "a4ca51cd6c74adf51f6e9ce60165d047",
+ "element_id": "7559320d044a32fbb21a7a8da25e9045",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1761,7 +1797,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Emerging Market and Developing Economies Emerging and Developing Asia"
+ "text": "Japan United Kingdom Canada Other Advanced Economies 3/"
},
{
"type": "Title",
@@ -1783,7 +1819,7 @@
},
{
"type": "Title",
- "element_id": "6185fd66a4e106814e65c047c15dfb1f",
+ "element_id": "ad1094978303f5aa32665083ee1ed934",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1797,7 +1833,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Advanced Economies United States Euro Area"
+ "text": "Latin America and the Caribbean"
},
{
"type": "UncategorizedText",
@@ -1819,7 +1855,7 @@
},
{
"type": "Title",
- "element_id": "7559320d044a32fbb21a7a8da25e9045",
+ "element_id": "a4ca51cd6c74adf51f6e9ce60165d047",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1833,7 +1869,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Japan United Kingdom Canada Other Advanced Economies 3/"
+ "text": "Emerging Market and Developing Economies Emerging and Developing Asia"
},
{
"type": "Title",
@@ -1855,7 +1891,7 @@
},
{
"type": "Title",
- "element_id": "05704f84f4326b5f53a04d62f7ad62fc",
+ "element_id": "e30a554d7d1cbf308651f8c267ad6872",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1869,7 +1905,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Nigeria South Africa"
+ "text": "Brazil Mexico"
},
{
"type": "Title",
@@ -1891,7 +1927,7 @@
},
{
"type": "Title",
- "element_id": "e30a554d7d1cbf308651f8c267ad6872",
+ "element_id": "18231df9f753f2eca887585247231761",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1905,11 +1941,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Brazil Mexico"
+ "text": "Germany France Italy Spain"
},
{
"type": "Title",
- "element_id": "18231df9f753f2eca887585247231761",
+ "element_id": "05704f84f4326b5f53a04d62f7ad62fc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1923,7 +1959,25 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Germany France Italy Spain"
+ "text": "Nigeria South Africa"
+ },
+ {
+ "type": "Table",
+ "element_id": "63bdc79def2500227001ac95d78727ab",
+ "metadata": {
+ "data_source": {
+ "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
+ "version": 265756457651539296174748931590365722430,
+ "record_locator": {
+ "protocol": "s3",
+ "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf"
+ },
+ "date_modified": "2023-02-14T07:31:28"
+ },
+ "filetype": "application/pdf",
+ "page_number": 7
+ },
+ "text": "Difference from October 2022 Q4 over Q4 2/ Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . 
Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices Oil 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2 World Consumer Prices 8/ 47 88 6.6 43 04 0.2 9.2 5.0 3.5 Advanced Economies 9/ 34 73 46 26 0.2 02 78 31 23 Emerging Market and Developing Economies 8/ 5.9 99 84 5.5 0.0 02 10.4 66 45,"
},
{
"type": "UncategorizedText",
@@ -1981,7 +2035,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "9db439c530ed3425c0a68724de199942",
+ "element_id": "a7143daa9de8af6e0c465ca1354d45b6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -1995,11 +2049,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "4.7 3.1 5.9"
+ "text": "6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9"
},
{
"type": "UncategorizedText",
- "element_id": "2a9680555d457b6da4b6748492bb6f3d",
+ "element_id": "69dfc187e2e6d907a0546f7e76f8ee3f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2013,11 +2067,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3"
+ "text": "6.2"
},
{
"type": "UncategorizedText",
- "element_id": "a7143daa9de8af6e0c465ca1354d45b6",
+ "element_id": "2a9680555d457b6da4b6748492bb6f3d",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2031,11 +2085,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9"
+ "text": "5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3"
},
{
"type": "UncategorizedText",
- "element_id": "69dfc187e2e6d907a0546f7e76f8ee3f",
+ "element_id": "dbc6d298b0672b8176de90a623844b7f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2049,11 +2103,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "6.2"
+ "text": "6.0 5.5 3.8 4.1 7.0 4.1"
},
{
"type": "UncategorizedText",
- "element_id": "dbc6d298b0672b8176de90a623844b7f",
+ "element_id": "9db439c530ed3425c0a68724de199942",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2067,7 +2121,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "6.0 5.5 3.8 4.1 7.0 4.1"
+ "text": "4.7 3.1 5.9"
},
{
"type": "Title",
@@ -2125,7 +2179,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "6976f35f9f91b539b46743f37d94014a",
+ "element_id": "b7948d6976e997e76e343161b4b5d864",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2139,11 +2193,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8"
+ "text": "8.8 7.3 9.9"
},
{
"type": "UncategorizedText",
- "element_id": "743f3bc42f087068035515a8dec4f85a",
+ "element_id": "72d73db944cf6d9a5f11d6c073c1dce0",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2157,11 +2211,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "3.1 3.7 5.2 5.4 3.8 4.9"
+ "text": "3.4"
},
{
"type": "UncategorizedText",
- "element_id": "72d73db944cf6d9a5f11d6c073c1dce0",
+ "element_id": "e352203d837b1096ee96e1977f1c3d0b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2175,11 +2229,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "3.4"
+ "text": "5.4 6.6 3.4"
},
{
"type": "UncategorizedText",
- "element_id": "b7948d6976e997e76e343161b4b5d864",
+ "element_id": "743f3bc42f087068035515a8dec4f85a",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2193,11 +2247,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "8.8 7.3 9.9"
+ "text": "3.1 3.7 5.2 5.4 3.8 4.9"
},
{
"type": "UncategorizedText",
- "element_id": "e352203d837b1096ee96e1977f1c3d0b",
+ "element_id": "6976f35f9f91b539b46743f37d94014a",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2211,7 +2265,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "5.4 6.6 3.4"
+ "text": "2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8"
},
{
"type": "UncategorizedText",
@@ -2267,24 +2321,6 @@
},
"text": "1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0"
},
- {
- "type": "UncategorizedText",
- "element_id": "e7ac421147471fe341ae242e7544a44c",
- "metadata": {
- "data_source": {
- "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
- "version": 265756457651539296174748931590365722430,
- "record_locator": {
- "protocol": "s3",
- "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf"
- },
- "date_modified": "2023-02-14T07:31:28"
- },
- "filetype": "application/pdf",
- "page_number": 7
- },
- "text": "6.6 4.6 8.1"
- },
{
"type": "UncategorizedText",
"element_id": "1ea8f3c3db2cb6c75f21ebf26acc28a5",
@@ -2305,7 +2341,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "96ccb4fe1ec705d9944d1c1ecf0938ab",
+ "element_id": "098d858ff74b2740723330ff6e43edf8",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2319,11 +2355,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "2.4 0.7 4.3 3.2 4.0 4.9"
+ "text": "2.4 2.3 2.6"
},
{
"type": "UncategorizedText",
- "element_id": "f491e65f8d4b8dbec7621fcedaf1b7a4",
+ "element_id": "e7ac421147471fe341ae242e7544a44c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2337,11 +2373,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "2.9"
+ "text": "6.6 4.6 8.1"
},
{
"type": "UncategorizedText",
- "element_id": "098d858ff74b2740723330ff6e43edf8",
+ "element_id": "96ccb4fe1ec705d9944d1c1ecf0938ab",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2355,11 +2391,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "2.4 2.3 2.6"
+ "text": "2.4 0.7 4.3 3.2 4.0 4.9"
},
{
- "type": "Title",
- "element_id": "d11a1c04bd3a9891350b4bd94104df58",
+ "type": "UncategorizedText",
+ "element_id": "f491e65f8d4b8dbec7621fcedaf1b7a4",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2373,7 +2409,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Year over Year"
+ "text": "2.9"
},
{
"type": "UncategorizedText",
@@ -2413,25 +2449,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "123157612cd26d61b4760a5ecd1f4bfc",
- "metadata": {
- "data_source": {
- "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
- "version": 265756457651539296174748931590365722430,
- "record_locator": {
- "protocol": "s3",
- "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf"
- },
- "date_modified": "2023-02-14T07:31:28"
- },
- "filetype": "application/pdf",
- "page_number": 7
- },
- "text": "2.5 1.8 4.7 3.5 4.1 5.6"
- },
- {
- "type": "UncategorizedText",
- "element_id": "9d1bc5abd6f3e9c4c6ccb572ae521387",
+ "element_id": "4b48b0469ba9682a3e385ee7fbb6bbed",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2445,11 +2463,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3"
+ "text": "4.3 2.6 5.5"
},
{
"type": "UncategorizedText",
- "element_id": "7fdc64e781146808df57eac112860f9b",
+ "element_id": "777e0063772d428bf1c04383b8ad058e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2463,7 +2481,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "3.4 2.7 4.6"
+ "text": "1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4"
},
{
"type": "UncategorizedText",
@@ -2485,7 +2503,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "4b48b0469ba9682a3e385ee7fbb6bbed",
+ "element_id": "123157612cd26d61b4760a5ecd1f4bfc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2499,11 +2517,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "4.3 2.6 5.5"
+ "text": "2.5 1.8 4.7 3.5 4.1 5.6"
},
{
"type": "UncategorizedText",
- "element_id": "777e0063772d428bf1c04383b8ad058e",
+ "element_id": "7fdc64e781146808df57eac112860f9b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2517,11 +2535,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4"
+ "text": "3.4 2.7 4.6"
},
{
- "type": "Title",
- "element_id": "aa22eb2e58c7cf45c528550d68e15c51",
+ "type": "UncategorizedText",
+ "element_id": "9d1bc5abd6f3e9c4c6ccb572ae521387",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2535,7 +2553,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Difference from October 2022"
+ "text": "4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3"
},
{
"type": "Title",
@@ -2575,7 +2593,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "effb80722a72ecff482b7a0d4a027e78",
+ "element_id": "84bc47d0d0703878a250620230630525",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2589,11 +2607,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "0.3 0.0 –0.2 –0.4 0.4 0.0"
+ "text": "–3.3 –0.1"
},
{
"type": "UncategorizedText",
- "element_id": "d35a737537febb07f01925c873444cbc",
+ "element_id": "effb80722a72ecff482b7a0d4a027e78",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2607,7 +2625,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "–0.1 0.0 –0.3"
+ "text": "0.3 0.0 –0.2 –0.4 0.4 0.0"
},
{
"type": "UncategorizedText",
@@ -2629,7 +2647,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "84bc47d0d0703878a250620230630525",
+ "element_id": "d35a737537febb07f01925c873444cbc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2643,7 +2661,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "–3.3 –0.1"
+ "text": "–0.1 0.0 –0.3"
},
{
"type": "UncategorizedText",
@@ -2719,7 +2737,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "4d702c47ea48fa0dca98ce691995cc1b",
+ "element_id": "037023840d334f9f357a6c3da2b058ff",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2733,7 +2751,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "–0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0"
+ "text": "–0.1 –0.3 –0.2 0.2 0.0 0.1"
},
{
"type": "UncategorizedText",
@@ -2755,7 +2773,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "037023840d334f9f357a6c3da2b058ff",
+ "element_id": "ebb1568088af8b7c7b98878b895decaf",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2769,7 +2787,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "–0.1 –0.3 –0.2 0.2 0.0 0.1"
+ "text": "–0.9 0.3"
},
{
"type": "UncategorizedText",
@@ -2791,7 +2809,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "ebb1568088af8b7c7b98878b895decaf",
+ "element_id": "4d702c47ea48fa0dca98ce691995cc1b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2805,7 +2823,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "–0.9 0.3"
+ "text": "–0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0"
},
{
"type": "UncategorizedText",
@@ -2879,24 +2897,6 @@
},
"text": "9.2 7.8 10.4"
},
- {
- "type": "UncategorizedText",
- "element_id": "d7b26ee43ca5481505ca9eb7c3b29b2c",
- "metadata": {
- "data_source": {
- "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
- "version": 265756457651539296174748931590365722430,
- "record_locator": {
- "protocol": "s3",
- "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf"
- },
- "date_modified": "2023-02-14T07:31:28"
- },
- "filetype": "application/pdf",
- "page_number": 7
- },
- "text": "2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0"
- },
{
"type": "UncategorizedText",
"element_id": "3d5c2c97e00e0c5be2a870cf1cbaac06",
@@ -2917,7 +2917,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "708c57a76a5cf81dc197cc1bd612adb2",
+ "element_id": "d7b26ee43ca5481505ca9eb7c3b29b2c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2931,11 +2931,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": ". . . . . . . . ."
+ "text": "2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0"
},
{
"type": "UncategorizedText",
- "element_id": "eae9d4d60a1fe2df23f7b65ae3d76ca8",
+ "element_id": "eca06fdd26e513a7b8510c8660228504",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2949,11 +2949,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4"
+ "text": "1.9"
},
{
"type": "UncategorizedText",
- "element_id": "eca06fdd26e513a7b8510c8660228504",
+ "element_id": "4d5d14d8c932363fe84036564c6c582b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2967,11 +2967,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "1.9"
+ "text": "1.7 1.8 3.7 . . . 2.5 . . ."
},
{
"type": "UncategorizedText",
- "element_id": "4d5d14d8c932363fe84036564c6c582b",
+ "element_id": "708c57a76a5cf81dc197cc1bd612adb2",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -2985,11 +2985,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "1.7 1.8 3.7 . . . 2.5 . . ."
+ "text": ". . . . . . . . ."
},
{
- "type": "Title",
- "element_id": "8c327a62ae0e925498f5c68b819b32b4",
+ "type": "UncategorizedText",
+ "element_id": "eae9d4d60a1fe2df23f7b65ae3d76ca8",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3003,7 +3003,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Q4 over Q4 2/"
+ "text": "1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4"
},
{
"type": "Title",
@@ -3097,7 +3097,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "3135d2d71bff77be4838a7102bbac5b8",
+ "element_id": "708c57a76a5cf81dc197cc1bd612adb2",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3111,11 +3111,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "3.2"
+ "text": ". . . . . . . . ."
},
{
"type": "UncategorizedText",
- "element_id": "708c57a76a5cf81dc197cc1bd612adb2",
+ "element_id": "3135d2d71bff77be4838a7102bbac5b8",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3129,7 +3129,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": ". . . . . . . . ."
+ "text": "3.2"
},
{
"type": "UncategorizedText",
@@ -3187,7 +3187,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "07adb8acdd66b5d2490e542ae0604b71",
+ "element_id": "39b99440eae2f9ee75cf98100c285787",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3201,11 +3201,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8"
+ "text": "2.5 2.0 4.0 . . . 4.1 . . ."
},
{
"type": "UncategorizedText",
- "element_id": "39b99440eae2f9ee75cf98100c285787",
+ "element_id": "708c57a76a5cf81dc197cc1bd612adb2",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3219,7 +3219,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "2.5 2.0 4.0 . . . 4.1 . . ."
+ "text": ". . . . . . . . ."
},
{
"type": "UncategorizedText",
@@ -3241,7 +3241,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "a416ea84421fa7e1351582da48235bac",
+ "element_id": "1776cf91dccdf2cce268fcee416b28f6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3255,11 +3255,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "3.0"
+ "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2"
},
{
"type": "UncategorizedText",
- "element_id": "1776cf91dccdf2cce268fcee416b28f6",
+ "element_id": "07adb8acdd66b5d2490e542ae0604b71",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3273,11 +3273,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2"
+ "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8"
},
{
"type": "UncategorizedText",
- "element_id": "708c57a76a5cf81dc197cc1bd612adb2",
+ "element_id": "a416ea84421fa7e1351582da48235bac",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3291,11 +3291,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": ". . . . . . . . ."
+ "text": "3.0"
},
{
"type": "NarrativeText",
- "element_id": "df59a495ef85c5f70c5ba5356caf764a",
+ "element_id": "dd295fca8aff81058c48312a022b69b2",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3309,11 +3309,11 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Upside risks—Plausible upside risks include more favorable surprises to domestic spending—as in the third quarter of 2022—which, however, would increase inflation further. At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:"
+ "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024."
},
{
"type": "NarrativeText",
- "element_id": "dd295fca8aff81058c48312a022b69b2",
+ "element_id": "df59a495ef85c5f70c5ba5356caf764a",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3327,7 +3327,7 @@
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024."
+ "text": "Upside risks—Plausible upside risks include more favorable surprises to domestic spending—as in the third quarter of 2022—which, however, would increase inflation further. At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:"
},
{
"type": "ListItem",
@@ -3510,8 +3510,8 @@
"text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital"
},
{
- "type": "ListItem",
- "element_id": "42ac57e394bf7c98d908745cefce0b80",
+ "type": "NarrativeText",
+ "element_id": "1bbcee85386321e6e8235a64d4c34d73",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3525,11 +3525,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of"
+ "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems."
},
{
- "type": "NarrativeText",
- "element_id": "1bbcee85386321e6e8235a64d4c34d73",
+ "type": "ListItem",
+ "element_id": "42ac57e394bf7c98d908745cefce0b80",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3543,7 +3543,7 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems."
+ "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of"
},
{
"type": "NarrativeText",
@@ -3565,7 +3565,7 @@
},
{
"type": "ListItem",
- "element_id": "2d14934d52ff357c52e9ae1c38f7390e",
+ "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3579,11 +3579,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy."
+ "text": ""
},
{
"type": "ListItem",
- "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+ "element_id": "2d14934d52ff357c52e9ae1c38f7390e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3597,7 +3597,7 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": ""
+ "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy."
},
{
"type": "ListItem",
@@ -3960,8 +3960,8 @@
"text": "Strengthening multilateral cooperation—Urgent action is needed to limit the risks stemming from geopolitical fragmentation and to ensure cooperation on fundamental areas of common interest:"
},
{
- "type": "NarrativeText",
- "element_id": "cb704f1b6d23bfe23f6b4109c471ac8b",
+ "type": "ListItem",
+ "element_id": "bd7674df887463bc9f05c8030a151dea",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3975,11 +3975,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes."
+ "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global"
},
{
- "type": "ListItem",
- "element_id": "bd7674df887463bc9f05c8030a151dea",
+ "type": "NarrativeText",
+ "element_id": "cb704f1b6d23bfe23f6b4109c471ac8b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -3993,7 +3993,7 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global"
+ "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes."
},
{
"type": "ListItem",
@@ -4014,8 +4014,8 @@
"text": "Strengthening global trade: Strengthening the global trading system would address risks associated"
},
{
- "type": "NarrativeText",
- "element_id": "e6f343736720ae4f9bf5202294c7c9fc",
+ "type": "Title",
+ "element_id": "0695b563acde461fc2f8d9aebccf35c7",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4029,11 +4029,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system."
+ "text": "with"
},
{
- "type": "Title",
- "element_id": "0695b563acde461fc2f8d9aebccf35c7",
+ "type": "NarrativeText",
+ "element_id": "e6f343736720ae4f9bf5202294c7c9fc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4047,7 +4047,7 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "with"
+ "text": "trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system."
},
{
"type": "ListItem",
@@ -4194,8 +4194,8 @@
"text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies."
},
{
- "type": "NarrativeText",
- "element_id": "261bebc8fb9b3ed5146d23644639bc26",
+ "type": "UncategorizedText",
+ "element_id": "a43f5d32a34c9b54fe96097c3d491389",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4209,11 +4209,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need © —— Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report. to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess theit outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked."
+ "text": "–3"
},
{
- "type": "NarrativeText",
- "element_id": "e118be83abfed92b8969eca98bb4d53b",
+ "type": "UncategorizedText",
+ "element_id": "28a5aa3897d66de6c31caba99a4c337e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4227,11 +4227,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives."
+ "text": "–2"
},
{
"type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "element_id": "467792e5d9b6bec26f556875e9ccab10",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4245,11 +4245,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "1"
+ "text": "–1"
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4263,11 +4263,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "2"
+ "text": "1"
},
{
"type": "UncategorizedText",
- "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4281,11 +4281,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "3"
+ "text": "2"
},
{
"type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4299,11 +4299,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "4"
+ "text": "3"
},
{
"type": "UncategorizedText",
- "element_id": "ef2d127de37b942baad06145e54b0c61",
+ "element_id": "5feceb66ffc86f38d952786c6d696c79",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4317,11 +4317,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "5"
+ "text": "0"
},
{
"type": "UncategorizedText",
- "element_id": "e7f6c011776e8db7cd330b54174fd76f",
+ "element_id": "ef2d127de37b942baad06145e54b0c61",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4335,11 +4335,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "6"
+ "text": "5"
},
{
- "type": "Title",
- "element_id": "6ef230728534d871e5126e2a55e12b26",
+ "type": "UncategorizedText",
+ "element_id": "e7f6c011776e8db7cd330b54174fd76f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4353,11 +4353,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)"
+ "text": "6"
},
{
- "type": "Title",
- "element_id": "57de33ba9eaa9e5980d4cf6da83abf46",
+ "type": "UncategorizedText",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4371,11 +4371,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)"
+ "text": "4"
},
{
"type": "UncategorizedText",
- "element_id": "467792e5d9b6bec26f556875e9ccab10",
+ "element_id": "7902699be42c8a8e46fbbb4501726517",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4389,11 +4389,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "–1"
+ "text": "7"
},
{
"type": "UncategorizedText",
- "element_id": "28a5aa3897d66de6c31caba99a4c337e",
+ "element_id": "4108466a9a52ce87e39eb1836a42f6f2",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4407,11 +4407,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "–2"
+ "text": "2006 08 08"
},
{
- "type": "UncategorizedText",
- "element_id": "a43f5d32a34c9b54fe96097c3d491389",
+ "type": "Title",
+ "element_id": "57de33ba9eaa9e5980d4cf6da83abf46",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4425,7 +4425,7 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "–3"
+ "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)"
},
{
"type": "NarrativeText",
@@ -4446,8 +4446,8 @@
"text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report."
},
{
- "type": "Title",
- "element_id": "49cf8421218222b21a0fc54ffce584c9",
+ "type": "UncategorizedText",
+ "element_id": "aacd834b5cdc64a329e27649143406dd",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4461,11 +4461,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Oct. 22"
+ "text": "06"
},
{
"type": "UncategorizedText",
- "element_id": "7902699be42c8a8e46fbbb4501726517",
+ "element_id": "785329d8f1c63e8d0cdeedba9e6bc2ea",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4479,11 +4479,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "7"
+ "text": "10 10"
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "1e46bf7c5134da75e3a2aae852d7bddf",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4497,11 +4497,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "2"
+ "text": "12 12"
},
{
- "type": "UncategorizedText",
- "element_id": "e7f6c011776e8db7cd330b54174fd76f",
+ "type": "Title",
+ "element_id": "4255f2d53f6408c450b02b249d53c220",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4515,11 +4515,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "6"
+ "text": "United States Euro area China Other AEs Other EMs"
},
{
"type": "UncategorizedText",
- "element_id": "ef2d127de37b942baad06145e54b0c61",
+ "element_id": "c81a1234a265c680bbc9e96e73073acd",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4533,11 +4533,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "5"
+ "text": "14 16 14"
},
{
"type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "element_id": "b17ef6d19c7a5b1ee83b907c595526dc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4551,11 +4551,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "4"
+ "text": "16"
},
{
"type": "UncategorizedText",
- "element_id": "4108466a9a52ce87e39eb1836a42f6f2",
+ "element_id": "99cb7a0185216a0acb0ed918e7058868",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4569,11 +4569,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "2006 08 08"
+ "text": "18 18"
},
{
"type": "UncategorizedText",
- "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "element_id": "0c5e98c11d7bb005adbaf731ebfbbb2c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4587,11 +4587,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "0"
+ "text": "20 22 22"
},
{
"type": "UncategorizedText",
- "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
+ "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4605,11 +4605,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "3"
+ "text": "20"
},
{
- "type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "type": "Title",
+ "element_id": "53d79cec96694df67ce3baff95d8a2e3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4623,11 +4623,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "1"
+ "text": "October 2022 GFSR"
},
{
- "type": "UncategorizedText",
- "element_id": "aacd834b5cdc64a329e27649143406dd",
+ "type": "NarrativeText",
+ "element_id": "e118be83abfed92b8969eca98bb4d53b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4641,11 +4641,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "06"
+ "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives."
},
{
- "type": "Title",
- "element_id": "24a234895630131d612fc1b4605a256e",
+ "type": "NarrativeText",
+ "element_id": "261bebc8fb9b3ed5146d23644639bc26",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4659,11 +4659,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Apr. 23"
+ "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need © —— Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report. to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess theit outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked."
},
{
- "type": "ListItem",
- "element_id": "7d4f55875c970d850a152ba1d5ba02a5",
+ "type": "UncategorizedText",
+ "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4677,11 +4677,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "1. United States"
+ "text": "3"
},
{
- "type": "Title",
- "element_id": "914e31edcbd035dbe9f1cfb7b29089a9",
+ "type": "UncategorizedText",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4695,11 +4695,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Oct. 23"
+ "text": "1"
},
{
- "type": "Title",
- "element_id": "8730d3c2022abf1f9665e4ca1da43e4d",
+ "type": "UncategorizedText",
+ "element_id": "e7f6c011776e8db7cd330b54174fd76f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4713,11 +4713,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Latest"
+ "text": "6"
},
{
"type": "UncategorizedText",
- "element_id": "785329d8f1c63e8d0cdeedba9e6bc2ea",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4731,11 +4731,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "10 10"
+ "text": "2"
},
{
- "type": "Title",
- "element_id": "d8478f45b9790d52201238244d0e9698",
+ "type": "UncategorizedText",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4749,11 +4749,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Dec. 24"
+ "text": "4"
},
{
"type": "UncategorizedText",
- "element_id": "1e46bf7c5134da75e3a2aae852d7bddf",
+ "element_id": "ef2d127de37b942baad06145e54b0c61",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4767,11 +4767,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "12 12"
+ "text": "5"
},
{
"type": "Title",
- "element_id": "fe1cc1c654c8a4fde402cfe2426326ef",
+ "element_id": "49cf8421218222b21a0fc54ffce584c9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4785,11 +4785,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Dec. 26"
+ "text": "Oct. 22"
},
{
"type": "Title",
- "element_id": "4255f2d53f6408c450b02b249d53c220",
+ "element_id": "6ef230728534d871e5126e2a55e12b26",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4803,11 +4803,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "United States Euro area China Other AEs Other EMs"
+ "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)"
},
{
- "type": "UncategorizedText",
- "element_id": "c81a1234a265c680bbc9e96e73073acd",
+ "type": "Title",
+ "element_id": "24a234895630131d612fc1b4605a256e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4821,11 +4821,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "14 16 14"
+ "text": "Apr. 23"
},
{
- "type": "Title",
- "element_id": "49cf8421218222b21a0fc54ffce584c9",
+ "type": "ListItem",
+ "element_id": "7d4f55875c970d850a152ba1d5ba02a5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4839,11 +4839,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Oct. 22"
+ "text": "1. United States"
},
{
"type": "Title",
- "element_id": "53d79cec96694df67ce3baff95d8a2e3",
+ "element_id": "914e31edcbd035dbe9f1cfb7b29089a9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4857,11 +4857,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "October 2022 GFSR"
+ "text": "Oct. 23"
},
{
- "type": "ListItem",
- "element_id": "8e655408cf212df5f74df13e05cdf02c",
+ "type": "Title",
+ "element_id": "8730d3c2022abf1f9665e4ca1da43e4d",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4875,11 +4875,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "2. Euro area"
+ "text": "Latest"
},
{
- "type": "UncategorizedText",
- "element_id": "b17ef6d19c7a5b1ee83b907c595526dc",
+ "type": "Title",
+ "element_id": "d8478f45b9790d52201238244d0e9698",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4893,11 +4893,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "16"
+ "text": "Dec. 24"
},
{
"type": "Title",
- "element_id": "24a234895630131d612fc1b4605a256e",
+ "element_id": "fe1cc1c654c8a4fde402cfe2426326ef",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4911,11 +4911,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Apr. 23"
+ "text": "Dec. 26"
},
{
- "type": "UncategorizedText",
- "element_id": "99cb7a0185216a0acb0ed918e7058868",
+ "type": "Title",
+ "element_id": "49cf8421218222b21a0fc54ffce584c9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4929,11 +4929,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "18 18"
+ "text": "Oct. 22"
},
{
"type": "Title",
- "element_id": "914e31edcbd035dbe9f1cfb7b29089a9",
+ "element_id": "53d79cec96694df67ce3baff95d8a2e3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4947,11 +4947,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Oct. 23"
+ "text": "October 2022 GFSR"
},
{
- "type": "UncategorizedText",
- "element_id": "0c5e98c11d7bb005adbaf731ebfbbb2c",
+ "type": "ListItem",
+ "element_id": "8e655408cf212df5f74df13e05cdf02c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4965,11 +4965,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "20 22 22"
+ "text": "2. Euro area"
},
{
"type": "Title",
- "element_id": "d8478f45b9790d52201238244d0e9698",
+ "element_id": "24a234895630131d612fc1b4605a256e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -4983,11 +4983,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "Dec. 24"
+ "text": "Apr. 23"
},
{
- "type": "UncategorizedText",
- "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
+ "type": "Title",
+ "element_id": "914e31edcbd035dbe9f1cfb7b29089a9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -5001,11 +5001,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "20"
+ "text": "Oct. 23"
},
{
"type": "Title",
- "element_id": "53d79cec96694df67ce3baff95d8a2e3",
+ "element_id": "d8478f45b9790d52201238244d0e9698",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -5019,7 +5019,7 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "October 2022 GFSR"
+ "text": "Dec. 24"
},
{
"type": "Title",
@@ -5041,7 +5041,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "ef2d127de37b942baad06145e54b0c61",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -5055,11 +5055,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "2"
+ "text": "5"
},
{
"type": "UncategorizedText",
- "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -5073,11 +5073,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "3"
+ "text": "4"
},
{
"type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -5091,11 +5091,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "4"
+ "text": "3"
},
{
"type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -5109,11 +5109,11 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "1"
+ "text": "2"
},
{
"type": "UncategorizedText",
- "element_id": "ef2d127de37b942baad06145e54b0c61",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf",
@@ -5127,7 +5127,7 @@
"filetype": "application/pdf",
"page_number": 11
},
- "text": "5"
+ "text": "1"
},
{
"type": "NarrativeText",
diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json
index b5153e745f..ef0bcde4d8 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json
@@ -199,7 +199,7 @@
},
{
"type": "ListItem",
- "element_id": "3cc3e847449fed4fa13bbd94f86e43a9",
+ "element_id": "9c4387f669c689e9af0a712fd494b2d7",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -213,11 +213,11 @@
"filetype": "application/pdf",
"page_number": 3
},
- "text": "The need to create a level playing field that values reliability and energy security"
+ "text": "The need for harmony in the nuclear regulatory environment"
},
{
"type": "ListItem",
- "element_id": "9c4387f669c689e9af0a712fd494b2d7",
+ "element_id": "93e7dedc9d334470067ad2de1f9ee788",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -231,11 +231,11 @@
"filetype": "application/pdf",
"page_number": 3
},
- "text": "The need for harmony in the nuclear regulatory environment"
+ "text": "The need for a holistic safety paradigm for the whole electricity system."
},
{
"type": "ListItem",
- "element_id": "93e7dedc9d334470067ad2de1f9ee788",
+ "element_id": "3cc3e847449fed4fa13bbd94f86e43a9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -249,7 +249,7 @@
"filetype": "application/pdf",
"page_number": 3
},
- "text": "The need for a holistic safety paradigm for the whole electricity system."
+ "text": "The need to create a level playing field that values reliability and energy security"
},
{
"type": "UncategorizedText",
@@ -342,8 +342,8 @@
"text": " Marine"
},
{
- "type": "Title",
- "element_id": "563a2980d46c81119e1d7d952b375a41",
+ "type": "UncategorizedText",
+ "element_id": "9925953f1faef050547e5f7b811c3f7d",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -357,11 +357,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "h W T"
+ "text": "40,000"
},
{
- "type": "UncategorizedText",
- "element_id": "9925953f1faef050547e5f7b811c3f7d",
+ "type": "Title",
+ "element_id": "d04999bf99ea28fc8a6b20318caac58c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -375,11 +375,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "40,000"
+ "text": " CSP"
},
{
- "type": "UncategorizedText",
- "element_id": "4ebe55cc1aee6dd892d7182d797d105a",
+ "type": "Title",
+ "element_id": "563a2980d46c81119e1d7d952b375a41",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -393,11 +393,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "35,000"
+ "text": "h W T"
},
{
"type": "UncategorizedText",
- "element_id": "422f240e43a3226f329ba4a0236f587c",
+ "element_id": "4ebe55cc1aee6dd892d7182d797d105a",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -411,11 +411,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "30,000"
+ "text": "35,000"
},
{
"type": "UncategorizedText",
- "element_id": "c7e6673590d2426f635c9be70bd8f057",
+ "element_id": "422f240e43a3226f329ba4a0236f587c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -429,11 +429,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "25,000"
+ "text": "30,000"
},
{
"type": "UncategorizedText",
- "element_id": "b6b53b7d4224992f9aa86411bbc3f74b",
+ "element_id": "c7e6673590d2426f635c9be70bd8f057",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -447,11 +447,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "20,000"
+ "text": "25,000"
},
{
"type": "UncategorizedText",
- "element_id": "b2ee3509c1fa4f9741f894e592bda9ac",
+ "element_id": "b6b53b7d4224992f9aa86411bbc3f74b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -465,11 +465,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "15,000"
+ "text": "20,000"
},
{
"type": "UncategorizedText",
- "element_id": "28ec039832f5bc96c2be0eaee016dafe",
+ "element_id": "b2ee3509c1fa4f9741f894e592bda9ac",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -483,11 +483,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "10,000"
+ "text": "15,000"
},
{
"type": "UncategorizedText",
- "element_id": "b2008c37ee3a7cf7ba87f5ad50dd9e11",
+ "element_id": "28ec039832f5bc96c2be0eaee016dafe",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -501,11 +501,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "5,000"
+ "text": "10,000"
},
{
"type": "UncategorizedText",
- "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "element_id": "b2008c37ee3a7cf7ba87f5ad50dd9e11",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -519,7 +519,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "0"
+ "text": "5,000"
},
{
"type": "Title",
@@ -648,8 +648,8 @@
"text": "__"
},
{
- "type": "UncategorizedText",
- "element_id": "81a83544cf93c245178cbc1620030f11",
+ "type": "Title",
+ "element_id": "8af26217282646d0f64d3e3211f47512",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -663,11 +663,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2000"
+ "text": " Solar PV"
},
{
- "type": "UncategorizedText",
- "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37",
+ "type": "Title",
+ "element_id": "6e28663850f2b50ee6af2d4477b410be",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -681,11 +681,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2010"
+ "text": " Geothermal"
},
{
- "type": "UncategorizedText",
- "element_id": "73a2af8864fc500fa49048bf3003776c",
+ "type": "Title",
+ "element_id": "7e2f430d44cfb03dca12ffde615c36ec",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -699,11 +699,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2020"
+ "text": " Wind"
},
{
- "type": "UncategorizedText",
- "element_id": "8e1f192fe25ad49be764c3f55c68beb3",
+ "type": "Title",
+ "element_id": "bde9df80639b681edb85ace46b4d4600",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -717,11 +717,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2030"
+ "text": " Bioenergy"
},
{
- "type": "UncategorizedText",
- "element_id": "df34d853f2f2f1f14b92359f695426dc",
+ "type": "Title",
+ "element_id": "b449cd843dc44ab907e1e9ed9c30d92e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -735,11 +735,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2040"
+ "text": " Hydro"
},
{
"type": "Title",
- "element_id": "d04999bf99ea28fc8a6b20318caac58c",
+ "element_id": "f35457739b3bd74c61625c986c844726",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -753,11 +753,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " CSP"
+ "text": " Nuclear"
},
{
"type": "Title",
- "element_id": "8af26217282646d0f64d3e3211f47512",
+ "element_id": "0f3341ae76e0d4d7816d3620bd915110",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -771,11 +771,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Solar PV"
+ "text": " Gas"
},
{
"type": "Title",
- "element_id": "6e28663850f2b50ee6af2d4477b410be",
+ "element_id": "b001a2374d44e3085e712bb40f66270e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -789,11 +789,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Geothermal"
+ "text": " Oil"
},
{
"type": "Title",
- "element_id": "7e2f430d44cfb03dca12ffde615c36ec",
+ "element_id": "90ad0c8c14253135efd14645e0156145",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -807,11 +807,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Wind"
+ "text": " Coal"
},
{
- "type": "Title",
- "element_id": "bde9df80639b681edb85ace46b4d4600",
+ "type": "UncategorizedText",
+ "element_id": "5feceb66ffc86f38d952786c6d696c79",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -825,11 +825,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Bioenergy"
+ "text": "0"
},
{
- "type": "Title",
- "element_id": "b449cd843dc44ab907e1e9ed9c30d92e",
+ "type": "UncategorizedText",
+ "element_id": "81a83544cf93c245178cbc1620030f11",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -843,11 +843,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Hydro"
+ "text": "2000"
},
{
- "type": "Title",
- "element_id": "f35457739b3bd74c61625c986c844726",
+ "type": "UncategorizedText",
+ "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -861,11 +861,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Nuclear"
+ "text": "2010"
},
{
- "type": "Title",
- "element_id": "0f3341ae76e0d4d7816d3620bd915110",
+ "type": "UncategorizedText",
+ "element_id": "73a2af8864fc500fa49048bf3003776c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -879,11 +879,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Gas"
+ "text": "2020"
},
{
- "type": "Title",
- "element_id": "b001a2374d44e3085e712bb40f66270e",
+ "type": "UncategorizedText",
+ "element_id": "8e1f192fe25ad49be764c3f55c68beb3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -897,11 +897,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Oil"
+ "text": "2030"
},
{
- "type": "Title",
- "element_id": "90ad0c8c14253135efd14645e0156145",
+ "type": "UncategorizedText",
+ "element_id": "df34d853f2f2f1f14b92359f695426dc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -915,7 +915,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " Coal"
+ "text": "2040"
},
{
"type": "NarrativeText",
@@ -972,8 +972,8 @@
"text": "Despite the very considerable efforts to decarbonize the economy and the countless billions spent, our world remains heavily addicted to fossil fuels. The trend is clear – instead of reducing our dependence on fossil fuels, we are increasing it (Figure 2). As a direct result, greenhouse gas emissions continue to rise when they need to drastically fall."
},
{
- "type": "Title",
- "element_id": "a5d60fc4dbbd484074d8389c35703cf7",
+ "type": "UncategorizedText",
+ "element_id": "ebc18f485dc347b842b3d248d011ce6c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -987,11 +987,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "h W G"
+ "text": "30,000,000"
},
{
- "type": "UncategorizedText",
- "element_id": "ebc18f485dc347b842b3d248d011ce6c",
+ "type": "Title",
+ "element_id": "a5d60fc4dbbd484074d8389c35703cf7",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1005,7 +1005,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "30,000,000"
+ "text": "h W G"
},
{
"type": "UncategorizedText",
@@ -1098,8 +1098,8 @@
"text": "5,000,000"
},
{
- "type": "UncategorizedText",
- "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "type": "Title",
+ "element_id": "e3cf3e34001852adb7a17cf424bda9fc",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1113,11 +1113,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "0"
+ "text": " High-carbon Low-carbon"
},
{
- "type": "Title",
- "element_id": "e3cf3e34001852adb7a17cf424bda9fc",
+ "type": "UncategorizedText",
+ "element_id": "5feceb66ffc86f38d952786c6d696c79",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1131,7 +1131,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": " High-carbon Low-carbon"
+ "text": "0"
},
{
"type": "UncategorizedText",
@@ -1908,8 +1908,8 @@
"text": "140"
},
{
- "type": "Title",
- "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
+ "type": "NarrativeText",
+ "element_id": "e11247712b3df61756970b45f019ad68",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1923,11 +1923,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "a t a F"
+ "text": "r a e y"
},
{
- "type": "NarrativeText",
- "element_id": "e11247712b3df61756970b45f019ad68",
+ "type": "Title",
+ "element_id": "3f79bb7b435b05321651daefd374cdc6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1941,11 +1941,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "r a e y"
+ "text": "e"
},
{
- "type": "Title",
- "element_id": "f83714d89302473e0e4f5399bd50e7a9",
+ "type": "UncategorizedText",
+ "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1959,11 +1959,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "W T"
+ "text": "120"
},
{
- "type": "NarrativeText",
- "element_id": "f9bb49945b60897227abdd75b5f8d39b",
+ "type": "UncategorizedText",
+ "element_id": "ad57366865126e55649ecb23ae1d4888",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1977,11 +1977,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "r e p s e i t i l"
+ "text": "100"
},
{
"type": "UncategorizedText",
- "element_id": "380918b946a526640a40df5dced65167",
+ "element_id": "5bddd069fd77ec5699d9ab00c00f47c4",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -1995,11 +1995,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "="
+ "text": "1 :"
},
{
"type": "UncategorizedText",
- "element_id": "911bc18af1665a604b4fa4a97d47f477",
+ "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2013,11 +2013,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "“99 :"
+ "text": "120"
},
{
- "type": "Title",
- "element_id": "3f79bb7b435b05321651daefd374cdc6",
+ "type": "UncategorizedText",
+ "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2031,11 +2031,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "e"
+ "text": ":"
},
{
"type": "UncategorizedText",
- "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
+ "element_id": "b725d20650649a5221675144bab5946e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2049,11 +2049,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "120"
+ "text": "99.5"
},
{
- "type": "UncategorizedText",
- "element_id": "ad57366865126e55649ecb23ae1d4888",
+ "type": "Title",
+ "element_id": "f83714d89302473e0e4f5399bd50e7a9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2067,11 +2067,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "100"
+ "text": "W T"
},
{
"type": "UncategorizedText",
- "element_id": "d59eced1ded07f84c145592f65bdf854",
+ "element_id": "380918b946a526640a40df5dced65167",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2085,11 +2085,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "40"
+ "text": "="
},
{
- "type": "UncategorizedText",
- "element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
+ "type": "NarrativeText",
+ "element_id": "f9bb49945b60897227abdd75b5f8d39b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2103,11 +2103,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "60"
+ "text": "r e p s e i t i l"
},
{
"type": "UncategorizedText",
- "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
+ "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2121,11 +2121,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "20"
+ "text": "80"
},
{
"type": "UncategorizedText",
- "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
+ "element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2139,11 +2139,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "80"
+ "text": "60"
},
{
"type": "UncategorizedText",
- "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "element_id": "ce3201efc2e495241a85e4fc84575f50",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2157,11 +2157,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "0"
+ "text": "71.9"
},
{
"type": "UncategorizedText",
- "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2175,11 +2175,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": ":"
+ "text": "1"
},
{
- "type": "UncategorizedText",
- "element_id": "5bddd069fd77ec5699d9ab00c00f47c4",
+ "type": "Title",
+ "element_id": "1b16b1df538ba12dc3f97edbb85caa70",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2193,11 +2193,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "1 :"
+ "text": "n"
},
{
- "type": "Title",
- "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9",
+ "type": "UncategorizedText",
+ "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2211,11 +2211,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "C oal"
+ "text": "."
},
{
"type": "UncategorizedText",
- "element_id": "2abaca4911e68fa9bfbf3482ee797fd5",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2229,11 +2229,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "120"
+ "text": "1"
},
{
- "type": "UncategorizedText",
- "element_id": "b725d20650649a5221675144bab5946e",
+ "type": "Title",
+ "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2247,11 +2247,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "99.5"
+ "text": "a t a F"
},
{
- "type": "Title",
- "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
+ "type": "UncategorizedText",
+ "element_id": "d59eced1ded07f84c145592f65bdf854",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2265,11 +2265,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Oil"
+ "text": "40"
},
{
- "type": "Title",
- "element_id": "4fabb98454d019811a732c4a09f31bf0",
+ "type": "UncategorizedText",
+ "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2283,11 +2283,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "N atural gas"
+ "text": ":"
},
{
"type": "UncategorizedText",
- "element_id": "ce3201efc2e495241a85e4fc84575f50",
+ "element_id": "911bc18af1665a604b4fa4a97d47f477",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2301,11 +2301,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "71.9"
+ "text": "“99 :"
},
{
- "type": "Title",
- "element_id": "593cbe414f10662e62c0da03ce3302b8",
+ "type": "UncategorizedText",
+ "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2319,11 +2319,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "fe)"
+ "text": "20"
},
{
- "type": "Title",
- "element_id": "77cf83b127020f3a465005abc747e63f",
+ "type": "UncategorizedText",
+ "element_id": "5feceb66ffc86f38d952786c6d696c79",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2337,11 +2337,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "Offshore wind"
+ "text": "0"
},
{
- "type": "UncategorizedText",
- "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6",
+ "type": "Title",
+ "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2355,11 +2355,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": ":"
+ "text": "C oal"
},
{
- "type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "type": "Title",
+ "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2373,11 +2373,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "1"
+ "text": "Oil"
},
{
"type": "Title",
- "element_id": "1b16b1df538ba12dc3f97edbb85caa70",
+ "element_id": "4fabb98454d019811a732c4a09f31bf0",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2391,11 +2391,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "n"
+ "text": "N atural gas"
},
{
- "type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "type": "Title",
+ "element_id": "593cbe414f10662e62c0da03ce3302b8",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2409,11 +2409,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "1"
+ "text": "fe)"
},
{
- "type": "UncategorizedText",
- "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa",
+ "type": "Title",
+ "element_id": "77cf83b127020f3a465005abc747e63f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2427,7 +2427,7 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "."
+ "text": "Offshore wind"
},
{
"type": "UncategorizedText",
@@ -2646,8 +2646,8 @@
"text": "100"
},
{
- "type": "UncategorizedText",
- "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63",
+ "type": "Title",
+ "element_id": "90ad0c8c14253135efd14645e0156145",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2661,11 +2661,11 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": "90"
+ "text": " Coal"
},
{
- "type": "Title",
- "element_id": "90ad0c8c14253135efd14645e0156145",
+ "type": "UncategorizedText",
+ "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -2679,7 +2679,7 @@
"filetype": "application/pdf",
"page_number": 8
},
- "text": " Coal"
+ "text": "90"
},
{
"type": "Title",
@@ -3187,7 +3187,7 @@
},
{
"type": "Title",
- "element_id": "563a2980d46c81119e1d7d952b375a41",
+ "element_id": "f6e172956a9472fa43f9a895f99c2836",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -3201,11 +3201,11 @@
"filetype": "application/pdf",
"page_number": 9
},
- "text": "h W T"
+ "text": " Natural gas"
},
{
- "type": "UncategorizedText",
- "element_id": "983bd614bb5afece5ab3b6023f71147c",
+ "type": "Title",
+ "element_id": "563a2980d46c81119e1d7d952b375a41",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -3219,11 +3219,11 @@
"filetype": "application/pdf",
"page_number": 9
},
- "text": "300"
+ "text": "h W T"
},
{
"type": "UncategorizedText",
- "element_id": "27badc983df1780b60c2b3fa9d3a19a0",
+ "element_id": "983bd614bb5afece5ab3b6023f71147c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -3237,11 +3237,11 @@
"filetype": "application/pdf",
"page_number": 9
},
- "text": "200"
+ "text": "300"
},
{
"type": "UncategorizedText",
- "element_id": "0b06ee5051e3d7dd686665a41ae1f2d9",
+ "element_id": "27badc983df1780b60c2b3fa9d3a19a0",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -3255,11 +3255,11 @@
"filetype": "application/pdf",
"page_number": 9
},
- "text": "y ——"
+ "text": "200"
},
{
- "type": "ListItem",
- "element_id": "bda050585a00f0f6cb502350559d7553",
+ "type": "UncategorizedText",
+ "element_id": "0b06ee5051e3d7dd686665a41ae1f2d9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -3273,7 +3273,7 @@
"filetype": "application/pdf",
"page_number": 9
},
- "text": "—"
+ "text": "y ——"
},
{
"type": "ListItem",
@@ -3293,24 +3293,6 @@
},
"text": "—"
},
- {
- "type": "Title",
- "element_id": "f6e172956a9472fa43f9a895f99c2836",
- "metadata": {
- "data_source": {
- "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
- "version": 177372694731575984083482917563244941766,
- "record_locator": {
- "protocol": "s3",
- "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf"
- },
- "date_modified": "2023-02-12T10:10:36"
- },
- "filetype": "application/pdf",
- "page_number": 9
- },
- "text": " Natural gas"
- },
{
"type": "Title",
"element_id": "b449cd843dc44ab907e1e9ed9c30d92e",
@@ -3474,8 +3456,8 @@
"text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand x"
},
{
- "type": "FigureCaption",
- "element_id": "eeda9f9210dfe4be7e82b4385290d3ca",
+ "type": "NarrativeText",
+ "element_id": "4f5cc927b953f3c49c562a22c88f863f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -3489,11 +3471,11 @@
"filetype": "application/pdf",
"page_number": 9
},
- "text": "One fuel pellet contains as much energy as a tonne of coal"
+ "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times."
},
{
- "type": "NarrativeText",
- "element_id": "4f5cc927b953f3c49c562a22c88f863f",
+ "type": "FigureCaption",
+ "element_id": "eeda9f9210dfe4be7e82b4385290d3ca",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf",
@@ -3507,7 +3489,7 @@
"filetype": "application/pdf",
"page_number": 9
},
- "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times."
+ "text": "One fuel pellet contains as much energy as a tonne of coal"
},
{
"type": "NarrativeText",
diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json
index a2f18f4d0d..a0b176312b 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json
@@ -307,7 +307,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "4523540f1504cd17100c4835e85b7eef",
+ "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -321,7 +321,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "17"
+ "text": "30"
},
{
"type": "UncategorizedText",
@@ -343,7 +343,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
+ "element_id": "4523540f1504cd17100c4835e85b7eef",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -357,7 +357,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "30"
+ "text": "17"
},
{
"type": "UncategorizedText",
@@ -415,7 +415,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -429,11 +429,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2"
+ "text": "1"
},
{
"type": "UncategorizedText",
- "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -447,11 +447,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "3"
+ "text": "2"
},
{
"type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "element_id": "4e07408562bedb8b60ce05c1decfe3ad",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -465,7 +465,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "1"
+ "text": "3"
},
{
"type": "UncategorizedText",
@@ -505,7 +505,7 @@
},
{
"type": "Title",
- "element_id": "1656c455012b016fbac5eac0a38397bd",
+ "element_id": "eda8f72476c539920d2c0e3515ba4b07",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -519,11 +519,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Electric power (non-nuclear)"
+ "text": "Smoking"
},
{
"type": "Title",
- "element_id": "ed3861e631428b9b77e2bdc0384d2cbe",
+ "element_id": "f8e3740e358309bd0570d4f3ca141793",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -537,11 +537,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Vaccinations"
+ "text": "Handguns"
},
{
"type": "Title",
- "element_id": "eda8f72476c539920d2c0e3515ba4b07",
+ "element_id": "ed3861e631428b9b77e2bdc0384d2cbe",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -555,11 +555,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Smoking"
+ "text": "Vaccinations"
},
{
"type": "Title",
- "element_id": "f8e3740e358309bd0570d4f3ca141793",
+ "element_id": "602d25f25cca4ebb709f8b48f54d99d9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -573,11 +573,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Handguns"
+ "text": "Motor vehicles"
},
{
"type": "Title",
- "element_id": "602d25f25cca4ebb709f8b48f54d99d9",
+ "element_id": "82a60569029ed9032f1b08891e8524c2",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -591,11 +591,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Motor vehicles"
+ "text": "Nuclear power"
},
{
"type": "Title",
- "element_id": "82a60569029ed9032f1b08891e8524c2",
+ "element_id": "1656c455012b016fbac5eac0a38397bd",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -609,7 +609,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Nuclear power"
+ "text": "Electric power (non-nuclear)"
},
{
"type": "Title",
@@ -703,7 +703,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "d1429f8178a04f7fc73a66edf10ab8b5",
+ "element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -717,11 +717,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": ""
+ "text": "4"
},
{
"type": "UncategorizedText",
- "element_id": "4b227777d4dd1fc61c6f884f48641d02",
+ "element_id": "19581e27de7ced00ff1ce50b2047e7a5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -735,11 +735,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "4"
+ "text": "9"
},
{
"type": "UncategorizedText",
- "element_id": "19581e27de7ced00ff1ce50b2047e7a5",
+ "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -753,11 +753,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "9"
+ "text": "1"
},
{
"type": "UncategorizedText",
- "element_id": "6b86b273ff34fce19d6b804eff5a3f57",
+ "element_id": "7902699be42c8a8e46fbbb4501726517",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -771,11 +771,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "1"
+ "text": "7"
},
{
"type": "UncategorizedText",
- "element_id": "7902699be42c8a8e46fbbb4501726517",
+ "element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -789,11 +789,11 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "7"
+ "text": "2"
},
{
"type": "UncategorizedText",
- "element_id": "d4735e3a265e16eee03f59718b9b5d03",
+ "element_id": "d1429f8178a04f7fc73a66edf10ab8b5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -807,7 +807,7 @@
"filetype": "application/pdf",
"page_number": 4
},
- "text": "2"
+ "text": ""
},
{
"type": "NarrativeText",
@@ -1116,8 +1116,8 @@
"text": "r a e y"
},
{
- "type": "UncategorizedText",
- "element_id": "dca468ba69cda6650ce03d976c274c66",
+ "type": "Title",
+ "element_id": "3f79bb7b435b05321651daefd374cdc6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1131,11 +1131,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "S15"
+ "text": "e"
},
{
- "type": "Title",
- "element_id": "3f79bb7b435b05321651daefd374cdc6",
+ "type": "UncategorizedText",
+ "element_id": "e629fa6598d732768f7c726b4b621285",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1149,11 +1149,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "e"
+ "text": "15"
},
{
"type": "UncategorizedText",
- "element_id": "e629fa6598d732768f7c726b4b621285",
+ "element_id": "dca468ba69cda6650ce03d976c274c66",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1167,7 +1167,7 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "15"
+ "text": "S15"
},
{
"type": "Title",
@@ -1206,8 +1206,8 @@
"text": "r e p s e i t i l"
},
{
- "type": "Title",
- "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
+ "type": "UncategorizedText",
+ "element_id": "4a44dc15364204a80fe80e9039455cc1",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1221,11 +1221,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "a t a F"
+ "text": "10"
},
{
- "type": "UncategorizedText",
- "element_id": "4a44dc15364204a80fe80e9039455cc1",
+ "type": "Title",
+ "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1239,7 +1239,7 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "10"
+ "text": "a t a F"
},
{
"type": "UncategorizedText",
@@ -1261,7 +1261,7 @@
},
{
"type": "UncategorizedText",
- "element_id": "5feceb66ffc86f38d952786c6d696c79",
+ "element_id": "8bf40d0515e8461bd30866c2eb8ac250",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1275,11 +1275,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "0"
+ "text": "4.6"
},
{
"type": "UncategorizedText",
- "element_id": "8bf40d0515e8461bd30866c2eb8ac250",
+ "element_id": "c020bad937ece011339d7447ee0ac9fa",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1293,11 +1293,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "4.6"
+ "text": "2.8"
},
{
- "type": "Title",
- "element_id": "51229f9593cbcb7c8e25059c004d67b0",
+ "type": "UncategorizedText",
+ "element_id": "5feceb66ffc86f38d952786c6d696c79",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1311,11 +1311,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "|| es"
+ "text": "0"
},
{
"type": "Title",
- "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9",
+ "element_id": "51229f9593cbcb7c8e25059c004d67b0",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1329,11 +1329,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "C oal"
+ "text": "|| es"
},
{
"type": "Title",
- "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
+ "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1347,11 +1347,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "Oil"
+ "text": "C oal"
},
{
"type": "Title",
- "element_id": "3a21fb0158c2ea04834163deee74a836",
+ "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1365,11 +1365,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "Bio m ass"
+ "text": "Oil"
},
{
"type": "Title",
- "element_id": "4fabb98454d019811a732c4a09f31bf0",
+ "element_id": "3a21fb0158c2ea04834163deee74a836",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1383,11 +1383,11 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "N atural gas"
+ "text": "Bio m ass"
},
{
- "type": "UncategorizedText",
- "element_id": "c020bad937ece011339d7447ee0ac9fa",
+ "type": "Title",
+ "element_id": "4fabb98454d019811a732c4a09f31bf0",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -1401,7 +1401,7 @@
"filetype": "application/pdf",
"page_number": 5
},
- "text": "2.8"
+ "text": "N atural gas"
},
{
"type": "Title",
@@ -2179,7 +2179,7 @@
},
{
"type": "NarrativeText",
- "element_id": "d85940c91ae6b53fc4b41bd5137e7371",
+ "element_id": "e72fdf383c0b4d8cba0284d4f7ff06d5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2193,11 +2193,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-"
+ "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries"
},
{
- "type": "NarrativeText",
- "element_id": "26a84724035df76d7d8a6610a6fa4627",
+ "type": "Title",
+ "element_id": "5d7f49449ab22deac22d767b89549c55",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2211,11 +2211,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/"
+ "text": "ii"
},
{
- "type": "NarrativeText",
- "element_id": "94178a8c2e84bf4b8f2eed9c79d7cfd5",
+ "type": "Title",
+ "element_id": "f5557d4fcf727a981a3c315aca733eef",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2229,11 +2229,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-"
+ "text": "iii"
},
{
- "type": "NarrativeText",
- "element_id": "794a96b3ab9a3e860f65549c3a106704",
+ "type": "Title",
+ "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2247,11 +2247,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/"
+ "text": "v"
},
{
- "type": "NarrativeText",
- "element_id": "9a236889bced20048d1619798291d194",
+ "type": "Title",
+ "element_id": "c0ff93ea8927a7366db0331e5fd9d19f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2265,7 +2265,7 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a"
+ "text": "vi"
},
{
"type": "NarrativeText",
@@ -2286,8 +2286,8 @@
"text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP."
},
{
- "type": "Title",
- "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface",
+ "type": "NarrativeText",
+ "element_id": "794a96b3ab9a3e860f65549c3a106704",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2301,11 +2301,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "v"
+ "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/"
},
{
- "type": "Title",
- "element_id": "c0ff93ea8927a7366db0331e5fd9d19f",
+ "type": "NarrativeText",
+ "element_id": "94178a8c2e84bf4b8f2eed9c79d7cfd5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2319,7 +2319,7 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "vi"
+ "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-"
},
{
"type": "NarrativeText",
@@ -2340,8 +2340,8 @@
"text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific"
},
{
- "type": "Title",
- "element_id": "f5557d4fcf727a981a3c315aca733eef",
+ "type": "NarrativeText",
+ "element_id": "d85940c91ae6b53fc4b41bd5137e7371",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2355,11 +2355,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "iii"
+ "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-"
},
{
- "type": "Title",
- "element_id": "5d7f49449ab22deac22d767b89549c55",
+ "type": "NarrativeText",
+ "element_id": "9a236889bced20048d1619798291d194",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2373,11 +2373,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "ii"
+ "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a"
},
{
"type": "NarrativeText",
- "element_id": "b6c39a9b3890b5132e4310c83d06b310",
+ "element_id": "26a84724035df76d7d8a6610a6fa4627",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2391,11 +2391,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper."
+ "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/"
},
{
- "type": "NarrativeText",
- "element_id": "c328c06c32c00c43471cd3c9d257c68b",
+ "type": "Title",
+ "element_id": "6e98dee26ce2439cd4b8af82426e894e",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2409,11 +2409,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018"
+ "text": "understanding/statistics"
},
{
- "type": "NarrativeText",
- "element_id": "6bbd046b939157389606adf4059fe1f3",
+ "type": "Title",
+ "element_id": "759772833f6756e511150b2a49233864",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2427,11 +2427,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8"
+ "text": "professional/cancer-statistics/risk"
},
{
- "type": "NarrativeText",
- "element_id": "2ef1e8614bc32af635d2a0c894b2ed3c",
+ "type": "Title",
+ "element_id": "86c0a0cef7faa217f386f75ead17dbec",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2445,11 +2445,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747."
+ "text": "sheets/detail/climate-change-and-health"
},
{
- "type": "NarrativeText",
- "element_id": "d5658e2a49995a2f4ca4b45d95f2058b",
+ "type": "Title",
+ "element_id": "7267222b91f507e040c69dad9af7941f",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2463,11 +2463,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]"
+ "text": "the-full-costs-of-electricity-provision?details=true"
},
{
"type": "NarrativeText",
- "element_id": "e4d7c811a799c3c8e706125556f8a370",
+ "element_id": "2ef1e8614bc32af635d2a0c894b2ed3c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2481,11 +2481,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712"
+ "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747."
},
{
- "type": "Title",
- "element_id": "6e98dee26ce2439cd4b8af82426e894e",
+ "type": "NarrativeText",
+ "element_id": "e4d7c811a799c3c8e706125556f8a370",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2499,11 +2499,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "understanding/statistics"
+ "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712"
},
{
- "type": "Title",
- "element_id": "759772833f6756e511150b2a49233864",
+ "type": "NarrativeText",
+ "element_id": "98e5f594de0e79990a0650489fdf295c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2517,11 +2517,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "professional/cancer-statistics/risk"
+ "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf"
},
{
- "type": "Title",
- "element_id": "7267222b91f507e040c69dad9af7941f",
+ "type": "NarrativeText",
+ "element_id": "d5658e2a49995a2f4ca4b45d95f2058b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2535,11 +2535,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "the-full-costs-of-electricity-provision?details=true"
+ "text": "global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]"
},
{
"type": "NarrativeText",
- "element_id": "e72fdf383c0b4d8cba0284d4f7ff06d5",
+ "element_id": "c328c06c32c00c43471cd3c9d257c68b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2553,11 +2553,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries"
+ "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018"
},
{
- "type": "Title",
- "element_id": "86c0a0cef7faa217f386f75ead17dbec",
+ "type": "NarrativeText",
+ "element_id": "6bbd046b939157389606adf4059fe1f3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2571,11 +2571,11 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "sheets/detail/climate-change-and-health"
+ "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8"
},
{
"type": "NarrativeText",
- "element_id": "98e5f594de0e79990a0650489fdf295c",
+ "element_id": "b6c39a9b3890b5132e4310c83d06b310",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@@ -2589,7 +2589,7 @@
"filetype": "application/pdf",
"page_number": 10
},
- "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf"
+ "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper."
},
{
"type": "UncategorizedText",
diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py
index 0c5382f75d..3607a21cde 100644
--- a/unstructured/partition/utils/sorting.py
+++ b/unstructured/partition/utils/sorting.py
@@ -1,18 +1,62 @@
-from typing import List
+import os
+from typing import List, Tuple
import numpy as np
from unstructured.documents.elements import CoordinatesMetadata, Element
from unstructured.logger import trace_logger
-from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
+from unstructured.partition.utils.constants import (
+ SORT_MODE_BASIC,
+ SORT_MODE_XY_CUT,
+)
from unstructured.partition.utils.xycut import recursive_xy_cut
-def coordinates_to_bbox(coordinates: CoordinatesMetadata) -> List[int]:
+def coordinates_to_bbox(coordinates: CoordinatesMetadata) -> Tuple[int, int, int, int]:
+ """
+ Convert coordinates to a bounding box representation.
+
+ The box is read from two corner points: points[0] supplies (left, top) and points[2]
+ supplies (right, bottom). Each value is truncated to an integer with int().
+
+ Parameters:
+ coordinates (CoordinatesMetadata): Metadata containing points to represent the bounding box.
+
+ Returns:
+ Tuple[int, int, int, int]: A tuple representing the bounding box in the format
+ (left, top, right, bottom).
+ """
+
points = coordinates.points
left, top = points[0]
right, bottom = points[2]
- return [int(left), int(top), int(right), int(bottom)]
+ return int(left), int(top), int(right), int(bottom)
+
+
+def shrink_bbox(bbox: Tuple[int, int, int, int], shrink_factor) -> Tuple[int, int, int, int]:
+ """
+ Shrink a bounding box by a given shrink factor while maintaining its center.
+
+ The width and height are each scaled by shrink_factor, and the removed margin is split
+ evenly between the two opposite sides. The results are truncated to integers with int(),
+ so the center is preserved only up to that truncation.
+
+ Parameters:
+ bbox (Tuple[int, int, int, int]): The original bounding box represented by
+ (left, top, right, bottom).
+ shrink_factor (float): The fraction of the original width and height to keep, e.g. 0.9
+ keeps 90% of each dimension and 1.0 leaves an integer box unchanged. Intended range
+ is 0.0 to 1.0; the value is not validated here.
+
+ Returns:
+ Tuple[int, int, int, int]: The shrunken bounding box represented by
+ (left, top, right, bottom).
+ """
+
+ left, top, right, bottom = bbox
+ width = right - left
+ height = bottom - top
+ new_width = width * shrink_factor
+ new_height = height * shrink_factor
+ # Half of the removed extent is taken off each side so the box stays centered.
+ dw = (width - new_width) / 2
+ dh = (height - new_height) / 2
+
+ new_left = left + dw
+ new_right = right - dw
+ new_top = top + dh
+ new_bottom = bottom - dh
+ return int(new_left), int(new_top), int(new_right), int(new_bottom)
def coord_has_valid_points(coordinates: CoordinatesMetadata) -> bool:
@@ -37,6 +81,7 @@ def coord_has_valid_points(coordinates: CoordinatesMetadata) -> bool:
def sort_page_elements(
page_elements: List[Element],
sort_mode: str = SORT_MODE_XY_CUT,
+ shrink_factor: float = 0.9,
) -> List[Element]:
"""
Sorts a list of page elements based on the specified sorting mode.
@@ -57,6 +102,10 @@ def sort_page_elements(
- List[Element]: A list of sorted page elements.
"""
+ shrink_factor = float(
+ os.environ.get("UNSTRUCTURED_XY_CUT_BBOX_SHRINK_FACTOR", shrink_factor),
+ )
+
if not page_elements:
return []
@@ -82,9 +131,18 @@ def _coords_ok(strict_points: bool):
if sort_mode == SORT_MODE_XY_CUT:
if not _coords_ok(strict_points=True):
return page_elements
- boxes = [coordinates_to_bbox(coords) for coords in coordinates_list]
+ shrunken_bboxes = []
+ for coords in coordinates_list:
+ bbox = coordinates_to_bbox(coords)
+ shrunken_bbox = shrink_bbox(bbox, shrink_factor)
+ shrunken_bboxes.append(shrunken_bbox)
+
res: List[int] = []
- recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res)
+ recursive_xy_cut(
+ np.asarray(shrunken_bboxes).astype(int),
+ np.arange(len(shrunken_bboxes)),
+ res,
+ )
sorted_page_elements = [page_elements[i] for i in res]
elif sort_mode == SORT_MODE_BASIC:
if not _coords_ok(strict_points=False):