From 0a231397209ba9c081112806ecc42a2f6a0dc19d Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Wed, 16 Aug 2023 12:16:35 -0700 Subject: [PATCH] enhancement: implement full-page OCR(#1133) *implements full-page OCR as supported in unstructured-inference=0.5.11. --- CHANGELOG.md | 8 +- requirements/constraints.in | 2 + requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 6 +- .../azure/IRS-form-1987.png.json | 104 ++++++++---------- .../biomed-api/65/11/main.PMC6312790.pdf.json | 30 +++-- .../biomed-api/75/29/main.PMC6312793.pdf.json | 4 +- .../layout-parser-paper.pdf.json | 32 +++--- .../2023-Jan-economic-outlook.pdf.json | 8 +- .../small-pdf-set/Silent-Giant-(1).pdf.json | 2 +- .../recalibrating-risk-report.pdf.json | 10 -- unstructured/__version__.py | 2 +- 12 files changed, 105 insertions(+), 105 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb93a120dc..70f5e57a8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,10 @@ -## 0.10.1-dev0 +## 0.10.1-dev1 + +### Enhancements +* Bump unstructured-inference==0.5.10: + - implement full-page OCR + +### Features ### Fixes * Fix dead links in repository README (Quick Start > Install for local development, and Learn more > Batch Processing) diff --git a/requirements/constraints.in b/requirements/constraints.in index e35fcbfd5a..b6e08dc278 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -25,3 +25,5 @@ Pillow<10.0.0 # NOTE(alan) Pinned to avoid error that occurs with 2.4.3: # AttributeError: 'ResourcePath' object has no attribute 'collection' Office365-REST-Python-Client<2.4.3 +# NOTE(christine) Pinned to set the `unstructured-inference` version +unstructured-inference==0.5.10 \ No newline at end of file diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index c483952e8a..b89ae5a602 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -6,4 +6,4 @@ pdfminer.six # NOTE(robinson) - See this issue here # https://github.com/facebookresearch/detectron2/issues/5010 Pillow<10 -unstructured-inference==0.5.9 +unstructured-inference diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 923b0b366e..e90da66019 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -205,8 +205,10 @@ typing-extensions==4.7.1 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.5.9 - # via -r requirements/extra-pdf-image.in +unstructured-inference==0.5.10 + # via + # -c requirements/constraints.in + # -r requirements/extra-pdf-image.in urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index 67312ec63b..09e9c57225 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -1,17 +1,17 @@ [ { "type": "Title", - "element_id": "0c4e18d78e721c8179f3946b75b17d15", + "element_id": "88591a76b54e47215c0827ae8838ec13", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Instructions for Form 3115 (Rev. November 1987) Annlicatinn far Chance in Accounting Mathond" + "text": "Instructions for Form 3115 (Rev. November 1987)" }, { "type": "NarrativeText", - "element_id": "41f3d9c83b2b4679195c9796134fd8f5", + "element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9", "metadata": { "data_source": {}, "filetype": "image/png", @@ -21,7 +21,7 @@ }, { "type": "ListItem", - "element_id": "97968e4ba14bd2d082a70ec61ef2d9b1", + "element_id": "36a565493a214d3f7e7f24794c1dc7f4", "metadata": { "data_source": {}, "filetype": "image/png", @@ -111,7 +111,7 @@ }, { "type": "ListItem", - "element_id": "f0d2beb7f43493694a91137e8e65b5f3", + "element_id": "59bc2945a7f606bd5078bac3bc1199d4", "metadata": { "data_source": {}, "filetype": "image/png", @@ -121,7 +121,7 @@ }, { "type": "ListItem", - "element_id": "13f2a282f705590fbe7b6ce15b08862a", + "element_id": "5157d731aa6a97c9b166799db2295bce", "metadata": { "data_source": {}, "filetype": "image/png", @@ -141,7 +141,7 @@ }, { "type": "ListItem", - "element_id": "9820f79275e683f5afe3f2f1283de4ca", + "element_id": "34b66452ca63c465c69d849e4acf6d46", "metadata": { "data_source": {}, "filetype": "image/png", @@ -161,7 +161,7 @@ }, { "type": "ListItem", - "element_id": "a98378f4a88db65dff42b7d8bd75be92", + "element_id": "b0fa5aaff0cee8574822dd8ac6537c06", "metadata": { "data_source": {}, "filetype": "image/png", @@ -181,7 +181,7 @@ }, { "type": "ListItem", - "element_id": "3cb57c50002187a715e1c5048e643c65", + "element_id": "13f155c0754434406190f3cf49c82c3c", "metadata": { "data_source": {}, "filetype": "image/png", @@ -201,33 +201,33 @@ }, { "type": "ListItem", - "element_id": "beeb50db70ce1aa76813cce98e46bd56", + "element_id": "178d6933ed193747b1c4aa1c048e7f94", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "for these changes. Tb od Db bee Cl" + "text": "for these changes." }, { "type": "NarrativeText", - "element_id": "640a100da1a3bee6f1f134c51a2c8648", + "element_id": "7685df2334a5f6c8c8099dea61a8f1b4", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed" + "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed." }, { "type": "Title", - "element_id": "a232d246e22a4f6bb8dcab62cffb2567", + "element_id": "61ed58fa51293f429f87e8cf1896c9e4", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Paperwork Reduction Act Notice We ack for thic infarenatinn te marry mye the." + "text": "Paperwork Reduction Act Notice" }, { "type": "Title", @@ -241,37 +241,27 @@ }, { "type": "ListItem", - "element_id": "58f1649a32eda8b8c513e51a209666a6", + "element_id": "5f8051f8010896bab02aaf784c04ae02", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Signature Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page" - }, - { - "type": "ListItem", - "element_id": "586e989b479e4362ebe28a6954c1427b", - "metadata": { - "data_source": {}, - "filetype": "image/png", - "page_number": 1 - }, - "text": "If the individual or firm is also authorized to" + "text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page" }, { "type": "NarrativeText", - "element_id": "446ccb7d96fea659d50aef8a6dd670df", + "element_id": "4660422c06dddc914ab634c5e4045dec", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the right amount of tax. You are required to give us this information," + "text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the nght amount of tax. You are required to give us this information." }, { "type": "Title", - "element_id": "226fa83297914d5195e002508d61fb1d", + "element_id": "a1547a4ed1611eee44b15e99120fb978", "metadata": { "data_source": {}, "filetype": "image/png", @@ -281,77 +271,77 @@ }, { "type": "Title", - "element_id": "f0e951e5bcb4a6070fa6672b37822348", + "element_id": "68a3289177b49b285e133a5267eb355f", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Purpose of Form Cin bce Secon te cece cget." + "text": "Purpose of Form" }, { "type": "NarrativeText", - "element_id": "5e5451e052baf894b2bdad4132f6cd2f", + "element_id": "f9b8e17da7a31507773f78959378e09c", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "ee File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods," + "text": "File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods," }, { "type": "NarrativeText", - "element_id": "cc1701e3ce9347e344b3df80d426bd21", + "element_id": "b3859f2f29884b1d3ba0892e52859a99", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Seti aes When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)" + "text": "When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)" }, { "type": "NarrativeText", - "element_id": "b81dc18d0f8666f9bf7400a00657dc72", + "element_id": "e5a95dc10d4071983b70898a21f11175", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "POMS SANE OPFOR DA 29). Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired. You must give alll relevant facts, including a" + "text": "Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired." }, { "type": "Title", - "element_id": "c7502aa5b000d6446f3eca882518a260", + "element_id": "5756fb398995bb6518a87637f24f426e", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Time and Place for Filing amarall, ammlimeete maet file snete" + "text": "Time and Place for Filing" }, { "type": "NarrativeText", - "element_id": "8b35e7c212710b1099b675ce9394fb47", + "element_id": "25f830e7c39c115c9937eb9d11cfb1f2", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Se NB ON State whether you desire a conference in the National Office if the Service proposes to disapprove your application." + "text": "State whether you desire a conference in the National Office if the Service proposes to disapprove your application" }, { "type": "Title", - "element_id": "0a16a0fea889be77576c0fd88575554a", + "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Affiliated Groups Tavmayare that ara mam)" + "text": "Affiliated Groups" }, { "type": "Title", - "element_id": "68b58298cabd9069c975b192a7183139", + "element_id": "242a9dba10a04654d4adef9c58ff96f6", "metadata": { "data_source": {}, "filetype": "image/png", @@ -361,62 +351,62 @@ }, { "type": "Title", - "element_id": "6a8881a6e87021b2362243f7df3e4b1d", + "element_id": "11c98a9cbd6a200fbc5b93fed15007ac", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Uniform capitalization rules and limitation on cash method.—If you are required to char" + "text": "Uniform capitalization rules and limitation on" }, { "type": "Title", - "element_id": "8daeb8b48fb666f1dd54e2af283d0c22", + "element_id": "58703de56debc34a1d68e6ed6f8fd067", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Specific Instructions Section A Neem Ea mama 1 !Taeahle inemes" + "text": "Specific Instructions Section A" }, { "type": "Title", - "element_id": "09203a0c6955f64ca8eb52cd6ea47034", + "element_id": "a4316c02df07840f1beb56609cb09735", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Late Applications Me coup armlimatinm te ler" + "text": "Late Applications" }, { "type": "NarrativeText", - "element_id": "962e3f0ceb1f0b1b08a1c19adde8d962", + "element_id": "39458f370b98a606db29ac6dee975e07", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "lethal elaine bela Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and the basis for that conclusion. Identify the" + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and" }, { "type": "Title", - "element_id": "bfe98eb672d95c15a11ed3e618928b4e", + "element_id": "025a65465b6fd9635316e92633b24c7e", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Identifying Number Ndiuidesale Am omptisoehesal" + "text": "Identifying Number" }, { "type": "NarrativeText", - "element_id": "87f8128b03a72c616ee1a1bb91e11c56", + "element_id": "9240bfa889b87dc2fb3fa746ca4eeeb4", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "—e—e—— eee Others.-—The employer identification number of an applicant other than an individual should be entered in this block," + "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block," } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 778054dbf6..501fecab5d 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -1111,13 +1111,13 @@ }, { "type": "FigureCaption", - "element_id": "b5ee6af3d776b0bbd2e581a3ab2ab2e1", + "element_id": "27b45633a0f31b9e01d179d70d7dc282", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Potential (Vv)nm°in°}aryT T T0.00001 0.001 olCurrent Density (A/cm2)" + "text": "5 1 os = — 10; =o ° © —\" 205 i —~é é —ip a5 — Control -2 — & 2.5 T T T 0.0000001 + —-0.00001 0.001 O14 Current Density (A/cm2)" }, { "type": "UncategorizedText", @@ -1141,13 +1141,13 @@ }, { "type": "Table", - "element_id": "e2ed41967a486766ad6a122cc3aba4d5", + "element_id": "9270ab0a1b3ba26a16991abcd0b45dfe", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Inhibitorconcentration (g) bc (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm2) Polarizationresistance (Ω) Corrosionrate (mm/year) 0246810 0.03351.94600.01630.32330.12400.0382 0.04090.05960.23690.05400.05560.0086 (cid:3) 0.9393(cid:3) 0.8276(cid:3) 0.8825(cid:3) 0.8027(cid:3) 0.5896(cid:3) 0.5356 0.00030.00020.00015.39E-055.46E-051.24E-05 24.0910121.44042.121373.180305.650246.080 2.81631.50540.94760.43180.37720.0919" + "text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919" }, { "type": "UncategorizedText", @@ -1471,13 +1471,13 @@ }, { "type": "FigureCaption", - "element_id": "6959a323ee23c858c3b1411b05db6ebf", + "element_id": "273fb301b173075f79b2cbdab962e2ff", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "SEM HV: Q0KY WD: 14.89 rmrm‘DEM MAO: 209 x Det: DOE Pecforsence In nenospact" + "text": "SEM HV: Q0KY WD: 14.89 rmrm ‘9EM MAO: 209 x Det: DOE Pectomsence In nanospact" }, { "type": "NarrativeText", @@ -1491,13 +1491,13 @@ }, { "type": "FigureCaption", - "element_id": "a0463ca888a6f2c8c3ba40ba47be0f2f", + "element_id": "d04d110c16a4ebc184fa130f09b8d423", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "gEOOwaeSemny. z00RV | WD: 1424 renn rtirint VEoa3 Tescan20 yin Fertormaros in nancepace|" + "text": "Sem ny. 200 Rv" }, { "type": "NarrativeText", @@ -1511,13 +1511,13 @@ }, { "type": "FigureCaption", - "element_id": "a9bc28448ebad437288bf5538fb09482", + "element_id": "520d1da08c86ce165cd2843e2dc27f98", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "SEM HY: 20.0KVBEM IAAG: 400 x 5" + "text": "SEMHV: 20.0KV WD: 15.54 mm EM ING: ACO x Dei: OSE" }, { "type": "NarrativeText", @@ -1579,6 +1579,16 @@ }, "text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9." }, + { + "type": "FigureCaption", + "element_id": "060e14f01e484ba252e902cd5c6f94f9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "ou H,;COCHNY OH" + }, { "type": "UncategorizedText", "element_id": "c07eeb615f8b0f2d544348b7f0655301", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index c098282d8d..3b2359c6f4 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -791,13 +791,13 @@ }, { "type": "Table", - "element_id": "be8fbf813482eec7fd0e2fc665b4d3bb", + "element_id": "1d8fd023cd0978f7a6500815d2ad0ef6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Instance size (m, n) Average number of (8, 1500)(8, 2000)(8, 2500)(8, 3000)(12, 1500)(12, 2000)(12, 2500)(12, 3000)(16, 1500)(16, 2000)(16, 2500)(16, 3000) Locations Times Vehicles Possible empty travels 568.40672.80923.40977.00566.00732.60875.001119.60581.80778.00879.001087.20 975.201048.001078.001113.20994.001040.601081.001107.40985.401040.601083.201101.60 652.20857.201082.401272.80642.00861.201096.001286.20667.80872.401076.401284.60 668,279.401,195,844.801,866,175.202,705,617.00674,191.001,199,659.801,878,745.202,711,180.40673,585.801,200,560.801,879,387.002,684,983.60" + "text": "Instance size (m, n) Average number of Locations Times Vehicles Possible empty travels (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 ) (16, 3000 1087.20 1101.60 1284.60 2,684,983.60" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index e7160d5851..be9576d0fb 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -591,13 +591,13 @@ }, { "type": "FigureCaption", - "element_id": "00401461c83b8b07511a4864781d8f8d", + "element_id": "812dcaaec927a84d57af36e20adb5ded", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Model Customization Community PlatformEfficient Data Annotation ¥y DIA Model HubCustomized Model Training] === | Layout Detection Models | ===OCR Module =— | Layout Data Structure | ==The Core LayoutParser LibraryDIA Pipeline SharingStorage & Visualizationi" + "text": "Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY" }, { "type": "NarrativeText", @@ -681,14 +681,14 @@ }, { "type": "Table", - "element_id": "71e289a268220c21575bb55a73980b83", + "element_id": "34923b77ca76e1808956ade5e766f7c2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet[38] F/MLayouts of modern scientific documents
PRImA [3]MLayouts of scanned modern magazines and scientific reports
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDataset [31]F/MLayouts of history Japanese documents
" }, - "text": "Dataset Base Model1 Large Model Notes PubLayNet [38]PRImA [3]Newspaper [17]TableBank [18]HJDataset [31] F / MMFFF / M M--F- Layouts of modern scientific documentsLayouts of scanned modern magazines and scientific reportsLayouts of scanned US newspapers from the 20th centuryTable region on modern scientific and business documentLayouts of history Japanese documents" + "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents" }, { "type": "UncategorizedText", @@ -852,13 +852,13 @@ }, { "type": "FigureCaption", - "element_id": "2f498bdd91739a7083490999507420a5", + "element_id": "185e67615d123b35d38ea72e0cdb6d99", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "33§3 fectange vada8883 Coordinate83 +*Block | [Block | [Read8 Extra features Tet | [Tye | [oder[ coordinatel textblock1 |» , see383 , textblock2 , layout] ]4A list of the layout elementsThe same transformation and operation APIs" + "text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff" }, { "type": "NarrativeText", @@ -1072,14 +1072,14 @@ }, { "type": "Table", - "element_id": "548c38f86edc295baf869abe37a0d1cf", + "element_id": "f81d4915b54758e0d4d52af3566bb813", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "text_as_html": "
Operation Name|Description
block.pad(top, bottom,right,left) |Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
blocki.is_in(block2)|Whether block] is inside of block2
blocki.intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs.
block1i.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs.
blocki.relative_to(block2)Convert the absolute coordinates of block] to relative coordinates to block2
blocki.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
" }, - "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) block.shift(dx, dy) Scale the current block given the ratioin x and y direction Move the current block with the shiftdistances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) block1.union(block2) block1.relative to(block2) block1.condition on(block2) Return the intersection region of block1 and block2.Coordinate type to be determined based on the inputs. Return the union region of block1 and block2.Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block1 torelative coordinates to block2 Calculate the absolute coordinates of block1 giventhe canvas block2’s absolute coordinates Obtain the image segments in the block region" + "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. crop_image (image) Obtain the image segments in the block region" }, { "type": "UncategorizedText", @@ -1343,13 +1343,13 @@ }, { "type": "FigureCaption", - "element_id": "6df6057f894a166cf24fd34f64267f09", + "element_id": "975d6cb141cb0a0313375630ae063fa8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": "a ESStee eaeoooMode I: Showing Layout on the Original ImageMode Il: Drawing OCR'd Text at the Correspoding Position10g Bpunog vayoy feyds1q :1 vondo‘xog Burpunog vay apiH z word" + "text": "x09 Burpunog uayor Aeydsiq 1 vondo 10g Guypunog usyoy apir:z uondo Mode I: Showing Layout on the Original Image Mode Il: Drawing OCR'd Text at the Correspoding Position" }, { "type": "NarrativeText", @@ -1413,13 +1413,13 @@ }, { "type": "FigureCaption", - "element_id": "42aa5660e30073a0282c086fe4f29fce", + "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "Column reading orderMaximum Allowed HeightZ. Shen et al.Intra-column reading order(b) Illustration of the recreated document with dense text structure for better OCR performance‘Token CategoriesMoteAddresstetNumberVaribiecompany typeColumn Categories(J tite| Aatress(tee[7] section adr" + "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance" }, { "type": "NarrativeText", @@ -1533,13 +1533,13 @@ }, { "type": "FigureCaption", - "element_id": "55f2474c66877608ca9b463a7076573e", + "element_id": "b33b2bc3b9c416673c7f74c6a00c49d8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "(spe peepee,Active Learning Layout=Annotate Layout Dataset parte4zi Deep Learning LayoutLayout Detection Model Training & Inference,Post-processin Handy Data Structures &pl 9 APIs for Layout DataText Recognition Default and Customized: r OCR Models4Visualization & Export | <——Layout StructureVisualization & StorageThe Japanese Document Helpful LayoutParserDigitization Pipeline Modules" + "text": "(spe peepee, ‘Active Learning Layout Annotate Layout Dataset | + ‘Annotation Toolkit ¥ a Deep Leaming Layout Model Training & Inference, ¥ ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <—— Default ane Customized ¥ ee Layout Structure Visualization & Export | <—— | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules" }, { "type": "UncategorizedText", @@ -1713,13 +1713,13 @@ }, { "type": "FigureCaption", - "element_id": "f58d47bde7ebddd81c4a678c918a8f1b", + "element_id": "7d42bb6af1404a95a6e8870d5c4d07bf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 13 }, - "text": "(2) Partial table atthe bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line" + "text": "(@) Partial table at the bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index 9590479404..219f1ef539 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -751,13 +751,13 @@ }, { "type": "Table", - "element_id": "8dec233e9bc75c7256a28a899794709b", + "element_id": "63bdc79def2500227001ac95d78727ab", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Estimate2022 Projections 2023 2024 2021 WEO Projections 1/ 2023 2024 Estimate2022 Projections 2023 2024 Difference from October 2022 Q4 over Q4 2/ World Output Advanced Economies United States Euro Area Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ Emerging Market and Developing Economies Emerging and Developing Asia China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries World Trade Volume (goods and services) 6/ Advanced Economies Emerging Market and Developing Economies Commodity Prices Oil 7/ Nonfuel (average based on world commodity import weights) World Consumer Prices 8/ Advanced Economies 9/ Emerging Market and Developing Economies 8/ 6.2 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 6.0 5.5 3.8 4.1 7.0 4.1 10.4 9.4 12.1 65.8 26.4 4.7 3.1 5.9 3.4 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 3.1 3.7 5.2 5.4 3.8 4.9 5.4 6.6 3.4 39.8 7.0 8.8 7.3 9.9 2.9 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 2.4 0.7 4.3 3.2 4.0 4.9 2.4 2.3 2.6 3.1 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 2.5 1.8 4.7 3.5 4.1 5.6 3.4 2.7 4.6 –16.2 –6.3 6.6 4.6 8.1 –7.1 –0.4 4.3 2.6 5.5 0.2 0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1 0.3 0.0 –0.2 –0.4 0.4 0.0 –0.1 0.0 –0.3 –3.3 –0.1 0.1 0.2 0.0 –0.1 –0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2 –0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0 –0.1 –0.3 –0.2 0.2 0.0 0.1 –0.3 –0.4 0.0 –0.9 0.3 0.2 0.2 0.2 1.9 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 1.7 1.8 3.7 . . . 2.5 . . . . . . . . . . . . 11.2 –2.0 9.2 7.8 10.4 3.2 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 2.5 1.2 5.7 . . . 5.0 . . . . . . . . . . . . 3.0 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 2.5 2.0 4.0 . . . 4.1 . . . . . . . . . . . . –9.8 1.4 5.0 3.1 6.6 –5.9 –0.2 3.5 2.3 4.5 Q4 over Q4 2/ eee na RA Estimate_ le Projections Estimate_ le Projections Cetimatel Oo [re ee~T Se 2022 — 2022 — anna es battee eesaanShaeaan eeeanne anan aaa ——eeaaa World Output 6.2 0.2 19 eveee eve nmeAdvanced EconomiesTinta Chetan 54LO 27On 14an 04nA baal1.3nz 1.649 eee meeeUnited StatesCo Ae STN taeEuro Area‘Rareram) Vee5.3 vir0.2 ENGermanyCoen ww26no 2 « Ve04An eianyFranceleah, ov68 MAE07 vir0.0 ree0.5 vv0.9 ve67ce ev3.9— ve08na vwna vee24a4 vee0.14a NaySpain vee24no aNN+ “penJapanVithed Vinmdaee, we24— wwe04a voro!United KingdomPanada et76 Vw eeCanadaAbn Ndime nnd Camnaeeinn O) vee15nA vwnn vr23“aA INOSOther Advanced Economies 3/ wu5.3 ww28 Ld20 idl24 vw0.3 ow14 te24 Lindl22 MEE REEmerging Market and Developing Economiesnee a ree nee25 }et5.0 “eae a eee eeg and Developing Asia Me74 “~~04 ail3.4 EITC TSHTS GID MIC VOIOP HY PwictChinabese at vie45no Vt08An vt29aD ineIndia 4/ oT87 liad68 ve0.0 og43 Emerging and Developing EuropeDa. 15na 264 SET IMS MeN Pig SMRussiaLatin Amarina and the Carihhaan ve47mn al26na 44on 1.040 ENONLatin America and the CaribbeanDeasil iN oN04no nA OLEAN GM NENS aieBrazilMavien is ov28a7 eenMexicoRoeldia Cast end antral Asia ue0.5na eu3.7 oe19 bichinanbaiastelMiddle East and Central AsiaCad; Apahia 45ao vat-04aa ) veeAg 7 ac ee ON EN EENSaudi ArabiaCok Gahacen Af aia3.2az uw87— Ve26— ue3.4Aa Ne14na 46 27 35 ve eeSub-Saharan AfricaAlinaria ve47 on ofa4 0 ENNNigeriaOak AEinn ve3.0ne INIQe TeSouth Africa vw49 wu26 ve12 dl1.3 Ve041 vw0.0 ov3.0 vet0.5 og18 STMemorandumUae Pereath Beene an Marat Conhanns Dates BEES GEESIAISTWorld Growth Based on Market Exchange Ratesco -041 TN EE BO VN ene ee eeEuropean UnionACCA EE! vv5.5 vl37fo Lal0.0 aurea eatASEAN-5 5/Reebde Cred ered Rbesie Adslan ond38aa ve0.2nA dad0.2— ov4.0 rawMiddle East and North Africa ve44 del-04 vee vee NS EO EN EENEmerging Market and Middle-Income Economies 25 5.0 44 ee eee, Oe eee eeeLow-Income Developing Countries 44 awe nan eee eeeWorld Trade Volume (goods and services) 6/Aduannad Enanamineneo Teh10.4oO” Te2400 uM0.1ann ON ES NOES OE DESO NEAdvanced Economieser mee9.4“ana “rt23ne Mee0.0— iawn eensEmerging Market and Developing Economies oT12.1 vw3.4 ow26 of46 vw0.3 ~~ BESTT MIGINGL ait event eSCommodity Prices=) EIN!Oil 7/be® an! fesempnses became! nun taniied ananenelieeeS 65.8me A 39.8—- -16.2— 11.2an MirNonfuel (average based on world commodity import weights) V026.4 ve.0 I-0.4 Te-0.1 we Tew14 DMI! \\arvliagy Veo nN Ay aeWorld Consumer Prices 8/Aehiannedd Kamnemnian fii ae47nA Se MESES EIS OFAdvanced Economies 9/Emarninn Marbat and Navalaninn Erancs ee34 Lda0.2 02nd oe78ana ve2.3 rE RS OTEmerging Market and Developing Economies 8/ ) 10.4 fame i i al oe vt =: Lone aNate’ Ranl affective auchanna mtoe ara aeaamnad tn ramain oanatant atthe laugle nraunlinn dinna Oeinhar dA D00_Nevemherd] O00) Fenny nine noe ct Tho an _\"iT iartearhy" + "text": "Difference from October 2022 Q4 over Q4 2/ Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices Oil 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2 World Consumer Prices 8/ 47 88 6.6 43 04 0.2 9.2 5.0 3.5 Advanced Economies 9/ 34 73 46 26 0.2 02 78 31 23 Emerging Market and Developing Economies 8/ 5.9 99 84 5.5 0.0 02 10.4 66 45," }, { "type": "Image", @@ -2051,13 +2051,13 @@ }, { "type": "NarrativeText", - "element_id": "50b5d76376e9904469f2e97d882e4923", + "element_id": "ba23de0762dea86fd9cd418884203f6c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024. ee ee ee ee eee ee ee ee en VEY MUSUE £0, CULE NSTI £0), LLL. EVIMSS nD ee A le ee A ee. eedata are seasonally adjusted. WEO = World Economic Outlook.4) Niffarance haced an minded ficuree far the current and Ortnher 21199 WEO fnreracte Crntriee whnee frracacte have heen rndated relative ta Octnher 9099 WEO fnreracte areniint far annmyimately @N nerrent af vn nS ee a Ee1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent ofward CDP meacired at nirchacinn.nqwer.narity weidhte aon rn eer cnn nn eee ene nooner ee ce nT NS TES RSS TUNE UGSTS TIAN DG ET BPMN PEIGEINIE BO MIRE EU EL EM TUNE UISES GIULUUNE TON GPPIU ANTES YY a Pere orworld GDP measured at purchasing-power-parity weights.Of Ear Ward Outnit (Emerinn Market and Nevelanina Eennamiec) the niarterly ectimates and nmiertinne arenunt far annmnyimately 0 nerrent (21) nerrent\\ af annital ward (emerning market and develanina ane so ee ee ee Se21 For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developinga a i TOP WOnd VUIDUT Lemerging Warket and Meveloping CCOnOMmIssS), ING QUanEehy SSumTiales ANG POSCUONS SCCOUNT ION SPPrOAxIMmalely U7 PErlent (OV Percent) OF ANnWal WORD LeMerging Markel ana Ceveloping:economies’) output at purchasing-power-parity weights.Qi Eychidee the Crain af Gaven (Canada Eranre Germany Italy lanan IInited Kinadam |Inited Statee) and enn area enuntries GCONOIISS 7 OUTIUE Gt PUNAIBSHE POWSr pany WOR IS.3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries.Al Fee leviia dais and nenjartinne era rraeerdad nao feral usar hasic wdth EN ON09/0 fetartinn in Anrll 9099) ahrasn i tha 9099 enkenn Indisla arrenth nederiane ara K A nerant in 902 and @ A nerrant in O04 hassel areata Nae kt a i ca ae cd ea4/For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based‘an calendar vear on eee rece c eres oie eee ee seen ence em een ne rene eae re ene enc Ne Cre ON TNS QUOT PERTERBU ITS GES Oe POTION TY NES GRE PONGEM £0 es Vaseon calendar year.By indaneaia Adalausia Dhiinnines Ginnannra Thajland UNL Catena Year.51 Indonesia, Malaysia, Philippines, Singapore, Thailand.2) Cennla auarana nfrrrmsth ratae fw ecrnrt and imnnrtunhimas (arerte and eerviras) SONGS, MAA YS, PMP EGS, OINGEPONS, Tana.6/ Simple average of growth rates for export and import volumes (goods and services).7) Simnie averane of nricec afl IK Rrent Nithai Fateh and Wect Tevac Intermediate cnide nil The averane aceiimed nrre af nil in | 1S doilare a harre! haced nn firttiree markete (ac nf Navemher 20 2199) jc £21 12 in ee ee eS Mn OA en) Ree \\yue a) Oy71 Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in9092 and @7% 2B in 9004. Eee ee ae eee ee ee eee ee ee ee ne ee ee eee Rr en a ee TE IS aS UE 20, LE,2023 and $75.36 in 2024.8) Eychidee Venesiala ENED A BED MN ee8/ Excludes Venezuela.Q/ The inflatinn rate far the enn area je & 7%, in 992 and 2 2% in 9094 that far Janan ie 2 2% in 9092 and 2N% in INDd and that far the | Inited States ie 4 (1% in 9092 and 2 9% in ONDA. eae Soe91 The infiation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024. fame i i al oe vt =: Lone aNate’ Ranl affective auchanna mtoe ara aeaamnad tn ramain oanatant atthe laugle nraunlinn dinna Oeinhar dA D00_Nevemherd] O00) Fenny nine noe ct Tho an _\"iT iartearhy" + "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 20: data are seasonally adjusted. WEO = World Economic Outlook. 1 Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 21 For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies’) output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 51 Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/'Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela 91 The inflation rate for the euro area is 6.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024. November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json index 0b38f8363e..4bf2b7f27d 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -1701,7 +1701,7 @@ }, { "type": "FigureCaption", - "element_id": "36ca9b7cdbbcba729a46487cf86c07eb", + "element_id": "eeda9f9210dfe4be7e82b4385290d3ca", "metadata": { "data_source": {}, "filetype": "application/pdf", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json index 5cef685c27..98204b90f0 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -1119,16 +1119,6 @@ }, "text": "6" }, - { - "type": "FigureCaption", - "element_id": "f58b520072d30c4805940f5c99a306c3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "an ¥3 te,ay." - }, { "type": "NarrativeText", "element_id": "d754d8d468346f652657279272a11897", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index cc242ee053..35e13d808f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.1-dev0" # pragma: no cover +__version__ = "0.10.1-dev1" # pragma: no cover