From 65344117b178b35a3e766d7ae13196cd791b2ddb Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 31 Aug 2023 21:27:48 -0700 Subject: [PATCH] enhancement: entire page OCR output included with hi_res (#1263) Bumps unstructured-inference==0.5.19 to bring in @christinestraub's enhancement https://github.com/Unstructured-IO/unstructured-inference/pull/186 . This is a **massive** improvement where previously omitted text was not included in `hi_res` output if the layout model had not put a bounding box around it. In addition, the xycut sorting algorithm generally does a good job of ordering the merged OCR-text-not-in-layout-model bboxes with layout-model bboxes into "natural reading order." More details in https://github.com/Unstructured-IO/unstructured-inference/pull/186#issuecomment-1700438645 . Bonus: changelog fix. --- CHANGELOG.md | 8 +- requirements/constraints.in | 2 +- requirements/extra-pdf-image.txt | 2 +- requirements/test.txt | 2 +- ...iomedical-Data-Scientists-2-pages.pdf.json | 704 +++++++++++++++++- .../azure/IRS-form-1987.pdf.json | 400 ++++++++-- .../azure/IRS-form-1987.png.json | 176 ++++- .../biomed-api/65/11/main.PMC6312790.pdf.json | 290 +++++--- .../biomed-api/75/29/main.PMC6312793.pdf.json | 120 ++- .../layout-parser-paper.pdf.json | 220 +++--- .../2023-Jan-economic-outlook.pdf.json | 610 +++++++++++---- .../small-pdf-set/Silent-Giant-(1).pdf.json | 468 +++++++++--- .../recalibrating-risk-report.pdf.json | 278 ++++--- unstructured/__version__.py | 2 +- 14 files changed, 2618 insertions(+), 664 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 744245a6bc..e2798eed35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,18 @@ -## 0.10.11-dev0 +## 0.10.11 ### Enhancements +* Bump unstructured-inference + * Combine entire-page OCR output with layout-detected elements, to ensure full coverage of the page (0.5.19) + ### Features * Add in ingest cli s3 writer ### Fixes +* Fix a bug where `xy-cut` sorting attemps to sort elements without valid coordinates; now xy cut sorting only works when **all** elements have valid coordinates + ## 0.10.10 ### Enhancements @@ -44,7 +49,6 @@ * Edit `add_pytesseract_bbox_to_elements`'s (`ocr_only` strategy) `metadata.coordinates.points` return type to `Tuple` for consistency. * Re-enable test-ingest-confluence-diff for ingest tests * Fix syntax for ingest test check number of files -* Fix a bug where `xy-cut` sorting attemps to sort elements without valid coordinates; now xy cut sorting only works when **all** elements have valid coordinates ## 0.10.8 diff --git a/requirements/constraints.in b/requirements/constraints.in index 08fe6e3f2a..f0eb217c63 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -26,7 +26,7 @@ Pillow<10.0.0 # AttributeError: 'ResourcePath' object has no attribute 'collection' Office365-REST-Python-Client<2.4.3 # NOTE(christine) Pinned to set the `unstructured-inference` version -unstructured-inference==0.5.18 +unstructured-inference==0.5.19 # NOTE(klaijan) - Moved pin from test.in # pinning to avoid error in argilla library pydantic<2 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index a9e6b4a44c..e53a5de473 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -209,7 +209,7 @@ typing-extensions==4.7.1 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.5.18 +unstructured-inference==0.5.19 # via # -c requirements/constraints.in # -r requirements/extra-pdf-image.in diff --git a/requirements/test.txt b/requirements/test.txt index d427fa40da..fe30df8ffe 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -41,7 +41,7 @@ idna==3.4 # yarl iniconfig==2.0.0 # via pytest -label-studio-sdk==0.0.30 +label-studio-sdk==0.0.32 # via -r requirements/test.in label-studio-tools==0.0.3 # via label-studio-sdk diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 4af465f1a8..120b2dee99 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -1,4 +1,34 @@ [ + { + "type": "UncategorizedText", + "element_id": "cf66bb0e9e68e3a82a99b5621e4394f8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Core Skills for Biomedical Data" + }, + { + "type": "UncategorizedText", + "element_id": "733383a5f0f5bdea71d6d48805365e6f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Scientists" + }, + { + "type": "UncategorizedText", + "element_id": "64b2134f054446d473fce1b05d4d4c94", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow" + }, { "type": "UncategorizedText", "element_id": "a81f2feee790a4c2cf749889073d947a", @@ -9,6 +39,96 @@ }, "text": "Lisa Federer, MLIS, Data Science Training Coordinator" }, + { + "type": "UncategorizedText", + "element_id": "f089eaef57aba315bc0e1455985c0c8e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Michael" + }, + { + "type": "UncategorizedText", + "element_id": "fd0a559e715a134218c73276dc57d463", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "F." + }, + { + "type": "UncategorizedText", + "element_id": "44be44eccd482217c097571ddfa61f49", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Huerta," + }, + { + "type": "UncategorizedText", + "element_id": "394df19f0626f36d12da449624b691f9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "PhD, Associate" + }, + { + "type": "UncategorizedText", + "element_id": "4f5a6389c571b0d01690b1db0349c1b4", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Director of" + }, + { + "type": "UncategorizedText", + "element_id": "aecfc6e5b6c0de37a2c06c2fb1d71c82", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "NLM" + }, + { + "type": "UncategorizedText", + "element_id": "237622d8c80fbdbe790b92d500aa7b00", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "for Program Development and" + }, + { + "type": "UncategorizedText", + "element_id": "aecfc6e5b6c0de37a2c06c2fb1d71c82", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "NLM" + }, + { + "type": "UncategorizedText", + "element_id": "ba490653e1ad81f341c35ae470c1b825", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Coordinator of Data Science and Open Science Initiatives" + }, { "type": "UncategorizedText", "element_id": "c8e51fdc53c202393adad77f7f93ee5a", @@ -19,6 +139,156 @@ }, "text": "Executive Summary" }, + { + "type": "UncategorizedText", + "element_id": "2364a6d2f9a3858d51d91b817732e6c9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "This report provides recommendations for a scientists based on analysis that draws on opinions of data scientists, curricula for existing science requirements science jobs." + }, + { + "type": "UncategorizedText", + "element_id": "6712d87f1d156abf6171f700e2875889", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "biomedical" + }, + { + "type": "UncategorizedText", + "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "data" + }, + { + "type": "UncategorizedText", + "element_id": "50e891aa619a7ccbeab043789ca5dd1a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "programs," + }, + { + "type": "UncategorizedText", + "element_id": "6201111b83a0cb5b0922cb37cc442b9a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "and" + }, + { + "type": "UncategorizedText", + "element_id": "a703788f832056626d71b7db4d805524", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "minimal" + }, + { + "type": "UncategorizedText", + "element_id": "6ee0eb490ff832101cf82a3d387c35f2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "set" + }, + { + "type": "UncategorizedText", + "element_id": "10c22bcf4c768b515be4e94bcafc71bf", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "for" + }, + { + "type": "UncategorizedText", + "element_id": "28391d3bc64ec15cbb090426b04aa6b7", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "of" + }, + { + "type": "UncategorizedText", + "element_id": "6712d87f1d156abf6171f700e2875889", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "biomedical" + }, + { + "type": "UncategorizedText", + "element_id": "0d45f5fd462b8c70bffb10021ac1bcff", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "core" + }, + { + "type": "UncategorizedText", + "element_id": "50c5080f67ea1f9eff473e46e6314fd2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "skills for biomedical" + }, + { + "type": "UncategorizedText", + "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "data" + }, + { + "type": "UncategorizedText", + "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "data" + }, + { + "type": "UncategorizedText", + "element_id": "18f107bf25f694db07b6aba0a5aaf321", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Suggested high-level core skills include:" + }, { "type": "UncategorizedText", "element_id": "04ff84b51fab69c07381ac794b740243", @@ -99,6 +369,16 @@ }, "text": "Motivation" }, + { + "type": "UncategorizedText", + "element_id": "3d8fbacaba9067faef48850d43801268", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2k) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with" + }, { "type": "UncategorizedText", "element_id": "9fc51802fc970310e99a77b9f29af9ab", @@ -111,13 +391,433 @@ }, { "type": "UncategorizedText", - "element_id": "2e3cec7bff1e8c8d8e0087f0bcfa89f0", + "element_id": "326e7d081e9418423ea62bf3802caaa3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use." + "text": "this commitment, recent report to the NLM Director recommended working across identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce." + }, + { + "type": "UncategorizedText", + "element_id": "ca978112ca1bbdcafac231b39a23dc4d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "a" + }, + { + "type": "UncategorizedText", + "element_id": "acc8586a874eb74f10c3f90620f20617", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "NIH to" + }, + { + "type": "UncategorizedText", + "element_id": "f26d07e6b71e42596791a241e2417931", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Methodology" + }, + { + "type": "UncategorizedText", + "element_id": "b344d80e24a3679999fa964450b34bc2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "The" + }, + { + "type": "UncategorizedText", + "element_id": "cdc3773cb12cf99d302b9f00c48ae1e8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "required of" + }, + { + "type": "UncategorizedText", + "element_id": "aa3b88196a6407c3866c85acdcc8c981", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Workforce" + }, + { + "type": "UncategorizedText", + "element_id": "b72b62f1295c66f199256c1190177ce6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "three-pronged approach biomedical data scientist (BDS), drawing from:" + }, + { + "type": "UncategorizedText", + "element_id": "3d366201f5b88bcbfafb078aee5f2a55", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Excellence" + }, + { + "type": "UncategorizedText", + "element_id": "ca8b22d0db83a22db163b560b3e4e515", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "team" + }, + { + "type": "UncategorizedText", + "element_id": "e0a6230e370d20dece7ca96c77611cb0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "took" + }, + { + "type": "UncategorizedText", + "element_id": "ca978112ca1bbdcafac231b39a23dc4d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "a" + }, + { + "type": "UncategorizedText", + "element_id": "663ea1bfffe5038f3f0cf667f14c4257", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "to" + }, + { + "type": "UncategorizedText", + "element_id": "a5bed2020bd1f4ea3eca933398c4f0d0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "identifying" + }, + { + "type": "UncategorizedText", + "element_id": "0d45f5fd462b8c70bffb10021ac1bcff", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "core" + }, + { + "type": "UncategorizedText", + "element_id": "32c1cf49a2feee269ed74dd860f72644", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "skills" + }, + { + "type": "UncategorizedText", + "element_id": "2e3cec7bff1e8c8d8e0087f0bcfa89f0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use." + }, + { + "type": "UncategorizedText", + "element_id": "301d35f1042e1eac9fdef8839fd13a4e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "b)" + }, + { + "type": "UncategorizedText", + "element_id": "1117af46b0a22dd02d3869ab9738a8a8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis applied to the descriptions of required offered under the BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A." + }, + { + "type": "UncategorizedText", + "element_id": "6b847a0ed0b2c484c73f2749e29b4db5", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "into" + }, + { + "type": "UncategorizedText", + "element_id": "b63b99f6383ba713b57ddfc77737c5f7", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "was" + }, + { + "type": "UncategorizedText", + "element_id": "936e5cc5021d8a075f91b7864bf0cec8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "courses" + }, + { + "type": "UncategorizedText", + "element_id": "6b51d431df5d7f141cbececcf79edf3d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "12" + }, + { + "type": "UncategorizedText", + "element_id": "2d2e9ceb1db2bc94a266f3e8b24b8f55", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "c)" + }, + { + "type": "UncategorizedText", + "element_id": "961a38da2886c3cc25091d912769aa0d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "job job government (8.5%), (42.4%), industry (83.9%), and nonprofit (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting single ad." + }, + { + "type": "UncategorizedText", + "element_id": "f9c94ebffe2ab721a096cf42b7a9cff9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "important skills that were mentioned multiple times in" + }, + { + "type": "UncategorizedText", + "element_id": "6d0607a7a2ac9823f9fb2a62ea2b7385", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Desired" + }, + { + "type": "UncategorizedText", + "element_id": "32c1cf49a2feee269ed74dd860f72644", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "skills" + }, + { + "type": "UncategorizedText", + "element_id": "a486fbc90cd5a32fe44275f5948b2066", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "identified" + }, + { + "type": "UncategorizedText", + "element_id": "de98e5ea566225a14a9a6b3086253f6d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "academia" + }, + { + "type": "UncategorizedText", + "element_id": "75857a45899985be4c4d941e90b6b396", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "from" + }, + { + "type": "UncategorizedText", + "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "data" + }, + { + "type": "UncategorizedText", + "element_id": "8b3a4555f5297c340e5fdff392fe5a5b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "science-related" + }, + { + "type": "UncategorizedText", + "element_id": "26f8fe3e12ff690c91f73b24bb45ed01", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "utilized" + }, + { + "type": "UncategorizedText", + "element_id": "b510c96f289ebcf388da7d2dea6a1e73", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "ads." + }, + { + "type": "UncategorizedText", + "element_id": "b9776d7ddf459c9ad5b0e1d6ac61e27b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "the" + }, + { + "type": "UncategorizedText", + "element_id": "3e1e967e9b793e908f8eae83c74dba9b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "59" + }, + { + "type": "UncategorizedText", + "element_id": "788eb2efc52660fe41472319f0d2c623", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "ads" + }, + { + "type": "UncategorizedText", + "element_id": "9d5d7fcf3aa35a4809f92551aed1f26e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "sector" + }, + { + "type": "UncategorizedText", + "element_id": "75857a45899985be4c4d941e90b6b396", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "from" + }, + { + "type": "UncategorizedText", + "element_id": "9f25a5b0f5e247294ebcf6723c2169b2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "for core skills necessary for" + }, + { + "type": "UncategorizedText", + "element_id": "f7f4976ebe430b482f073e28add58182", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations competitive biomedical data scientist." + }, + { + "type": "UncategorizedText", + "element_id": "29e4a23a90a3769fc0cad3efcf65d3da", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "' Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com ? In August 2017, Kaggle conducted an industry-wide survey to gain clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json index dfdb07440e..f796808df9 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json @@ -1,4 +1,14 @@ [ + { + "type": "UncategorizedText", + "element_id": "720a6f5640af3333283ae0a2b6ef5d4d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "a Department of the Treasury Internal Revenue Service" + }, { "type": "Title", "element_id": "88591a76b54e47215c0827ae8838ec13", @@ -69,6 +79,16 @@ }, "text": "Purpose of Form" }, + { + "type": "UncategorizedText", + "element_id": "2ef3cbc8d359155433a0028e73251f95", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "File this form to request a change in your accounting method, including the accounting treatment of any item. If you are requesting a change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods. When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current revision date of Form 3115)," + }, { "type": "NarrativeText", "element_id": "84e7e32f584e2ee9f47ba593bf86c559", @@ -80,94 +100,104 @@ "text": "Generally, applicants must complete Section A. In addition, complete the appropriate sections (B-1 through H) for which a change Is desired." }, { - "type": "ListItem", - "element_id": "7b7c33680de5c4a7cb165c103752579e", + "type": "UncategorizedText", + "element_id": "ed7dba38aff5b289c7b6c8a58e800279", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." + "text": "You must give all relevant facts, including a detailed description of your present and proposed methods. You must also state the reason(s) you believe approval to make the requested change should be granted. Attach additional pages if more space is needed for explanations. Each page should show your name, address, and identifying number. State whether you desire a conference in the National Office if the Service proposes to disapprove your application." }, { "type": "Title", - "element_id": "af8bdf713f162b09567c8d1a3a2d4de7", + "element_id": "242a9dba10a04654d4adef9c58ff96f6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change." + "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" }, { - "type": "Title", - "element_id": "5756fb398995bb6518a87637f24f426e", + "type": "NarrativeText", + "element_id": "eb076cfd3d47e546c28611750afedc49", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Time and Place for Filing" + "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and" }, { - "type": "ListItem", - "element_id": "ede9004eceddf828c2c928f62d0687a0", + "type": "UncategorizedText", + "element_id": "0b320308ba52d4a9625d29cadfc941a9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Signature Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. If the individual or firm is also authorized to" + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required. Disregard the instructions under Time and" }, { - "type": "Title", - "element_id": "f1a73e2204a114077f988c9da98d7f8b", + "type": "UncategorizedText", + "element_id": "ee134711b01cac75692565ae4f785fd4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Signature" + "text": "the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method—Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information." }, { - "type": "Title", - "element_id": "1df7107903f249d938fbf3710f50283a", + "type": "ListItem", + "element_id": "7b7c33680de5c4a7cb165c103752579e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "If the individual or firm is also authorized to represent the applicant before the IRS, receive a copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)." + "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." }, { "type": "Title", - "element_id": "242a9dba10a04654d4adef9c58ff96f6", + "element_id": "af8bdf713f162b09567c8d1a3a2d4de7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" + "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change." }, { "type": "Title", - "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283", + "element_id": "5756fb398995bb6518a87637f24f426e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Affiliated Groups" + "text": "Time and Place for Filing" }, { - "type": "Title", - "element_id": "58703de56debc34a1d68e6ed6f8fd067", + "type": "UncategorizedText", + "element_id": "2aebd5bbfbc983d52ed7aee8eb7bc7cc", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Specific Instructions Section A" + "text": "Taxpayers, other than exempt organizations, should file Form 3115 with the Commissioner of Internal Revenue, Attention: CC:C:4, 1111 Constitution Avenue, NW, Washington, DC 20224, Exempt organizations should file with the Assistant Commissioner (Employee Plans and Exempt Organizations), 1111 Constitution Avenue, NW, Washington, DC 20224. You should normally receive an acknowledgment of receipt of your application within 30 days. If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224." + }, + { + "type": "UncategorizedText", + "element_id": "0ec978b05caa71414e2f4429b1d18f09", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "See section 5.03 of Rev. Proc. 84-74 for filing an early application." }, { "type": "Title", @@ -190,14 +220,14 @@ "text": "Late Applications" }, { - "type": "NarrativeText", - "element_id": "eb076cfd3d47e546c28611750afedc49", + "type": "UncategorizedText", + "element_id": "02dd043b5686a46b2f03cfe8cf56aae9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and" + "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 79-63." }, { "type": "Title", @@ -209,6 +239,16 @@ }, "text": "Identifying Number" }, + { + "type": "UncategorizedText", + "element_id": "8605ee209656c311cec7ce4b001caab2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Individuals.—An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both." + }, { "type": "NarrativeText", "element_id": "742730130f9c14403ad272eec208a456", @@ -219,75 +259,135 @@ }, "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block." }, + { + "type": "ListItem", + "element_id": "ede9004eceddf828c2c928f62d0687a0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Signature Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. If the individual or firm is also authorized to" + }, { "type": "Title", - "element_id": "8d6743276d5bc8e32d0b05ba0b232db8", + "element_id": "f1a73e2204a114077f988c9da98d7f8b", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "Section E" + "text": "Signature" }, { - "type": "NarrativeText", - "element_id": "1bbe995811e9fd4c3ce1b218cb641f4e", + "type": "Title", + "element_id": "1df7107903f249d938fbf3710f50283a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "If the individual or firm is also authorized to represent the applicant before the IRS, receive a copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)." + }, + { + "type": "Title", + "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Affiliated Groups" + }, + { + "type": "UncategorizedText", + "element_id": "58e977f2200b46ac8b372586dfd781bf", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Taxpayers that are members of an affiliated group filing a consolidated return that seeks to change to the same accounting method for more than one member of the group must file a separate Form 3115 for each such member," + }, + { + "type": "Title", + "element_id": "58703de56debc34a1d68e6ed6f8fd067", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Specific Instructions Section A" + }, + { + "type": "UncategorizedText", + "element_id": "33b0dd2cec2ea60810343af08d53ded2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a). Item 6, page 2.—The term “gross receipts” includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority. Item 7b, page 2.—If item 7b 1s “Yes,” indicate on a separate sheet the following for each separate trade or business: Nature of business" + }, + { + "type": "UncategorizedText", + "element_id": "c51052c424ee3b8b5a219015f66d4846", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "(1) Gives your best estimate of the percentage of the section 481(a) adjustment that would have been required if the requested change had been made for each of the 3 preceding years; and" + "text": "(manufacturing, retailer, wholesaler, etc.), employer identification number, overall method of accounting, and whether, in the last 6 years, that business has changed its accounting method, or is also changing its accounting method as part of this request or as a separate request. Item 11, page 2.—If you cannot provide the requested information, you may sign a statement under penalties of perjury that:" }, { "type": "NarrativeText", - "element_id": "2de8f0b5003bcb8c12a4dc59c8e1f740", + "element_id": "1bbe995811e9fd4c3ce1b218cb641f4e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "See section 5.06(2) of Rev. Proc. 84-74 for the required perjury statement that must be attached." + "text": "(1) Gives your best estimate of the percentage of the section 481(a) adjustment that would have been required if the requested change had been made for each of the 3 preceding years; and" }, { - "type": "NarrativeText", - "element_id": "751abc8c6a0fa412c3e8c18345f57f95", + "type": "UncategorizedText", + "element_id": "f7872ac379aa024934461d08fa31ebd9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable." + "text": "(2) Explains in detail why you cannot provide the requested information." }, { - "type": "Title", - "element_id": "53e33d10c9df4a570490182ccef0cd95", + "type": "NarrativeText", + "element_id": "2de8f0b5003bcb8c12a4dc59c8e1f740", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Section C" + "text": "See section 5.06(2) of Rev. Proc. 84-74 for the required perjury statement that must be attached." }, { - "type": "ListItem", - "element_id": "86fab9f7b35d56a2d48baf0782b7c53d", + "type": "NarrativeText", + "element_id": "751abc8c6a0fa412c3e8c18345f57f95", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete." + "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable." }, { - "type": "ListItem", - "element_id": "84cea2af17bb3760234b42f4ea78e175", + "type": "UncategorizedText", + "element_id": "678ecc0340dc8848f891bf12a555a3fd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts." + "text": "If IRS later examines your return for the year of the change or for later years, it has the right to verify your statement at that time." }, { "type": "Title", @@ -300,14 +400,14 @@ "text": "Section B-1" }, { - "type": "Title", - "element_id": "32786e68a6fd82dc356d2d58bf283dc4", + "type": "UncategorizedText", + "element_id": "e4a695ea83818204438fe08add6d1554", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Section G" + "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application." }, { "type": "Title", @@ -369,6 +469,76 @@ }, "text": "(2) Qualified personal service corporations. — A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law," }, + { + "type": "UncategorizedText", + "element_id": "50d16fd6b40a428c3befaf6dd19c2dcd", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)" + }, + { + "type": "UncategorizedText", + "element_id": "64758ada28beed36481b14ce8dc67472", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death. (3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period. For more information, see section 448 and Temporary Regulations section 1.448-1T." + }, + { + "type": "Title", + "element_id": "53e33d10c9df4a570490182ccef0cd95", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Section C" + }, + { + "type": "UncategorizedText", + "element_id": "6d2d2cfa00e5a8caec71ba799f60f8c6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change. Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8." + }, + { + "type": "UncategorizedText", + "element_id": "357d52f500b965abc29ea60039de4fd8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:" + }, + { + "type": "UncategorizedText", + "element_id": "1ac3e7aa5a6139bd80f05a7ac1f63ddf", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market). (2) State whether the proposed identification and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726." + }, + { + "type": "UncategorizedText", + "element_id": "6028c579dc843bb5aa2c704f46085914", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "(3) The termination event statement required by section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event." + }, { "type": "Title", "element_id": "92e21a61e1d872dbbe3e3221a920b409", @@ -379,6 +549,66 @@ }, "text": "Section D" }, + { + "type": "UncategorizedText", + "element_id": "a8e72799229bc2d754f44ea167a6e7d6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Applicants requesting to change their method of valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods." + }, + { + "type": "Title", + "element_id": "8d6743276d5bc8e32d0b05ba0b232db8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Section E" + }, + { + "type": "ListItem", + "element_id": "86fab9f7b35d56a2d48baf0782b7c53d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete." + }, + { + "type": "ListItem", + "element_id": "84cea2af17bb3760234b42f4ea78e175", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts." + }, + { + "type": "Title", + "element_id": "32786e68a6fd82dc356d2d58bf283dc4", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Section G" + }, + { + "type": "UncategorizedText", + "element_id": "fa41a857716f30d6bbee384eada72a90", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "This section Is to be used only to request a change in a method of accounting for depreciation under section 167. Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change. Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually)." + }, { "type": "Title", "element_id": "a8155ab3bed92cc259ab58331619e0e1", @@ -389,6 +619,16 @@ }, "text": "Section H" }, + { + "type": "UncategorizedText", + "element_id": "cb1f664a186a87f6560cde136d70b558", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested." + }, { "type": "NarrativeText", "element_id": "86d11953bb813a770ecd242ff97d4e43", @@ -398,5 +638,65 @@ "page_number": 2 }, "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10." + }, + { + "type": "UncategorizedText", + "element_id": "c0a5f5aa4012d18970939d7bb8299e38", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "% U.S." + }, + { + "type": "UncategorizedText", + "element_id": "c71e90d2f497062ba8d068af0bed2a3d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Government" + }, + { + "type": "UncategorizedText", + "element_id": "c0f169737344e28e87eb123df627ba6a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Printing" + }, + { + "type": "UncategorizedText", + "element_id": "749720aad1daf3c5dfeda1d87555ff87", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Office:" + }, + { + "type": "UncategorizedText", + "element_id": "de444aa0e8db0c05d86ad56e28d5fb26", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "1987—201-993/60166" + }, + { + "type": "UncategorizedText", + "element_id": "794f7062cf3f56f2c7d70702bd3d13e1", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Page 2" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index e4e36e67ee..8316f3cde2 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -1,4 +1,14 @@ [ + { + "type": "UncategorizedText", + "element_id": "9e4a454d91ac1f220324c6d1a0377093", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "rh Department of the Treasury Internal Revenue Service" + }, { "type": "Title", "element_id": "88591a76b54e47215c0827ae8838ec13", @@ -9,6 +19,16 @@ }, "text": "Instructions for Form 3115 (Rev. November 1987)" }, + { + "type": "UncategorizedText", + "element_id": "36f63dab0fb1dd098d45e0aa89fa4dd2", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Application for Change in Accoun ig Method" + }, { "type": "NarrativeText", "element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9", @@ -90,104 +110,124 @@ "text": "Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired." }, { - "type": "ListItem", - "element_id": "4e4069c49822cae18add18758619535b", + "type": "UncategorizedText", + "element_id": "bf2a070cb9d03d056e70b26bebf1ef79", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods. —Unless the Service has published a regulation or procedure to the contrary, all other changes in accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of mcome attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." + "text": "You must give all relevant facts, including a detailed description of your present and proposed methods. You must also state the reason(s) you believe approval to make the requested change should be granted. Attach additional pages if more space is needed for explanations. Each page should show your name, address, and identifying number." }, { "type": "NarrativeText", - "element_id": "7685df2334a5f6c8c8099dea61a8f1b4", + "element_id": "25f830e7c39c115c9937eb9d11cfb1f2", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed." + "text": "State whether you desire a conference in the National Office if the Service proposes to disapprove your application" }, { "type": "Title", - "element_id": "5756fb398995bb6518a87637f24f426e", + "element_id": "242a9dba10a04654d4adef9c58ff96f6", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Time and Place for Filing" + "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" }, { - "type": "ListItem", - "element_id": "f8e8c87d2e958a23153d7f25b159f0ee", + "type": "UncategorizedText", + "element_id": "b07efea243933525e9ec04a90622508d", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6." + "text": "cash method.—If you are required to change your method of accounting under section, 263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (imiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\"Act\"), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to cchange from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required" }, { "type": "Title", - "element_id": "55d4f33b09f24dd3b27865a5f34bfeb9", + "element_id": "11c98a9cbd6a200fbc5b93fed15007ac", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Signature tea" + "text": "Uniform capitalization rules and limitation on" }, { "type": "NarrativeText", - "element_id": "25f830e7c39c115c9937eb9d11cfb1f2", + "element_id": "39458f370b98a606db29ac6dee975e07", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "State whether you desire a conference in the National Office if the Service proposes to disapprove your application" + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and" }, { - "type": "Title", - "element_id": "242a9dba10a04654d4adef9c58ff96f6", + "type": "UncategorizedText", + "element_id": "663dd3791cc24190a45998ca7914f88e", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" + "text": "the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3118 eg. “Automatic Change to Accrual Method—Section 448”). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information" }, { - "type": "Title", - "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283", + "type": "ListItem", + "element_id": "4e4069c49822cae18add18758619535b", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Affiliated Groups" + "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods. —Unless the Service has published a regulation or procedure to the contrary, all other changes in accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of mcome attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." }, { - "type": "Title", - "element_id": "11c98a9cbd6a200fbc5b93fed15007ac", + "type": "NarrativeText", + "element_id": "7685df2334a5f6c8c8099dea61a8f1b4", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Uniform capitalization rules and limitation on" + "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed." }, { "type": "Title", - "element_id": "58703de56debc34a1d68e6ed6f8fd067", + "element_id": "5756fb398995bb6518a87637f24f426e", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Specific Instructions Section A" + "text": "Time and Place for Filing" + }, + { + "type": "UncategorizedText", + "element_id": "7a23bee70d81f8f49b74c5a359c1cbf3", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Generally, applicants must file this form within the first 180 days of the tax year in which itis desired to make the change. Taxpayers, other than exempt organizations, should file Form 3115 with the Commissioner of Internal Revenue, Attention: CC:C:4, 1111 Constitution Avenue, NW, Washington, DC 20224, Exempt organizations should file with the Assistant Commissioner (Employee Plans and Exempt Organizations), 1111 Constitution Avenue, NW, Washington, DC 20224. You should normally receive an acknowledgment of receipt of your application within 30 days. If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224." + }, + { + "type": "UncategorizedText", + "element_id": "df0e66d1a434e95e4051ddcb968c94c9", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "See section 5.03 of Rev. Proc. 84-74 for filing an early application, Note: If this form is being filed in accordance with Rey. Proc. 74-11, see Section G below." }, { "type": "Title", @@ -200,14 +240,24 @@ "text": "Late Applications" }, { - "type": "NarrativeText", - "element_id": "39458f370b98a606db29ac6dee975e07", + "type": "UncategorizedText", + "element_id": "8474975a0cd563b9feee81d0e540ffd3", "metadata": { "data_source": {}, "filetype": "image/png", "page_number": 1 }, - "text": "Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and" + "text": "If your application is filed after the 180-day period, itis late. The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev. Proc. 79-63." + }, + { + "type": "UncategorizedText", + "element_id": "ec3c2d03b846d2a186fc9a8f318f688b", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Individuals. —An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both." }, { "type": "Title", @@ -228,5 +278,75 @@ "page_number": 1 }, "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block," + }, + { + "type": "ListItem", + "element_id": "f8e8c87d2e958a23153d7f25b159f0ee", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6." + }, + { + "type": "Title", + "element_id": "55d4f33b09f24dd3b27865a5f34bfeb9", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Signature tea" + }, + { + "type": "UncategorizedText", + "element_id": "35f1273e073cf159019550bc35b6692c", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Ifthe individual or firm is also authorized to represent the applicant before the IRS, receive a copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)." + }, + { + "type": "Title", + "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Affiliated Groups" + }, + { + "type": "UncategorizedText", + "element_id": "762e2a39ed1a3ef5d3d4c83dd5dcc0e8", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Taxpayers that are members of an affiliated group filing a consolidated return that seeks to Change to the same accounting method for more than one member of the group must file a separate Form 3115 for each such member." + }, + { + "type": "Title", + "element_id": "58703de56debc34a1d68e6ed6f8fd067", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Specific Instructions Section A" + }, + { + "type": "UncategorizedText", + "element_id": "5e7793489f88d7c9187dad66e787898f", + "metadata": { + "data_source": {}, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a) Item 6, page 2.—The term “gross receipts” Includes total sales (net of returns and allowances) and all amounts received for services. in addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you area resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, tunder the applicable state or local law, the taxis legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority. Item 7b, page 2.—If item 7b 1s \"Yes,\" indicate ona separate sheet the following for each separate trade or business: Nature of business" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 4a151053da..8a0fd4c07d 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -11,33 +11,43 @@ }, { "type": "UncategorizedText", - "element_id": "869adddb184177031536477262e0dde0", + "element_id": "e6b2c238768a8e830492f4de667314bb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Contents lists available at ScienceDirect" + "text": "ELSEVIER" + }, + { + "type": "NarrativeText", + "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "journal homepage: www.elsevier.com/locate/dib" }, { "type": "UncategorizedText", - "element_id": "e6fa42b5b4d85001b900e47c050b645b", + "element_id": "869adddb184177031536477262e0dde0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Data in Brief" + "text": "Contents lists available at ScienceDirect" }, { - "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "type": "UncategorizedText", + "element_id": "e6fa42b5b4d85001b900e47c050b645b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "journal homepage: www.elsevier.com/locate/dib" + "text": "Data in Brief" }, { "type": "UncategorizedText", @@ -59,6 +69,16 @@ }, "text": "Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment" }, + { + "type": "UncategorizedText", + "element_id": "c21a7f75a507e8d1d940e30b66575616", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "(Jee" + }, { "type": "NarrativeText", "element_id": "4f14d967ea87a75ad1acee27ff34e59e", @@ -149,6 +169,16 @@ }, "text": "Specification table" }, + { + "type": "UncategorizedText", + "element_id": "b877cc5d670d770084dcc0bb41ac73a0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Subject area More specific subject area Type of data" + }, { "type": "UncategorizedText", "element_id": "f6e511a70687b4a8980471d7ad4e43ef", @@ -159,6 +189,16 @@ }, "text": "Materials engineering" }, + { + "type": "UncategorizedText", + "element_id": "a2c3879ecb580742973c6a914fb905bb", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Surface science and engineering" + }, { "type": "UncategorizedText", "element_id": "c319051536721a6e2956050d5c91f7cc", @@ -399,6 +439,16 @@ }, "text": "i" }, + { + "type": "UncategorizedText", + "element_id": "cdf5b8d4e74aae9ab4984aae0466aa75", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "(mg)" + }, { "type": "UncategorizedText", "element_id": "f4ccd05b3271c386ee55d9876c745001", @@ -421,23 +471,23 @@ }, { "type": "UncategorizedText", - "element_id": "f82cec9d27e0eb21af4287cfc3f10c63", + "element_id": "917df3320d778ddbaa5c5c7742bc4046", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "10g 8g 6g 4g 2g Control" + "text": "10" }, { "type": "UncategorizedText", - "element_id": "917df3320d778ddbaa5c5c7742bc4046", + "element_id": "f82cec9d27e0eb21af4287cfc3f10c63", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "10" + "text": "10g 8g 6g 4g 2g Control" }, { "type": "UncategorizedText", @@ -529,15 +579,25 @@ }, "text": "453" }, + { + "type": "NarrativeText", + "element_id": "e5d46bc8ceb17f88e1cff33ecac97067", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES." + }, { "type": "UncategorizedText", - "element_id": "884784765bb9a529058c24f63946a7e2", + "element_id": "d14506655223461adf0b7bb605d29ca9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "2.7" + "text": "%" }, { "type": "UncategorizedText", @@ -551,343 +611,343 @@ }, { "type": "UncategorizedText", - "element_id": "60f1f45902889fa87ac184f7dd16c609", + "element_id": "ac96469638e152a73cadbf62d44e2f39", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "( e t a r n o s o r r o C" + "text": "(" }, { "type": "UncategorizedText", - "element_id": "50c393f158c3de2db92fa9661bfb00ed", + "element_id": "60f1f45902889fa87ac184f7dd16c609", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "i" + "text": "( e t a r n o s o r r o C" }, { "type": "UncategorizedText", - "element_id": "d2e181481b9515797bc42f282b122d25", + "element_id": "50c393f158c3de2db92fa9661bfb00ed", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "1.8" + "text": "i" }, { "type": "UncategorizedText", - "element_id": "a839ab0447b2d415fe1732387938b5e0", + "element_id": "50c393f158c3de2db92fa9661bfb00ed", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "0.9" + "text": "i" }, { "type": "UncategorizedText", - "element_id": "f82cec9d27e0eb21af4287cfc3f10c63", + "element_id": "fafba0527c3953c4b6e0e5b5739ba836", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "10g 8g 6g 4g 2g Control" + "text": ")" }, { "type": "UncategorizedText", - "element_id": "68ca3fba3b7e864770cb61aeb306d4bd", + "element_id": "bec5fc8721d3102a76fbe552feadf43c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "24" + "text": "E n o i t i b h n I" }, { "type": "UncategorizedText", - "element_id": "654ee9da442fa353f59f11beb688fc7f", + "element_id": "50c393f158c3de2db92fa9661bfb00ed", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "48" + "text": "i" }, { "type": "UncategorizedText", - "element_id": "bd5fa6e75f0ddfcd9ff32e0a2297554c", + "element_id": "679d2bdce77a98d4cf4ad39b5c449e37", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "72" + "text": "y c n e c i f f" }, { "type": "UncategorizedText", - "element_id": "d93844f8f37e55564be3f194656bf33b", + "element_id": "d1eb13b2e82a23a0d7b8618874dec2c8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "96" + "text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1" }, { "type": "UncategorizedText", - "element_id": "97b912eb4a61df5f806ca6239dde3e1a", + "element_id": "884784765bb9a529058c24f63946a7e2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "120" + "text": "2.7" }, { "type": "UncategorizedText", - "element_id": "9efe5a55840d37eb5db13a22ccab7e8f", + "element_id": "d2e181481b9515797bc42f282b122d25", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "144" + "text": "1.8" }, { "type": "UncategorizedText", - "element_id": "c02efad74c4db35b2450beec922eb590", + "element_id": "a839ab0447b2d415fe1732387938b5e0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "168" + "text": "0.9" }, { "type": "UncategorizedText", - "element_id": "49d7ac459790bd263dec212b9b71f59c", + "element_id": "eea8254c7500ba3de996aa8ad6af3991", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "192" + "text": "100" }, { "type": "UncategorizedText", - "element_id": "15024228096f537a105444cae5c26bbc", + "element_id": "917df3320d778ddbaa5c5c7742bc4046", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Exposure time" + "text": "10" }, { - "type": "NarrativeText", - "element_id": "e5d46bc8ceb17f88e1cff33ecac97067", + "type": "UncategorizedText", + "element_id": "5378796307535df3ec8d8b15a2e2dc56", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES." + "text": "20" }, { "type": "UncategorizedText", - "element_id": "eea8254c7500ba3de996aa8ad6af3991", + "element_id": "f4ccd05b3271c386ee55d9876c745001", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "100" + "text": "30" }, { "type": "UncategorizedText", - "element_id": "4393447bd3c1d55ea7f97417ecb1b36a", + "element_id": "673650f936cb3b0a2f93ce09d81be107", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "90" + "text": "40" }, { "type": "UncategorizedText", - "element_id": "fafba0527c3953c4b6e0e5b5739ba836", + "element_id": "7ea9844ae84eccbf55e8330640865e36", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": ")" + "text": "50" }, { "type": "UncategorizedText", - "element_id": "d14506655223461adf0b7bb605d29ca9", + "element_id": "6442bc26a7c562f5afe6467dab36365c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "%" + "text": "70" }, { "type": "UncategorizedText", - "element_id": "ac96469638e152a73cadbf62d44e2f39", + "element_id": "95cf32708a31caa478a0e9141103ac56", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "(" + "text": "60" }, { "type": "UncategorizedText", - "element_id": "679d2bdce77a98d4cf4ad39b5c449e37", + "element_id": "95aebc97bc646c67fdcd923a5965b001", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "y c n e c i f f" + "text": "80" }, { "type": "UncategorizedText", - "element_id": "50c393f158c3de2db92fa9661bfb00ed", + "element_id": "4393447bd3c1d55ea7f97417ecb1b36a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "i" + "text": "90" }, { "type": "UncategorizedText", - "element_id": "bec5fc8721d3102a76fbe552feadf43c", + "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "E n o i t i b h n I" + "text": "0" }, { "type": "UncategorizedText", - "element_id": "50c393f158c3de2db92fa9661bfb00ed", + "element_id": "68ca3fba3b7e864770cb61aeb306d4bd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "i" + "text": "24" }, { "type": "UncategorizedText", - "element_id": "95aebc97bc646c67fdcd923a5965b001", + "element_id": "654ee9da442fa353f59f11beb688fc7f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "80" + "text": "48" }, { "type": "UncategorizedText", - "element_id": "6442bc26a7c562f5afe6467dab36365c", + "element_id": "bd5fa6e75f0ddfcd9ff32e0a2297554c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "70" + "text": "72" }, { "type": "UncategorizedText", - "element_id": "95cf32708a31caa478a0e9141103ac56", + "element_id": "d93844f8f37e55564be3f194656bf33b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "60" + "text": "96" }, { "type": "UncategorizedText", - "element_id": "7ea9844ae84eccbf55e8330640865e36", + "element_id": "15024228096f537a105444cae5c26bbc", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "50" + "text": "Exposure time" }, { "type": "UncategorizedText", - "element_id": "673650f936cb3b0a2f93ce09d81be107", + "element_id": "97b912eb4a61df5f806ca6239dde3e1a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "40" + "text": "120" }, { "type": "UncategorizedText", - "element_id": "f4ccd05b3271c386ee55d9876c745001", + "element_id": "9efe5a55840d37eb5db13a22ccab7e8f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "30" + "text": "144" }, { "type": "UncategorizedText", - "element_id": "6c2020fccd80e546787b8108268cd708", + "element_id": "c02efad74c4db35b2450beec922eb590", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "2g 4g 6g 8g 10g" + "text": "168" }, { "type": "UncategorizedText", - "element_id": "5378796307535df3ec8d8b15a2e2dc56", + "element_id": "49d7ac459790bd263dec212b9b71f59c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "20" + "text": "192" }, { "type": "UncategorizedText", - "element_id": "917df3320d778ddbaa5c5c7742bc4046", + "element_id": "f82cec9d27e0eb21af4287cfc3f10c63", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "10" + "text": "10g 8g 6g 4g 2g Control" }, { "type": "UncategorizedText", - "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", + "element_id": "6c2020fccd80e546787b8108268cd708", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "0" + "text": "2g 4g 6g 8g 10g" }, { "type": "UncategorizedText", @@ -1221,23 +1281,23 @@ }, { "type": "UncategorizedText", - "element_id": "a1fb50e6c86fae1679ef3351296fd671", + "element_id": "4d69c97b539d79ed01e02a13a6b0df76", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "12" + "text": "0 / C" }, { "type": "UncategorizedText", - "element_id": "d30a16d722f11b7f40f526a9f1909b4a", + "element_id": "a1fb50e6c86fae1679ef3351296fd671", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "C/0" + "text": "12" }, { "type": "UncategorizedText", @@ -1261,43 +1321,53 @@ }, { "type": "UncategorizedText", - "element_id": "4d69c97b539d79ed01e02a13a6b0df76", + "element_id": "06e9d52c1720fca412803e3b07c4b228", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "0 / C" + "text": "6" }, { "type": "UncategorizedText", - "element_id": "06e9d52c1720fca412803e3b07c4b228", + "element_id": "7de1555df0c2700329e815b93b32c571", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "6" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "7de1555df0c2700329e815b93b32c571", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "4" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "e0f4c6a3d561d8fe1c9526e49896279c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2" + "text": "—=—Cc/0 2+ T T T 1" + }, + { + "type": "UncategorizedText", + "element_id": "d30a16d722f11b7f40f526a9f1909b4a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "C/0" }, { "type": "UncategorizedText", @@ -1400,14 +1470,14 @@ "text": "SEM HV: Q0KY WD: 14.89 rmrm ‘9EM MAO: 209 x Det: DOE Pectomsence In nanospact" }, { - "type": "NarrativeText", - "element_id": "6121f41a05c15afa2efe50af3e838da4", + "type": "FigureCaption", + "element_id": "520d1da08c86ce165cd2843e2dc27f98", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "Fig. 6. SEM/EDX image of as-received stainless steel." + "text": "SEMHV: 20.0KV WD: 15.54 mm EM ING: ACO x Dei: OSE" }, { "type": "FigureCaption", @@ -1430,14 +1500,14 @@ "text": "Fig. 7. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution without inhibitor." }, { - "type": "FigureCaption", - "element_id": "520d1da08c86ce165cd2843e2dc27f98", + "type": "NarrativeText", + "element_id": "6121f41a05c15afa2efe50af3e838da4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "SEMHV: 20.0KV WD: 15.54 mm EM ING: ACO x Dei: OSE" + "text": "Fig. 6. SEM/EDX image of as-received stainless steel." }, { "type": "NarrativeText", @@ -1579,6 +1649,16 @@ }, "text": "(cid:3)" }, + { + "type": "UncategorizedText", + "element_id": "ba5ec51d07a4ac0e951608704431d59a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": ")" + }, { "type": "UncategorizedText", "element_id": "5ef0df6b867bd357888b9dc42b36cd2d", @@ -1639,6 +1719,16 @@ }, "text": "IE ð%Þ ¼ CRo (cid:3) CR" }, + { + "type": "UncategorizedText", + "element_id": "c13539d1568999137c4e0354795cd37b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "CR" + }, { "type": "UncategorizedText", "element_id": "8c0cb198bc3c84fabd4fb5a938b8917c", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 3ec3b30952..177408c542 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -11,33 +11,43 @@ }, { "type": "UncategorizedText", - "element_id": "869adddb184177031536477262e0dde0", + "element_id": "5566737ab4d91f1c1831fae87f37ec87", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Contents lists available at ScienceDirect" + "text": "ELSEVIER" + }, + { + "type": "NarrativeText", + "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "journal homepage: www.elsevier.com/locate/dib" }, { "type": "UncategorizedText", - "element_id": "e6fa42b5b4d85001b900e47c050b645b", + "element_id": "869adddb184177031536477262e0dde0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Data in Brief" + "text": "Contents lists available at ScienceDirect" }, { - "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "type": "UncategorizedText", + "element_id": "e6fa42b5b4d85001b900e47c050b645b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "journal homepage: www.elsevier.com/locate/dib" + "text": "Data in Brief" }, { "type": "UncategorizedText", @@ -59,6 +69,16 @@ }, "text": "A benchmark dataset for the multiple depot vehicle scheduling problem" }, + { + "type": "UncategorizedText", + "element_id": "77b037daa0a8a3f7349bd57dda36499f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "(eee" + }, { "type": "NarrativeText", "element_id": "adf50fc70e660740d796f43a2ba5f500", @@ -319,6 +339,36 @@ }, "text": "1. Data" }, + { + "type": "UncategorizedText", + "element_id": "41ce7670e476aaf9a595bc28c13dbba0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number" + }, + { + "type": "UncategorizedText", + "element_id": "a18c70d23b71c51ddfe33311232c241c", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, (m,n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." + }, + { + "type": "UncategorizedText", + "element_id": "10c22bcf4c768b515be4e94bcafc71bf", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "for" + }, { "type": "UncategorizedText", "element_id": "4443614d5e9dada0ac5245412ed35771", @@ -469,6 +519,16 @@ }, "text": "and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. For all instances, m r l and the locations 1; …; m correspond to depots, while the remaining locations only appear as trip start and end locations." }, + { + "type": "UncategorizedText", + "element_id": "faee1001fc912565a74ea2d69fa0d689", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "travel empty from —¢). Aschedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satisfied:" + }, { "type": "UncategorizedText", "element_id": "f41b22faaa980f33f59eae3874a19ce3", @@ -539,6 +599,16 @@ }, "text": "a ls" }, + { + "type": "UncategorizedText", + "element_id": "4137b01e139589b7a1d3b3fc4da031d8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "must" + }, { "type": "UncategorizedText", "element_id": "2ec583faff6571c9c19100202efae904", @@ -829,6 +899,16 @@ }, "text": "1 1 n" }, + { + "type": "UncategorizedText", + "element_id": "6d7ebc44c5bc26207e62f4f628f912e1", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "l" + }, { "type": "UncategorizedText", "element_id": "41a69e1feefeb9fc3da6681575258ad3", @@ -841,43 +921,53 @@ }, { "type": "UncategorizedText", - "element_id": "2403ade754d57e65d82387d4e14d226f", + "element_id": "6d7ebc44c5bc26207e62f4f628f912e1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "i , the start" + "text": "l" }, { "type": "UncategorizedText", - "element_id": "0df3cce02f5a4b99df9519bfff5cdfc1", + "element_id": "78f6ff03dfac8dfb7f319de1e369590d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "i , the end location le" + "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rg at each depot d. One line for each trip, i= 1,2, ...,n. Each line provides the start location and the end time ¢¢ for the corresponding trip. Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j." }, { "type": "UncategorizedText", - "element_id": "6d7ebc44c5bc26207e62f4f628f912e1", + "element_id": "336074805fc853987abe6f7fe3ad97a6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "l" + "text": "time" }, { "type": "UncategorizedText", - "element_id": "6d7ebc44c5bc26207e62f4f628f912e1", + "element_id": "0df3cce02f5a4b99df9519bfff5cdfc1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "l" + "text": "i , the end location le" + }, + { + "type": "UncategorizedText", + "element_id": "2403ade754d57e65d82387d4e14d226f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "i , the start" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index f4c481c83e..b9d3eb4ecf 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -171,43 +171,43 @@ }, { "type": "NarrativeText", - "element_id": "c1f1ba1630bc19bd24c1dfbc1548f2d8", + "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "However, there are several practical difficulties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would benefit the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-fledged infrastructure for easily curating the target document image datasets and fine-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the final outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it difficult for research teams to learn about how full pipelines are implemented and leads them to invest significant resources in reinventing the DIA wheel." + "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." }, { "type": "NarrativeText", - "element_id": "836e6ef5cecc9a73356c0d5bee181829", + "element_id": "c1f1ba1630bc19bd24c1dfbc1548f2d8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "LayoutParser provides a unified toolkit to support DL-based document image analysis and processing. To address the aforementioned challenges, LayoutParser is built with the following components:" + "text": "However, there are several practical difficulties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would benefit the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-fledged infrastructure for easily curating the target document image datasets and fine-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the final outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it difficult for research teams to learn about how full pipelines are implemented and leads them to invest significant resources in reinventing the DIA wheel." }, { - "type": "UncategorizedText", - "element_id": "71c0c5ffbb8105ccb9f3bd1543f59007", + "type": "NarrativeText", + "element_id": "836e6ef5cecc9a73356c0d5bee181829", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "1. An off-the-shelf toolkit for applying DL models for layout detection, character" + "text": "LayoutParser provides a unified toolkit to support DL-based document image analysis and processing. To address the aforementioned challenges, LayoutParser is built with the following components:" }, { "type": "UncategorizedText", - "element_id": "798b60ffa3907be6de6b739de4b6ae6b", + "element_id": "71c0c5ffbb8105ccb9f3bd1543f59007", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "recognition, and other DIA tasks (Section 3)" + "text": "1. An off-the-shelf toolkit for applying DL models for layout detection, character" }, { "type": "UncategorizedText", @@ -221,53 +221,53 @@ }, { "type": "UncategorizedText", - "element_id": "a755646a34c86b2fb223ed3040821c4b", + "element_id": "beba83994ae4b4055cfc52903455a858", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "underlies the off-the-shelf usage" + "text": "3. Comprehensive tools for efficient document image data annotation and model" }, { "type": "UncategorizedText", - "element_id": "beba83994ae4b4055cfc52903455a858", + "element_id": "18b1855acfb386ae6e6a253da566e93b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "3. Comprehensive tools for efficient document image data annotation and model" + "text": "4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)" }, { "type": "UncategorizedText", - "element_id": "3ecbd234f3fb48c8974ca103ce412060", + "element_id": "798b60ffa3907be6de6b739de4b6ae6b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "tuning to support different levels of customization" + "text": "recognition, and other DIA tasks (Section 3)" }, { "type": "UncategorizedText", - "element_id": "18b1855acfb386ae6e6a253da566e93b", + "element_id": "a755646a34c86b2fb223ed3040821c4b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)" + "text": "underlies the off-the-shelf usage" }, { - "type": "NarrativeText", - "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", + "type": "UncategorizedText", + "element_id": "3ecbd234f3fb48c8974ca103ce412060", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." + "text": "tuning to support different levels of customization" }, { "type": "NarrativeText", @@ -562,33 +562,33 @@ }, { "type": "NarrativeText", - "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9", + "element_id": "11dff8778699e76422be6b86c9eaa62a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." + "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:" }, { - "type": "Title", - "element_id": "9f26ca353a2c130a2e32f457d71c1350", + "type": "NarrativeText", + "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "3.1 Layout Detection Models" + "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." }, { - "type": "NarrativeText", - "element_id": "11dff8778699e76422be6b86c9eaa62a", + "type": "Title", + "element_id": "9f26ca353a2c130a2e32f457d71c1350", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:" + "text": "3.1 Layout Detection Models" }, { "type": "UncategorizedText", @@ -650,16 +650,6 @@ }, "text": "Z. Shen et al." }, - { - "type": "FigureCaption", - "element_id": "d21661161ae2c8dc39e96ee5c660704b", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff" - }, { "type": "NarrativeText", "element_id": "cafae07120d714f0822e89865adf62da", @@ -700,6 +690,16 @@ }, "text": "A critical feature of LayoutParser is the implementation of a series of data structures and operations that can be used to efficiently process and manipulate the layout elements. In document image analysis pipelines, various post-processing on the layout analysis model outputs is usually required to obtain the final outputs. Traditionally, this requires exporting DL model outputs and then loading the results into other pipelines. All model outputs from LayoutParser will be stored in carefully engineered data types optimized for further processing, which makes it possible to build an end-to-end document digitization pipeline within LayoutParser. There are three key components in the data structure, namely the Coordinate system, the TextBlock, and the Layout. They provide different levels of abstraction for the layout data, and a set of APIs are supported for transformations or operations on these classes." }, + { + "type": "FigureCaption", + "element_id": "d21661161ae2c8dc39e96ee5c660704b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff" + }, { "type": "UncategorizedText", "element_id": "4c2478cf439baab6ace34761eda527d9", @@ -722,13 +722,13 @@ }, { "type": "NarrativeText", - "element_id": "f2a3e5fbb983d9132dddecc381ed6e0b", + "element_id": "e284bd66511cfa064681253e7ac57a9a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be specified and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13." + "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:" }, { "type": "NarrativeText", @@ -752,13 +752,13 @@ }, { "type": "NarrativeText", - "element_id": "e284bd66511cfa064681253e7ac57a9a", + "element_id": "f2a3e5fbb983d9132dddecc381ed6e0b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:" + "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be specified and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13." }, { "type": "ListItem", @@ -1413,33 +1413,33 @@ }, { "type": "NarrativeText", - "element_id": "07be9fda679b805e67cf5e563eada033", + "element_id": "069379b2abcf2bed44f13bdaea90ec2d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set." + "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR." }, { - "type": "NarrativeText", - "element_id": "069379b2abcf2bed44f13bdaea90ec2d", + "type": "UncategorizedText", + "element_id": "0575dcbfd82c79a56a8028d0af3cbe07", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR." + "text": "16 This measures the overlap between the detected and ground-truth characters, and" }, { - "type": "UncategorizedText", - "element_id": "0575dcbfd82c79a56a8028d0af3cbe07", + "type": "NarrativeText", + "element_id": "07be9fda679b805e67cf5e563eada033", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "16 This measures the overlap between the detected and ground-truth characters, and" + "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set." }, { "type": "UncategorizedText", @@ -1663,33 +1663,33 @@ }, { "type": "UncategorizedText", - "element_id": "ed850ded537aa0ac53c62e095a412dc4", + "element_id": "b000578a41ffcc554faac04609d2f4e1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "Hierarchical Image Database. In: CVPR09 (2009)" + "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)" }, { - "type": "UncategorizedText", - "element_id": "b000578a41ffcc554faac04609d2f4e1", + "type": "NarrativeText", + "element_id": "c6e835fe03323406543926cc0f5a94de", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)" + "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)" }, { - "type": "NarrativeText", - "element_id": "c6e835fe03323406543926cc0f5a94de", + "type": "UncategorizedText", + "element_id": "ed850ded537aa0ac53c62e095a412dc4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)" + "text": "Hierarchical Image Database. In: CVPR09 (2009)" }, { "type": "UncategorizedText", @@ -1713,163 +1713,163 @@ }, { "type": "UncategorizedText", - "element_id": "a24a28ea9e86cd280c8f7887ad2d0d99", + "element_id": "901e0c80da6bc4ab586f53c474e71426", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)" + "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J." }, { "type": "UncategorizedText", - "element_id": "ca3e69faa45c756ca454e118ff12f597", + "element_id": "7a0afd734c99f6b076dc58b2e57cfec6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[9]" + "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)" }, { "type": "UncategorizedText", - "element_id": "c8f5863d94cc9b9d77f153c6d1b0015a", + "element_id": "00c7abdd98fedd1746994d16ca44d45f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)" + "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)" }, { "type": "UncategorizedText", - "element_id": "91c801cdf5af0b4fccd7f65d711d447a", + "element_id": "257e7b8aef89c41e03bf837ea517885e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the" + "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)" }, { "type": "UncategorizedText", - "element_id": "ecabb9a495995008845e44ab44bbda42", + "element_id": "df18427a8013b4df36e8ac4e2ee5da3a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)" + "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767" }, { "type": "UncategorizedText", - "element_id": "7ceaba2290e3f9c5f3754032ce4d5663", + "element_id": "0aabfb2a8e358618179ec2e1d322e519", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)" + "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143" }, { "type": "UncategorizedText", - "element_id": "901e0c80da6bc4ab586f53c474e71426", + "element_id": "1f1a0fac1bae95f076ea34c955551632", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J." + "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)" }, { "type": "UncategorizedText", - "element_id": "2785f642a6abf514e88b8560ac84509f", + "element_id": "00d6ff1b3fb21f8a608f3b6269df56be", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "2007(159), 2 (Jul 2007)" + "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)" }, { "type": "UncategorizedText", - "element_id": "1f1a0fac1bae95f076ea34c955551632", + "element_id": "deecdfacbce71dd1425fd54010b2fad1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)" + "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)" }, { "type": "UncategorizedText", - "element_id": "0aabfb2a8e358618179ec2e1d322e519", + "element_id": "91c801cdf5af0b4fccd7f65d711d447a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143" + "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the" }, { "type": "UncategorizedText", - "element_id": "df18427a8013b4df36e8ac4e2ee5da3a", + "element_id": "c8f5863d94cc9b9d77f153c6d1b0015a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767" + "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)" }, { "type": "UncategorizedText", - "element_id": "257e7b8aef89c41e03bf837ea517885e", + "element_id": "7ceaba2290e3f9c5f3754032ce4d5663", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)" + "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)" }, { "type": "UncategorizedText", - "element_id": "00c7abdd98fedd1746994d16ca44d45f", + "element_id": "ca3e69faa45c756ca454e118ff12f597", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)" + "text": "[9]" }, { "type": "UncategorizedText", - "element_id": "7a0afd734c99f6b076dc58b2e57cfec6", + "element_id": "a24a28ea9e86cd280c8f7887ad2d0d99", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)" + "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)" }, { "type": "UncategorizedText", - "element_id": "00d6ff1b3fb21f8a608f3b6269df56be", + "element_id": "2785f642a6abf514e88b8560ac84509f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)" + "text": "2007(159), 2 (Jul 2007)" }, { "type": "UncategorizedText", - "element_id": "deecdfacbce71dd1425fd54010b2fad1", + "element_id": "ecabb9a495995008845e44ab44bbda42", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)" + "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)" }, { "type": "UncategorizedText", @@ -1883,13 +1883,13 @@ }, { "type": "UncategorizedText", - "element_id": "22364b7a1d2b35282b360d61ae08e2b9", + "element_id": "24e0da607349c44bf53c752dd897fc58", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "Z. Shen et al." + "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:" }, { "type": "UncategorizedText", @@ -1963,93 +1963,93 @@ }, { "type": "UncategorizedText", - "element_id": "53dd155fea5e56f1dc9715fab422339a", + "element_id": "4c8ddc159ec208bb7f454603fcd7c4bd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "based layout annotation. arXiv preprint arXiv:2010.01762 (2020)" + "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)" }, { "type": "UncategorizedText", - "element_id": "4c8ddc159ec208bb7f454603fcd7c4bd", + "element_id": "1cd3d88315a8510c5fd1cbc7784db861", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)" + "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) [35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https://" }, { "type": "UncategorizedText", - "element_id": "1cd3d88315a8510c5fd1cbc7784db861", + "element_id": "6c94dd219ce339c358163833e20d099e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) [35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https://" + "text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)" }, { "type": "UncategorizedText", - "element_id": "0626b5a309a35467b0f463c7e36114ce", + "element_id": "13767118077be05d9be07792d1785ecb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "github.com/facebookresearch/detectron2 (2019)" + "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of" }, { "type": "UncategorizedText", - "element_id": "6c94dd219ce339c358163833e20d099e", + "element_id": "869d8bf2a40ba100a2d83b864d9a654f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)" + "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166" }, { "type": "UncategorizedText", - "element_id": "13767118077be05d9be07792d1785ecb", + "element_id": "df2e0c46ac32660311c5e8b2bee4c16d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of" + "text": "text and layout for document image understanding (2019)" }, { "type": "UncategorizedText", - "element_id": "df2e0c46ac32660311c5e8b2bee4c16d", + "element_id": "53dd155fea5e56f1dc9715fab422339a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "text and layout for document image understanding (2019)" + "text": "based layout annotation. arXiv preprint arXiv:2010.01762 (2020)" }, { "type": "UncategorizedText", - "element_id": "24e0da607349c44bf53c752dd897fc58", + "element_id": "0626b5a309a35467b0f463c7e36114ce", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:" + "text": "github.com/facebookresearch/detectron2 (2019)" }, { "type": "UncategorizedText", - "element_id": "869d8bf2a40ba100a2d83b864d9a654f", + "element_id": "22364b7a1d2b35282b360d61ae08e2b9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166" + "text": "Z. Shen et al." }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index c8d0b34958..1325448990 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -29,6 +29,16 @@ }, "text": "2023 JAN" }, + { + "type": "UncategorizedText", + "element_id": "dc3e35f42d4566d77a9bf08d9ce44677", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE" + }, { "type": "Title", "element_id": "12d4f57c3f43b0afbdf88305940258bc", @@ -129,6 +139,36 @@ }, "text": "In the fourth quarter of 2022, however, this uptick is estimated to have faded in most—though not all––major economies. US growth remains stronger than expected, with consumers continuing to spend from their stock of savings (the personal saving rate is at its lowest in more than 60 years, except for July 2005), unemployment near historic lows, and plentiful job opportunities. But elsewhere, high-frequency activity indicators (such as business and consumer sentiment, purchasing manager surveys, and mobility indicators) generally point to a slowdown." }, + { + "type": "UncategorizedText", + "element_id": "b3080428cb4e8896623bf36c001e868a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "International Monetary Fund | January 2023" + }, + { + "type": "UncategorizedText", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "1" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, { "type": "NarrativeText", "element_id": "15d7968ef76d05b9b7d490cd2ebe6550", @@ -419,6 +459,36 @@ }, "text": "Winter comes to Europe. European economic growth in 2022 was more resilient than expected in the face of the large negative terms-of-trade shock from the war in Ukraine. This resilience––which is" }, + { + "type": "UncategorizedText", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "2" + }, + { + "type": "UncategorizedText", + "element_id": "b3080428cb4e8896623bf36c001e868a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "International Monetary Fund | January 2023" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, { "type": "NarrativeText", "element_id": "83ce77349b07c275543d551c2c016370", @@ -509,6 +579,36 @@ }, "text": "In the United States, growth is projected to fall from 2.0 percent in 2022 to 1.4 percent in 2023 and 1.0 percent in 2024. With growth rebounding in the second half of 2024, growth in 2024 will be faster than in 2023 on a fourth-quarter-over-fourth-quarter basis, as in most advanced" }, + { + "type": "UncategorizedText", + "element_id": "b3080428cb4e8896623bf36c001e868a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "International Monetary Fund | January 2023" + }, + { + "type": "UncategorizedText", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "3" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 5 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, { "type": "UncategorizedText", "element_id": "67f04acf5353c625d003fd003acb56f3", @@ -629,6 +729,36 @@ }, "text": "" }, + { + "type": "UncategorizedText", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 5 + }, + "text": "4" + }, + { + "type": "UncategorizedText", + "element_id": "b3080428cb4e8896623bf36c001e868a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 5 + }, + "text": "International Monetary Fund | January 2023" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, { "type": "UncategorizedText", "element_id": "c9b8a2f221ce7ec3213fcf4d9ce8879c", @@ -639,6 +769,16 @@ }, "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth." }, + { + "type": "UncategorizedText", + "element_id": "3f79bb7b435b05321651daefd374cdc6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "e" + }, { "type": "UncategorizedText", "element_id": "dc5a043db14fe0dd4f128194aa8a9b77", @@ -649,6 +789,16 @@ }, "text": "" }, + { + "type": "UncategorizedText", + "element_id": "25e2f1dc031b5421b8a234945098e58b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." + }, { "type": "Title", "element_id": "3dfc45d3333ae253d78008c8cde2d752", @@ -699,6 +849,36 @@ }, "text": "The balance of risks to the global outlook remains tilted to the downside, with scope for lower growth and higher inflation, but adverse risks have moderated since the October 2022 World Economic Outlook." }, + { + "type": "UncategorizedText", + "element_id": "8ae18586f23aa212e66aeb12a5638609", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "International Monetary Fund | January 2023." + }, + { + "type": "UncategorizedText", + "element_id": "ef2d127de37b942baad06145e54b0c61", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 6 + }, + "text": "5" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, { "type": "NarrativeText", "element_id": "1ad611b76683e54171ae0b1fddd827ca", @@ -1081,43 +1261,43 @@ }, { "type": "UncategorizedText", - "element_id": "c516993a0b87171b00b1077d5dd6d74c", + "element_id": "adbebe93f3dd02702155f1137d795bee", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "2.4 0.7 4.3 3.2 4.0 4.9" + "text": "6.6 4.6 8.1" }, { "type": "UncategorizedText", - "element_id": "adbebe93f3dd02702155f1137d795bee", + "element_id": "3f2b6e9cef343067dfeca9879f9b84c1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "6.6 4.6 8.1" + "text": "4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2" }, { "type": "UncategorizedText", - "element_id": "edc938a1a24c61a1194b6c639e3990d6", + "element_id": "c516993a0b87171b00b1077d5dd6d74c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "2.9" + "text": "2.4 0.7 4.3 3.2 4.0 4.9" }, { "type": "UncategorizedText", - "element_id": "3f2b6e9cef343067dfeca9879f9b84c1", + "element_id": "edc938a1a24c61a1194b6c639e3990d6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2" + "text": "2.9" }, { "type": "UncategorizedText", @@ -1371,23 +1551,23 @@ }, { "type": "UncategorizedText", - "element_id": "79b864b70b3eef393667d557fa5270ed", + "element_id": "80dcbb0dc10d8c1c50d9abaca086b5b6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2" + "text": "–0.9 0.3" }, { "type": "UncategorizedText", - "element_id": "80dcbb0dc10d8c1c50d9abaca086b5b6", + "element_id": "79b864b70b3eef393667d557fa5270ed", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.9 0.3" + "text": "–0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2" }, { "type": "UncategorizedText", @@ -1461,23 +1641,23 @@ }, { "type": "UncategorizedText", - "element_id": "a3d3441a9c8b1f9f5815fd747f7173b0", + "element_id": "749499bcae2a8ac4114de139a7375ab3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "1.7 1.8 3.7 . . . 2.5 . . ." + "text": "1.9" }, { "type": "UncategorizedText", - "element_id": "749499bcae2a8ac4114de139a7375ab3", + "element_id": "a3d3441a9c8b1f9f5815fd747f7173b0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "1.9" + "text": "1.7 1.8 3.7 . . . 2.5 . . ." }, { "type": "UncategorizedText", @@ -1521,53 +1701,53 @@ }, { "type": "UncategorizedText", - "element_id": "40696241a92836702680491eb3d6f31d", + "element_id": "2ebea8c404fdbf0f73f7190810d22cf8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5" + "text": "5.0 3.1 6.6" }, { "type": "UncategorizedText", - "element_id": "2ebea8c404fdbf0f73f7190810d22cf8", + "element_id": "bdcf83d42c2f79ce5b532b91ece15d53", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "5.0 3.1 6.6" + "text": "2.5 1.2 5.7 . . . 5.0 . . ." }, { "type": "UncategorizedText", - "element_id": "73356c80e67292eab1403c43e9bbb6fa", + "element_id": "56010fa3453e21765ccdf5076960ad2c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": ". . . . . . . . ." + "text": "3.2" }, { "type": "UncategorizedText", - "element_id": "56010fa3453e21765ccdf5076960ad2c", + "element_id": "73356c80e67292eab1403c43e9bbb6fa", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "3.2" + "text": ". . . . . . . . ." }, { "type": "UncategorizedText", - "element_id": "bdcf83d42c2f79ce5b532b91ece15d53", + "element_id": "40696241a92836702680491eb3d6f31d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "2.5 1.2 5.7 . . . 5.0 . . ." + "text": "5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5" }, { "type": "UncategorizedText", @@ -1591,63 +1771,63 @@ }, { "type": "UncategorizedText", - "element_id": "73356c80e67292eab1403c43e9bbb6fa", + "element_id": "97b19eab144ecd45afce13280ebf73f0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": ". . . . . . . . ." + "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8" }, { "type": "UncategorizedText", - "element_id": "97b19eab144ecd45afce13280ebf73f0", + "element_id": "54add9deb165bffe2c1e635146114e1d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8" + "text": "2.5 2.0 4.0 . . . 4.1 . . ." }, { "type": "UncategorizedText", - "element_id": "54add9deb165bffe2c1e635146114e1d", + "element_id": "df7318d8afd930b2f2049127f4e90e0e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "2.5 2.0 4.0 . . . 4.1 . . ." + "text": "3.5 2.3 4.5" }, { "type": "UncategorizedText", - "element_id": "15e586af718d7b1b04fc8fb83f83f814", + "element_id": "82eb710aeec934b729906546268a7087", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2" + "text": "3.0" }, { "type": "UncategorizedText", - "element_id": "82eb710aeec934b729906546268a7087", + "element_id": "15e586af718d7b1b04fc8fb83f83f814", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "3.0" + "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2" }, { "type": "UncategorizedText", - "element_id": "df7318d8afd930b2f2049127f4e90e0e", + "element_id": "73356c80e67292eab1403c43e9bbb6fa", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "3.5 2.3 4.5" + "text": ". . . . . . . . ." }, { "type": "NarrativeText", @@ -1689,6 +1869,36 @@ }, "text": "support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption—particularly of services, including tourism." }, + { + "type": "UncategorizedText", + "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "6" + }, + { + "type": "UncategorizedText", + "element_id": "b3080428cb4e8896623bf36c001e868a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "International Monetary Fund | January 2023" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, { "type": "UncategorizedText", "element_id": "a6e6e147daf229e8267d85c3e49f7250", @@ -1820,14 +2030,44 @@ "text": "earlier geopolitical tensions, such as those associated with the US-China trade dispute." }, { - "type": "NarrativeText", - "element_id": "6684fee3e3cd949ec59e7444a0c3fd0c", + "type": "UncategorizedText", + "element_id": "8ae18586f23aa212e66aeb12a5638609", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": "Fragmentation could intensify—with more restrictions on cross-border movements of capital, workers, and international payments—and could hamper multilateral cooperation on providing global public goods.1 The costs of such fragmentation are especially high in the short term, as replacing disrupted cross-border flows takes time." + "text": "International Monetary Fund | January 2023." + }, + { + "type": "UncategorizedText", + "element_id": "7902699be42c8a8e46fbbb4501726517", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "7" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, + { + "type": "NarrativeText", + "element_id": "6684fee3e3cd949ec59e7444a0c3fd0c", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": "Fragmentation could intensify—with more restrictions on cross-border movements of capital, workers, and international payments—and could hamper multilateral cooperation on providing global public goods.1 The costs of such fragmentation are especially high in the short term, as replacing disrupted cross-border flows takes time." }, { "type": "Title", @@ -1899,6 +2139,36 @@ }, "text": "1 See “Geo-Economic Fragmentation and the Future of Multilateralism,” IMF Staff Discussion Note 2023/001." }, + { + "type": "UncategorizedText", + "element_id": "2c624232cdd221771294dfbb310aca00", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": "8" + }, + { + "type": "UncategorizedText", + "element_id": "b3080428cb4e8896623bf36c001e868a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": "International Monetary Fund | January 2023" + }, + { + "type": "UncategorizedText", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 10 + }, + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + }, { "type": "NarrativeText", "element_id": "2e9a0eaddd75095d1bbb4fda6f2c4feb", @@ -1959,6 +2229,26 @@ }, "text": " Strengthening global trade: Strengthening the global trading system would address risks associated" }, + { + "type": "UncategorizedText", + "element_id": "e6f343736720ae4f9bf5202294c7c9fc", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 10 + }, + "text": "trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." + }, + { + "type": "UncategorizedText", + "element_id": "0695b563acde461fc2f8d9aebccf35c7", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 10 + }, + "text": "with" + }, { "type": "UncategorizedText", "element_id": "a5751b5964fbbc37b14db4811aeb37f4", @@ -1990,134 +2280,144 @@ "text": "implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." }, { - "type": "NarrativeText", - "element_id": "a2fa3a13e51ab7dd0859ee2c869b70e5", + "type": "UncategorizedText", + "element_id": "b3080428cb4e8896623bf36c001e868a", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies." + "text": "International Monetary Fund | January 2023" }, { "type": "UncategorizedText", - "element_id": "be54412821e5c5c1d8dd726e2a1518f1", + "element_id": "19581e27de7ced00ff1ce50b2047e7a5", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." + "text": "9" }, { "type": "UncategorizedText", - "element_id": "f79a09409db68af141e82d9ac113ded8", + "element_id": "591fe13f580ac8ffd1d3902f5f74c6f1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)" + "text": "BOX 1. GL AL FINANCIAL STABILITY UPDATE —— — other" + }, + { + "type": "NarrativeText", + "element_id": "a2fa3a13e51ab7dd0859ee2c869b70e5", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 11 + }, + "text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies." }, { "type": "UncategorizedText", - "element_id": "10159baf262b43a92d95db59dae1f72c", + "element_id": "375adf2b48cfbcfa82de8c38d2d31569", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "7" + "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need © —— Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report. to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess theit outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." }, { "type": "UncategorizedText", - "element_id": "06e9d52c1720fca412803e3b07c4b228", + "element_id": "be54412821e5c5c1d8dd726e2a1518f1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "6" + "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." }, { "type": "UncategorizedText", - "element_id": "f0b5c2c2211c8d67ed15e75e656c7862", + "element_id": "4355a46b19d348dc2f57c046f8ef63d4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "1" }, { "type": "UncategorizedText", - "element_id": "7de1555df0c2700329e815b93b32c571", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "3bb936c1a0a051d8786700b630295974", + "element_id": "1121cfccd5913f0a63fec40a6ffd44ea", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "United States Euro area China Other AEs Other EMs" + "text": "3" }, { "type": "UncategorizedText", - "element_id": "32d9d432ea30d4913ea73770664638a6", + "element_id": "7de1555df0c2700329e815b93b32c571", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "October 2022 GFSR" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "1121cfccd5913f0a63fec40a6ffd44ea", + "element_id": "f0b5c2c2211c8d67ed15e75e656c7862", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "5" }, { "type": "UncategorizedText", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "06e9d52c1720fca412803e3b07c4b228", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "6" }, { "type": "UncategorizedText", - "element_id": "4355a46b19d348dc2f57c046f8ef63d4", + "element_id": "9511bbd64fb873accac793ad8191b19b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" }, { "type": "UncategorizedText", - "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", + "element_id": "f79a09409db68af141e82d9ac113ded8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "0" + "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)" }, { "type": "UncategorizedText", @@ -2151,283 +2451,283 @@ }, { "type": "UncategorizedText", - "element_id": "b7b22a1a0dd52d65e13ef53d11836932", + "element_id": "d78f392a386b26aa260548d71936abff", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "2006 08 08" + "text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report." }, { "type": "UncategorizedText", - "element_id": "09249963490d90835afd8926fbb61e62", + "element_id": "b1b074f02d43a5ace5633ff6df86e7c8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "06" + "text": "Oct. 22" }, { "type": "UncategorizedText", - "element_id": "dfdb0f123cf8b8b2bb36f70da82b8804", + "element_id": "10159baf262b43a92d95db59dae1f72c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "10 10" + "text": "7" }, { "type": "UncategorizedText", - "element_id": "cdcd7da134ed48b36d35843e9f31af03", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "12 12" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "0ec960fab1d2953e7149d0f3c7024364", + "element_id": "06e9d52c1720fca412803e3b07c4b228", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "14 16 14" + "text": "6" }, { "type": "UncategorizedText", - "element_id": "e6c21e8d260fe71882debdb339d2402a", + "element_id": "f0b5c2c2211c8d67ed15e75e656c7862", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "16" + "text": "5" }, { "type": "UncategorizedText", - "element_id": "ea079c2747fa5ebeedffbc53cba88970", + "element_id": "7de1555df0c2700329e815b93b32c571", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "18 18" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "c233ba96b340d716275a45f83a22708d", + "element_id": "b7b22a1a0dd52d65e13ef53d11836932", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "20 22 22" + "text": "2006 08 08" }, { "type": "UncategorizedText", - "element_id": "5378796307535df3ec8d8b15a2e2dc56", + "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "20" + "text": "0" }, { "type": "UncategorizedText", - "element_id": "d78f392a386b26aa260548d71936abff", + "element_id": "1121cfccd5913f0a63fec40a6ffd44ea", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report." + "text": "3" }, { "type": "UncategorizedText", - "element_id": "9511bbd64fb873accac793ad8191b19b", + "element_id": "4355a46b19d348dc2f57c046f8ef63d4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" + "text": "1" }, { "type": "UncategorizedText", - "element_id": "c3e0fd2d4e37ea55e6ebfcf6c544a70e", + "element_id": "09249963490d90835afd8926fbb61e62", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Latest" + "text": "06" }, { "type": "UncategorizedText", - "element_id": "08f6e8fb9ba5e2b9c1e4eb6696d00610", + "element_id": "fbd1fa1628fb372ebbbdd0cd0b6a2319", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "October 2022 GFSR" + "text": "Apr. 23" }, { "type": "UncategorizedText", - "element_id": "06e9d52c1720fca412803e3b07c4b228", + "element_id": "de5fef2c6bd3b387d19639dbc8784016", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "6" + "text": "1. United States" }, { "type": "UncategorizedText", - "element_id": "de5fef2c6bd3b387d19639dbc8784016", + "element_id": "7b7dee8a185c9d1212af8b334518d5e1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "1. United States" + "text": "Oct. 23" }, { "type": "UncategorizedText", - "element_id": "d91dca464ef07b2054c553f896fdb9ca", + "element_id": "c3e0fd2d4e37ea55e6ebfcf6c544a70e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "2. Euro area" + "text": "Latest" }, { "type": "UncategorizedText", - "element_id": "f0b5c2c2211c8d67ed15e75e656c7862", + "element_id": "dfdb0f123cf8b8b2bb36f70da82b8804", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "10 10" }, { "type": "UncategorizedText", - "element_id": "7de1555df0c2700329e815b93b32c571", + "element_id": "d3c564f9dc1a801c1f91d8490a72b2cf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "Dec. 24" }, { "type": "UncategorizedText", - "element_id": "1121cfccd5913f0a63fec40a6ffd44ea", + "element_id": "cdcd7da134ed48b36d35843e9f31af03", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "12 12" }, { "type": "UncategorizedText", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "c69708757c0b20d59b95ce78d00f4fec", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "Dec. 26" }, { "type": "UncategorizedText", - "element_id": "4355a46b19d348dc2f57c046f8ef63d4", + "element_id": "3bb936c1a0a051d8786700b630295974", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "United States Euro area China Other AEs Other EMs" }, { "type": "UncategorizedText", - "element_id": "b1b074f02d43a5ace5633ff6df86e7c8", + "element_id": "0ec960fab1d2953e7149d0f3c7024364", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "14 16 14" }, { "type": "UncategorizedText", - "element_id": "fbd1fa1628fb372ebbbdd0cd0b6a2319", + "element_id": "b1b074f02d43a5ace5633ff6df86e7c8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 23" + "text": "Oct. 22" }, { "type": "UncategorizedText", - "element_id": "7b7dee8a185c9d1212af8b334518d5e1", + "element_id": "08f6e8fb9ba5e2b9c1e4eb6696d00610", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 23" + "text": "October 2022 GFSR" }, { "type": "UncategorizedText", - "element_id": "d3c564f9dc1a801c1f91d8490a72b2cf", + "element_id": "d91dca464ef07b2054c553f896fdb9ca", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 24" + "text": "2. Euro area" }, { "type": "UncategorizedText", - "element_id": "c69708757c0b20d59b95ce78d00f4fec", + "element_id": "e6c21e8d260fe71882debdb339d2402a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 26" + "text": "16" }, { "type": "UncategorizedText", - "element_id": "b1b074f02d43a5ace5633ff6df86e7c8", + "element_id": "fbd1fa1628fb372ebbbdd0cd0b6a2319", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "Apr. 23" }, { "type": "UncategorizedText", - "element_id": "fbd1fa1628fb372ebbbdd0cd0b6a2319", + "element_id": "ea079c2747fa5ebeedffbc53cba88970", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 23" + "text": "18 18" }, { "type": "UncategorizedText", @@ -2439,6 +2739,16 @@ }, "text": "Oct. 23" }, + { + "type": "UncategorizedText", + "element_id": "c233ba96b340d716275a45f83a22708d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 11 + }, + "text": "20 22 22" + }, { "type": "UncategorizedText", "element_id": "d3c564f9dc1a801c1f91d8490a72b2cf", @@ -2451,33 +2761,43 @@ }, { "type": "UncategorizedText", - "element_id": "c69708757c0b20d59b95ce78d00f4fec", + "element_id": "5378796307535df3ec8d8b15a2e2dc56", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 26" + "text": "20" }, { "type": "UncategorizedText", - "element_id": "f0b5c2c2211c8d67ed15e75e656c7862", + "element_id": "32d9d432ea30d4913ea73770664638a6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "October 2022 GFSR" }, { "type": "UncategorizedText", - "element_id": "7de1555df0c2700329e815b93b32c571", + "element_id": "c69708757c0b20d59b95ce78d00f4fec", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "Dec. 26" + }, + { + "type": "UncategorizedText", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 11 + }, + "text": "2" }, { "type": "UncategorizedText", @@ -2491,13 +2811,13 @@ }, { "type": "UncategorizedText", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "7de1555df0c2700329e815b93b32c571", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "4" }, { "type": "UncategorizedText", @@ -2509,6 +2829,16 @@ }, "text": "1" }, + { + "type": "UncategorizedText", + "element_id": "f0b5c2c2211c8d67ed15e75e656c7862", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 11 + }, + "text": "5" + }, { "type": "NarrativeText", "element_id": "a404b982431c5d79e96fa2c0fdd1544d", @@ -2519,6 +2849,16 @@ }, "text": "Financial market volatility is expected to remain elevated and could be exacerbated by poor market liquidity. For some asset classes (such as US Treasuries), liquidity has deteriorated to the March 2020 lows of the COVID-19 pandemic. With the process of central bank balance sheet reduction (quantitative tightening) underway, market liquidity is expected to remain challenging." }, + { + "type": "UncategorizedText", + "element_id": "bab943d841e99d44807adb96ef9ef925", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 11 + }, + "text": "10 — International Monetary Fund | January 2023" + }, { "type": "UncategorizedText", "element_id": "06d12185958a014c0c9d6afeab7426c2", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json index e216c1e597..39350f940e 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -1,4 +1,14 @@ [ + { + "type": "UncategorizedText", + "element_id": "4c4df5a5d648146ad05a437213014e72", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "WORLD NUCLEAR //s88ciation" + }, { "type": "UncategorizedText", "element_id": "51174df4a3a78fe261885b1818b66876", @@ -51,23 +61,23 @@ }, { "type": "NarrativeText", - "element_id": "ae77460bce2d3a52d823954ccb9c708f", + "element_id": "8e1e0570b2ba9211cc184c21a3ffbf90", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Energy is the essential agent for promoting human development, and global demand is projected to increase significantly in the coming decades. Securing access to modern and affordable energy is essential for lifting people out of poverty, and for promoting energy independence and economic growth." + "text": "Nuclear energy is a proven solution with a long and well-established track record. Nuclear reactors – a grand total of 445 in 30 countries – are the low-carbon backbone of electricity systems, operating in the background, day in and day out, often out of sight and out of mind. Capable of generating immense amounts of clean power, they are the silent giants upon which we rely daily." }, { "type": "NarrativeText", - "element_id": "8e1e0570b2ba9211cc184c21a3ffbf90", + "element_id": "ae77460bce2d3a52d823954ccb9c708f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Nuclear energy is a proven solution with a long and well-established track record. Nuclear reactors – a grand total of 445 in 30 countries – are the low-carbon backbone of electricity systems, operating in the background, day in and day out, often out of sight and out of mind. Capable of generating immense amounts of clean power, they are the silent giants upon which we rely daily." + "text": "Energy is the essential agent for promoting human development, and global demand is projected to increase significantly in the coming decades. Securing access to modern and affordable energy is essential for lifting people out of poverty, and for promoting energy independence and economic growth." }, { "type": "NarrativeText", @@ -181,23 +191,23 @@ }, { "type": "UncategorizedText", - "element_id": "3034a0577f73a1b5076e3a6e4e209252", + "element_id": "62f86400f0347bdbe07e40c3063fd3bb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "40,000" + "text": "h W T" }, { "type": "UncategorizedText", - "element_id": "8635a2005fe7dc2491c5d0f888d863ca", + "element_id": "3034a0577f73a1b5076e3a6e4e209252", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " CSP" + "text": "40,000" }, { "type": "UncategorizedText", @@ -211,223 +221,293 @@ }, { "type": "UncategorizedText", - "element_id": "3c01e983d1c0b6e1f7e5009554c15bfc", + "element_id": "ba980165f95d647935bbb000d0a9bf57", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Solar PV" + "text": "30,000" }, { "type": "UncategorizedText", - "element_id": "24fda359cfaadc2b4765e4d96cc580bd", + "element_id": "e5a636fefc8e5e0d9483c6ffdfe6fa2d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Geothermal" + "text": "25,000" }, { "type": "UncategorizedText", - "element_id": "ba980165f95d647935bbb000d0a9bf57", + "element_id": "6ea17978f8b5646d74e3640938409b0b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "30,000" + "text": "20,000" }, { "type": "UncategorizedText", - "element_id": "b4686d8980288cc18e25dd88862018fb", + "element_id": "f6e7c5652bc0607d83b62e5075496c9a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Wind" + "text": "15,000" }, { "type": "UncategorizedText", - "element_id": "62f86400f0347bdbe07e40c3063fd3bb", + "element_id": "ebf087fd620248057b4b331f6ff855ce", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "h W T" + "text": "10,000" }, { "type": "UncategorizedText", - "element_id": "e5a636fefc8e5e0d9483c6ffdfe6fa2d", + "element_id": "47af55050b9d14f32f04c4a2745263ae", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "25,000" + "text": "5,000" }, { "type": "UncategorizedText", - "element_id": "6a5e5dc7ebe4536d6c4a8358ceae6b52", + "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Bioenergy" + "text": "0" }, { "type": "UncategorizedText", - "element_id": "6ea17978f8b5646d74e3640938409b0b", + "element_id": "4a60bf7d4bc1e485744cf7e8d0860524", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "20,000" + "text": "zz" }, { "type": "UncategorizedText", - "element_id": "f3869bfc418f6b38b211f34af7bbbee9", + "element_id": "7ace431cb61584cb9b8dc7ec08cf38ac", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Hydro" + "text": "~" }, { "type": "UncategorizedText", - "element_id": "b1dad634c86b14162ce382a54d48adc4", + "element_id": "bda050585a00f0f6cb502350559d7553", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Nuclear" + "text": "—" }, { "type": "UncategorizedText", - "element_id": "f6e7c5652bc0607d83b62e5075496c9a", + "element_id": "380918b946a526640a40df5dced65167", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "15,000" + "text": "=" }, { "type": "UncategorizedText", - "element_id": "17c583e364eb53f4713ee1b11c10eae5", + "element_id": "bda050585a00f0f6cb502350559d7553", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Gas" + "text": "—" }, { "type": "UncategorizedText", - "element_id": "ebf087fd620248057b4b331f6ff855ce", + "element_id": "bda050585a00f0f6cb502350559d7553", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "10,000" + "text": "—" }, { "type": "UncategorizedText", - "element_id": "79615d00dc2ebc2416ace53ae6e85050", + "element_id": "9911f4d2b18457c4726664d309385072", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Oil" + "text": "__" }, { "type": "UncategorizedText", - "element_id": "47af55050b9d14f32f04c4a2745263ae", + "element_id": "1d8fa3c8ab49d50b30fccbbd901735d5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "5,000" + "text": "2000" }, { "type": "UncategorizedText", - "element_id": "6ffe26de44efed734882cd46e5aec24e", + "element_id": "9aa2c4daec913f415a0d0e1cde7b9c2f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " Coal" + "text": "2010" }, { "type": "UncategorizedText", - "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", + "element_id": "6961729a2feb7f501ade1082297c00ff", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "0" + "text": "2020" }, { "type": "UncategorizedText", - "element_id": "1d8fa3c8ab49d50b30fccbbd901735d5", + "element_id": "c5aeba7f7b48e1efc2d73205a4e08c18", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2000" + "text": "2030" }, { "type": "UncategorizedText", - "element_id": "9aa2c4daec913f415a0d0e1cde7b9c2f", + "element_id": "1633715e5907ba95a1e67918b4489b36", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2010" + "text": "2040" }, { "type": "UncategorizedText", - "element_id": "6961729a2feb7f501ade1082297c00ff", + "element_id": "8635a2005fe7dc2491c5d0f888d863ca", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2020" + "text": " CSP" }, { "type": "UncategorizedText", - "element_id": "c5aeba7f7b48e1efc2d73205a4e08c18", + "element_id": "3c01e983d1c0b6e1f7e5009554c15bfc", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2030" + "text": " Solar PV" }, { "type": "UncategorizedText", - "element_id": "1633715e5907ba95a1e67918b4489b36", + "element_id": "24fda359cfaadc2b4765e4d96cc580bd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2040" + "text": " Geothermal" + }, + { + "type": "UncategorizedText", + "element_id": "b4686d8980288cc18e25dd88862018fb", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " Wind" + }, + { + "type": "UncategorizedText", + "element_id": "6a5e5dc7ebe4536d6c4a8358ceae6b52", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " Bioenergy" + }, + { + "type": "UncategorizedText", + "element_id": "f3869bfc418f6b38b211f34af7bbbee9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " Hydro" + }, + { + "type": "UncategorizedText", + "element_id": "b1dad634c86b14162ce382a54d48adc4", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " Nuclear" + }, + { + "type": "UncategorizedText", + "element_id": "17c583e364eb53f4713ee1b11c10eae5", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " Gas" + }, + { + "type": "UncategorizedText", + "element_id": "79615d00dc2ebc2416ace53ae6e85050", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " Oil" + }, + { + "type": "UncategorizedText", + "element_id": "6ffe26de44efed734882cd46e5aec24e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " Coal" }, { "type": "NarrativeText", @@ -461,23 +541,23 @@ }, { "type": "UncategorizedText", - "element_id": "1d4566e4a45f0c4aba925b605c5c29a9", + "element_id": "f6c1b0e99bd5f8d7ca3c879d69bca871", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "30,000,000" + "text": "h W G" }, { "type": "UncategorizedText", - "element_id": "57e953601d50f87b811dd3a1176459b8", + "element_id": "1d4566e4a45f0c4aba925b605c5c29a9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": " High-carbon  Low-carbon" + "text": "30,000,000" }, { "type": "UncategorizedText", @@ -499,16 +579,6 @@ }, "text": "20,000,000" }, - { - "type": "UncategorizedText", - "element_id": "f6c1b0e99bd5f8d7ca3c879d69bca871", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "h W G" - }, { "type": "UncategorizedText", "element_id": "565954b7c6b77d26b0456d4ed8086bae", @@ -549,6 +619,16 @@ }, "text": "0" }, + { + "type": "UncategorizedText", + "element_id": "57e953601d50f87b811dd3a1176459b8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": " High-carbon  Low-carbon" + }, { "type": "UncategorizedText", "element_id": "2ab513b1608dd3d214980f32cdf095ca", @@ -869,6 +949,16 @@ }, "text": "Offshore Wind" }, + { + "type": "UncategorizedText", + "element_id": "0e6fac6a3ad129a64c2b9d6eaf6680e4", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "SS" + }, { "type": "UncategorizedText", "element_id": "6eb2aff0d17ccc5c242423c49cd5b462", @@ -921,23 +1011,23 @@ }, { "type": "NarrativeText", - "element_id": "13ff2375260e277c2dfbc8826aa50a65", + "element_id": "4b3dad9b769c100e89b2c082e7d9e13e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Additionally, electricity markets fail to recognize the relative costs of different forms of electricity generation. Whilst the nuclear industry takes responsibility for its lifecycle costs (including decommissioning and waste management), other electricity generators do not. Fossil fuel generators are rarely required to pay the price in line with the environmental and health damage that their emissions cause, whilst the cost of wind and solar does not include the disposal of the sometimes toxic materials at the end of their life." + "text": "In regard to the need to harmonize regulations, multiple regulatory barriers stemming from diverse national licensing processes and safety requirements currently limit global nuclear trade and investment. A lack of international standardization places unnecessary regulatory burdens on nuclear activities and causes delays in the licensing of new designs, hindering innovation." }, { "type": "NarrativeText", - "element_id": "4b3dad9b769c100e89b2c082e7d9e13e", + "element_id": "13ff2375260e277c2dfbc8826aa50a65", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "In regard to the need to harmonize regulations, multiple regulatory barriers stemming from diverse national licensing processes and safety requirements currently limit global nuclear trade and investment. A lack of international standardization places unnecessary regulatory burdens on nuclear activities and causes delays in the licensing of new designs, hindering innovation." + "text": "Additionally, electricity markets fail to recognize the relative costs of different forms of electricity generation. Whilst the nuclear industry takes responsibility for its lifecycle costs (including decommissioning and waste management), other electricity generators do not. Fossil fuel generators are rarely required to pay the price in line with the environmental and health damage that their emissions cause, whilst the cost of wind and solar does not include the disposal of the sometimes toxic materials at the end of their life." }, { "type": "NarrativeText", @@ -979,6 +1069,16 @@ }, "text": "140" }, + { + "type": "UncategorizedText", + "element_id": "fb67f6e44a2659caaa0e28f08280eb3c", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "a t a F" + }, { "type": "UncategorizedText", "element_id": "e98b5c1980fd411c375277e496c5bf24", @@ -1011,13 +1111,23 @@ }, { "type": "UncategorizedText", - "element_id": "fb67f6e44a2659caaa0e28f08280eb3c", + "element_id": "380918b946a526640a40df5dced65167", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "a t a F" + "text": "=" + }, + { + "type": "UncategorizedText", + "element_id": "87d56fd00875b76bf2209a535fa9167a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "“99 :" }, { "type": "UncategorizedText", @@ -1051,13 +1161,13 @@ }, { "type": "UncategorizedText", - "element_id": "95aebc97bc646c67fdcd923a5965b001", + "element_id": "673650f936cb3b0a2f93ce09d81be107", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "80" + "text": "40" }, { "type": "UncategorizedText", @@ -1071,23 +1181,23 @@ }, { "type": "UncategorizedText", - "element_id": "673650f936cb3b0a2f93ce09d81be107", + "element_id": "5378796307535df3ec8d8b15a2e2dc56", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "40" + "text": "20" }, { "type": "UncategorizedText", - "element_id": "5378796307535df3ec8d8b15a2e2dc56", + "element_id": "95aebc97bc646c67fdcd923a5965b001", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "20" + "text": "80" }, { "type": "UncategorizedText", @@ -1101,13 +1211,23 @@ }, { "type": "UncategorizedText", - "element_id": "97b912eb4a61df5f806ca6239dde3e1a", + "element_id": "fc45e9339e2e3d1124113113be364f44", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "120" + "text": ":" + }, + { + "type": "UncategorizedText", + "element_id": "a365e757e54096274d39b6cd5c4032a9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "1 :" }, { "type": "UncategorizedText", @@ -1119,6 +1239,16 @@ }, "text": "C oal" }, + { + "type": "UncategorizedText", + "element_id": "97b912eb4a61df5f806ca6239dde3e1a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "120" + }, { "type": "UncategorizedText", "element_id": "c0971c6577849aec2c2952e231a7df2d", @@ -1139,6 +1269,16 @@ }, "text": "Oil" }, + { + "type": "UncategorizedText", + "element_id": "e942714ca8bde7c2efa0a45180d3bcb0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "N atural gas" + }, { "type": "UncategorizedText", "element_id": "325eaeb55b0a5f734132cd4762be703b", @@ -1151,43 +1291,73 @@ }, { "type": "UncategorizedText", - "element_id": "e942714ca8bde7c2efa0a45180d3bcb0", + "element_id": "593cbe414f10662e62c0da03ce3302b8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "N atural gas" + "text": "fe)" }, { "type": "UncategorizedText", - "element_id": "fcad23b33ff09a15b2bb9173b83240b6", + "element_id": "bccc48f4114eeff10d7a7172184e9e20", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "8.5" + "text": "Offshore wind" }, { "type": "UncategorizedText", - "element_id": "593ee7fbbc541c411bbecfec38b4592a", + "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "1.78" + "text": ":" }, { "type": "UncategorizedText", - "element_id": "bccc48f4114eeff10d7a7172184e9e20", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Offshore wind" + "text": "1" + }, + { + "type": "UncategorizedText", + "element_id": "1b16b1df538ba12dc3f97edbb85caa70", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "n" + }, + { + "type": "UncategorizedText", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "1" + }, + { + "type": "UncategorizedText", + "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "." }, { "type": "UncategorizedText", @@ -1199,6 +1369,16 @@ }, "text": "(U K)" }, + { + "type": "UncategorizedText", + "element_id": "fcad23b33ff09a15b2bb9173b83240b6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "8.5" + }, { "type": "UncategorizedText", "element_id": "015245e79f7a8e3c76aaa007096e495e", @@ -1211,13 +1391,13 @@ }, { "type": "UncategorizedText", - "element_id": "87bab34064761c6eac995b7feca9a87c", + "element_id": "593ee7fbbc541c411bbecfec38b4592a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "0.245" + "text": "1.78" }, { "type": "UncategorizedText", @@ -1231,13 +1411,13 @@ }, { "type": "UncategorizedText", - "element_id": "93d485bc75a984e0b6600c7735eb7e91", + "element_id": "87bab34064761c6eac995b7feca9a87c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "<0.01" + "text": "0.245" }, { "type": "UncategorizedText", @@ -1249,6 +1429,16 @@ }, "text": "N uclear*" }, + { + "type": "UncategorizedText", + "element_id": "93d485bc75a984e0b6600c7735eb7e91", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8 + }, + "text": "<0.01" + }, { "type": "NarrativeText", "element_id": "445676822969fb5177c0081d07449a70", @@ -1529,6 +1719,16 @@ }, "text": "500" }, + { + "type": "UncategorizedText", + "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": "i" + }, { "type": "UncategorizedText", "element_id": "ef45d070844e892dd7274e2b58d343ea", @@ -1559,6 +1759,16 @@ }, "text": "400" }, + { + "type": "UncategorizedText", + "element_id": "30b160442c1de4494644bbb253d47d62", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": "z=" + }, { "type": "UncategorizedText", "element_id": "b1dad634c86b14162ce382a54d48adc4", @@ -1591,33 +1801,63 @@ }, { "type": "UncategorizedText", - "element_id": "a9841ec903dc8bcd39950abd149df9dd", + "element_id": "c11e3f4837efde2441e23a7b9da02131", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": " Natural gas" + "text": "200" }, { "type": "UncategorizedText", - "element_id": "f3869bfc418f6b38b211f34af7bbbee9", + "element_id": "0b06ee5051e3d7dd686665a41ae1f2d9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": " Hydro" + "text": "y ——" }, { "type": "UncategorizedText", - "element_id": "c11e3f4837efde2441e23a7b9da02131", + "element_id": "53bd2f6768df30b079584dd84720693b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": "200" + "text": "-—" + }, + { + "type": "UncategorizedText", + "element_id": "53bd2f6768df30b079584dd84720693b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": "-—" + }, + { + "type": "UncategorizedText", + "element_id": "a9841ec903dc8bcd39950abd149df9dd", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": " Natural gas" + }, + { + "type": "UncategorizedText", + "element_id": "f3869bfc418f6b38b211f34af7bbbee9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 9 + }, + "text": " Hydro" }, { "type": "UncategorizedText", @@ -1829,6 +2069,16 @@ }, "text": "iv" }, + { + "type": "UncategorizedText", + "element_id": "d3fc2842ddfad4c8d3859f84d4439bfd", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 10 + }, + "text": "Vv" + }, { "type": "UncategorizedText", "element_id": "24264aa519b9e325bd189fe67fa3ff8d", @@ -1839,6 +2089,16 @@ }, "text": "v" }, + { + "type": "UncategorizedText", + "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 10 + }, + "text": "vi" + }, { "type": "UncategorizedText", "element_id": "c659791e303eb22c5c5d56ecc0b87608", @@ -1941,32 +2201,32 @@ }, { "type": "UncategorizedText", - "element_id": "821daa4396c0087d9d5ee9240bc5c85c", + "element_id": "de49f1c955d7c8a4d1d6d261c1cf21ba", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org" + "text": "The Silent Giant © 2019 World Nuclear Association. Registered in England and Wales, company number 01215741" }, { "type": "UncategorizedText", - "element_id": "705da4db5e220010ddfd03d9452855e4", + "element_id": "821daa4396c0087d9d5ee9240bc5c85c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate." + "text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org" }, { "type": "UncategorizedText", - "element_id": "de49f1c955d7c8a4d1d6d261c1cf21ba", + "element_id": "705da4db5e220010ddfd03d9452855e4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "The Silent Giant © 2019 World Nuclear Association. Registered in England and Wales, company number 01215741" + "text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate." } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json index 166ef001d4..188ab7d065 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -1,4 +1,14 @@ [ + { + "type": "UncategorizedText", + "element_id": "5d5ae4682bf03035ba590610ca47335d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "WORLD ASSOCIATION" + }, { "type": "UncategorizedText", "element_id": "d72f07e2c764ae90417305db928ebce1", @@ -119,6 +129,16 @@ }, "text": "Perceived versus actual risk" }, + { + "type": "NarrativeText", + "element_id": "3cf0a9c5ad0cacc724f90abbe99664d9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "In reality, radiation is a natural part of life; indeed, we are all exposed to radiation every day, on average receiving 2-3 millisieverts (mSv) per year. Most of this radiation is naturally occurring, with radon gas from the ground being the main source of exposure. The nuclear industry is responsible for a very small part of radiation exposure to the public, as seen in Figure 2. To put this into perspective, eating 10 bananas or two Brazil nuts results in the same radiation dose as living nearby a nuclear power plant for a year. Humans are also naturally radioactive, and the radiation dose from sleeping next to someone else each night for a year is ten times higher than the exposure from living nearby a nuclear power plant for the same time span." + }, { "type": "NarrativeText", "element_id": "ce5bcf6b4fe24d62bd24d156d5bc965e", @@ -151,13 +171,13 @@ }, { "type": "UncategorizedText", - "element_id": "f4ccd05b3271c386ee55d9876c745001", + "element_id": "54183f4323f377b737433a1e98229ead", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "30" + "text": "17" }, { "type": "UncategorizedText", @@ -171,13 +191,13 @@ }, { "type": "UncategorizedText", - "element_id": "54183f4323f377b737433a1e98229ead", + "element_id": "f4ccd05b3271c386ee55d9876c745001", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "17" + "text": "30" }, { "type": "UncategorizedText", @@ -211,33 +231,33 @@ }, { "type": "UncategorizedText", - "element_id": "4355a46b19d348dc2f57c046f8ef63d4", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "1" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "1121cfccd5913f0a63fec40a6ffd44ea", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2" + "text": "3" }, { "type": "UncategorizedText", - "element_id": "1121cfccd5913f0a63fec40a6ffd44ea", + "element_id": "4355a46b19d348dc2f57c046f8ef63d4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "3" + "text": "1" }, { "type": "UncategorizedText", @@ -271,53 +291,53 @@ }, { "type": "UncategorizedText", - "element_id": "8afd0b048a86acf7393f96db0574c8ff", + "element_id": "86c586f8129cc34e163eedd76b538fe8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Smoking" + "text": "Vaccinations" }, { "type": "UncategorizedText", - "element_id": "85d8899695f08e1eb36c352ad90fd171", + "element_id": "8afd0b048a86acf7393f96db0574c8ff", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Handguns" + "text": "Smoking" }, { "type": "UncategorizedText", - "element_id": "ec8471bf2a3848a2b19b49d54c316f21", + "element_id": "85d8899695f08e1eb36c352ad90fd171", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Motor vehicles" + "text": "Handguns" }, { "type": "UncategorizedText", - "element_id": "c27f9f502d68bddce24f18d230fddc23", + "element_id": "ec8471bf2a3848a2b19b49d54c316f21", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Nuclear power" + "text": "Motor vehicles" }, { "type": "UncategorizedText", - "element_id": "86c586f8129cc34e163eedd76b538fe8", + "element_id": "c27f9f502d68bddce24f18d230fddc23", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Vaccinations" + "text": "Nuclear power" }, { "type": "UncategorizedText", @@ -349,16 +369,6 @@ }, "text": "25" }, - { - "type": "UncategorizedText", - "element_id": "4deedd21c16662f9a68c10e5f0dc0c3b", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, { "type": "UncategorizedText", "element_id": "fe6688066e4058f4d28c3956c1545589", @@ -381,13 +391,13 @@ }, { "type": "UncategorizedText", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "4deedd21c16662f9a68c10e5f0dc0c3b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2" + "text": "" }, { "type": "UncategorizedText", @@ -430,14 +440,14 @@ "text": "7" }, { - "type": "NarrativeText", - "element_id": "3cf0a9c5ad0cacc724f90abbe99664d9", + "type": "UncategorizedText", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "In reality, radiation is a natural part of life; indeed, we are all exposed to radiation every day, on average receiving 2-3 millisieverts (mSv) per year. Most of this radiation is naturally occurring, with radon gas from the ground being the main source of exposure. The nuclear industry is responsible for a very small part of radiation exposure to the public, as seen in Figure 2. To put this into perspective, eating 10 bananas or two Brazil nuts results in the same radiation dose as living nearby a nuclear power plant for a year. Humans are also naturally radioactive, and the radiation dose from sleeping next to someone else each night for a year is ten times higher than the exposure from living nearby a nuclear power plant for the same time span." + "text": "2" }, { "type": "NarrativeText", @@ -471,7 +481,7 @@ }, { "type": "UncategorizedText", - "element_id": "e1cccd4114d41b7b658b1261e800a2da", + "element_id": "fd4bd13eed12d06bdaeb4ed41fbc3900", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -481,13 +491,13 @@ }, { "type": "UncategorizedText", - "element_id": "e209bc5e73dbdae2a845e0431c84967e", + "element_id": "e1cccd4114d41b7b658b1261e800a2da", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "Artificial" + "text": "Natural" }, { "type": "UncategorizedText", @@ -499,6 +509,16 @@ }, "text": " 48% Radon  14% Buildings & soil  12% Food & water  10% Cosmic  4% Thoron" }, + { + "type": "UncategorizedText", + "element_id": "e209bc5e73dbdae2a845e0431c84967e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 5 + }, + "text": "Artificial" + }, { "type": "UncategorizedText", "element_id": "803f8dea6f711596197d2094c3efa4a0", @@ -599,6 +619,16 @@ }, "text": "r a e y" }, + { + "type": "UncategorizedText", + "element_id": "14321f6061d8f50898383dade4ec7241", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 5 + }, + "text": "S15" + }, { "type": "UncategorizedText", "element_id": "a2bbdb2de53523b8099b37013f251546", @@ -671,33 +701,33 @@ }, { "type": "UncategorizedText", - "element_id": "8048d1e7e392843de42064d4c1ae5988", + "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "4.6" + "text": "0" }, { "type": "UncategorizedText", - "element_id": "df5b47311873a4388b15a71ae9846e61", + "element_id": "8048d1e7e392843de42064d4c1ae5988", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "2.8" + "text": "4.6" }, { "type": "UncategorizedText", - "element_id": "9a271f2a916b0b6ee6cecb2426f0b320", + "element_id": "02a238b618a70acd5d23e30a37662ca6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "0" + "text": "|| es" }, { "type": "UncategorizedText", @@ -741,13 +771,13 @@ }, { "type": "UncategorizedText", - "element_id": "7bf1f073715e4adc8472523b0d32043f", + "element_id": "df5b47311873a4388b15a71ae9846e61", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "0.07" + "text": "2.8" }, { "type": "UncategorizedText", @@ -761,13 +791,13 @@ }, { "type": "UncategorizedText", - "element_id": "95d020e1024da4b79092a4cd784eb0e7", + "element_id": "7bf1f073715e4adc8472523b0d32043f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "0.04" + "text": "0.07" }, { "type": "UncategorizedText", @@ -781,13 +811,13 @@ }, { "type": "UncategorizedText", - "element_id": "ad1d34e4d341cc6a4c4afadc494bd5c4", + "element_id": "95d020e1024da4b79092a4cd784eb0e7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "0.02" + "text": "0.04" }, { "type": "UncategorizedText", @@ -801,13 +831,13 @@ }, { "type": "UncategorizedText", - "element_id": "a37f9fe7bd3fa9d73c6b5c339b56943e", + "element_id": "ad1d34e4d341cc6a4c4afadc494bd5c4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "0.01" + "text": "0.02" }, { "type": "UncategorizedText", @@ -819,6 +849,16 @@ }, "text": "N uclear" }, + { + "type": "UncategorizedText", + "element_id": "a37f9fe7bd3fa9d73c6b5c339b56943e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 5 + }, + "text": "0.01" + }, { "type": "UncategorizedText", "element_id": "8e44807922e69a38594c4b389cd0be54", @@ -941,13 +981,13 @@ }, { "type": "NarrativeText", - "element_id": "00548dbd288df8370c39789adb302f50", + "element_id": "ba80f89ec0449fefee24b33fbb7e29b6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Contemporary debates around nuclear energy often reflect the precautionary principle, a problematic concept applied across a range of regulatory and policy issues. A ‘strong’ interpretation of the precautionary principle, or a ‘as low as possible’ approach to risk, dictates that regulation is required whenever there is a potential adverse health risk, even if the evidence is not certain and regardless of the cost of regulation." + "text": "However, the regulatory process and the policy debate around nuclear more broadly has long departed from the ALARA principle, no longer weighing cost versus benefits, or considering the overall advantages of nuclear energy, but rather looking at radiation in isolation. This has resulted in a subtle shift towards an ‘as low as possible’ mentality. Attempting to reduce radiation far below de facto safe levels has resulted in an escalation of costs and loss of public confidence, and in some cases has deprived communities of the many benefits nuclear energy provides. In practical terms, this has led to the continued use of more harmful energy sources, such as fossil fuels." }, { "type": "NarrativeText", @@ -961,13 +1001,13 @@ }, { "type": "NarrativeText", - "element_id": "ba80f89ec0449fefee24b33fbb7e29b6", + "element_id": "00548dbd288df8370c39789adb302f50", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "However, the regulatory process and the policy debate around nuclear more broadly has long departed from the ALARA principle, no longer weighing cost versus benefits, or considering the overall advantages of nuclear energy, but rather looking at radiation in isolation. This has resulted in a subtle shift towards an ‘as low as possible’ mentality. Attempting to reduce radiation far below de facto safe levels has resulted in an escalation of costs and loss of public confidence, and in some cases has deprived communities of the many benefits nuclear energy provides. In practical terms, this has led to the continued use of more harmful energy sources, such as fossil fuels." + "text": "Contemporary debates around nuclear energy often reflect the precautionary principle, a problematic concept applied across a range of regulatory and policy issues. A ‘strong’ interpretation of the precautionary principle, or a ‘as low as possible’ approach to risk, dictates that regulation is required whenever there is a potential adverse health risk, even if the evidence is not certain and regardless of the cost of regulation." }, { "type": "NarrativeText", @@ -979,6 +1019,16 @@ }, "text": "If the potential of nuclear energy is to be fully realized, public health and safety approaches must be recalibrated to consider a wider range of factors when considering radiation, adopting an “all-hazards” approach. Such an approach must ensure that risks are placed within a proper perspective and context, rather than looking at them in isolation. We therefore must not look at the costs – be they economic, environmental, or public health – associated with an individual power plant in isolation, but rather the costs associated with it (and its alternatives) at a societal level (Figure 4). This would entail looking at the potential risks arising from the use of nuclear power and comparing these with the risks associated with not adopting nuclear power." }, + { + "type": "UncategorizedText", + "element_id": "921f9c7e9e48ae6e68c4a600c65cc6d9", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "ae) flea" + }, { "type": "UncategorizedText", "element_id": "f5076aa15e05d1b399fcee15da62ea07", @@ -1101,33 +1151,33 @@ }, { "type": "NarrativeText", - "element_id": "d754d8d468346f652657279272a11897", + "element_id": "0714f9ff88637006bdb76908c7c936bf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": "Clearly, we have reached a point where we must establish a new conversation about the relative risks of using nuclear, especially when risks created by other energy sources are considered. We cannot address many of the global challenges we face without a significant increase in the use of nuclear energy. The detrimental effects of decades of looking at nuclear risks in isolation highlights just how crucial it is that regulators and policymakers change the way they view nuclear energy, and transition towards an all-hazards approach, ensuring that actions taken to mitigate risks do not result in creating more severe risks." + "text": "We must begin to holistically look at the severity of the consequences of maintaining the current energy production system, many of which are irreversible. The ways in which we address climate change and other issues of global importance must be sustainable and not create new hazards down the line. The reality is that nuclear has always been and remains an exceptionally safe source of energy, representing the lowest risk, the most sustainable, and the most affordable ways to generate around-the-clock electricity." }, { "type": "NarrativeText", - "element_id": "0714f9ff88637006bdb76908c7c936bf", + "element_id": "f62c49fcf0a7960d0b509e37507d76d3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": "We must begin to holistically look at the severity of the consequences of maintaining the current energy production system, many of which are irreversible. The ways in which we address climate change and other issues of global importance must be sustainable and not create new hazards down the line. The reality is that nuclear has always been and remains an exceptionally safe source of energy, representing the lowest risk, the most sustainable, and the most affordable ways to generate around-the-clock electricity." + "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." }, { "type": "NarrativeText", - "element_id": "f62c49fcf0a7960d0b509e37507d76d3", + "element_id": "d754d8d468346f652657279272a11897", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." + "text": "Clearly, we have reached a point where we must establish a new conversation about the relative risks of using nuclear, especially when risks created by other energy sources are considered. We cannot address many of the global challenges we face without a significant increase in the use of nuclear energy. The detrimental effects of decades of looking at nuclear risks in isolation highlights just how crucial it is that regulators and policymakers change the way they view nuclear energy, and transition towards an all-hazards approach, ensuring that actions taken to mitigate risks do not result in creating more severe risks." }, { "type": "FigureCaption", @@ -1171,233 +1221,233 @@ }, { "type": "UncategorizedText", - "element_id": "c06ac75f019ceac1ff2baecfc090fd3e", + "element_id": "40bf1390db138ee3d2e9fe0a804aba7a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" + "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-" }, { "type": "UncategorizedText", - "element_id": "3835a774ed6bc25ea5d9285964eafab5", + "element_id": "d84737e149e0a38cf197d5ae84dffe91", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "ii" + "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/" }, { "type": "UncategorizedText", - "element_id": "199440a0821e16b612f4697aa2306cb2", + "element_id": "004bd6855483673c349aba9a4e2d3a73", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712" + "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-" }, { "type": "UncategorizedText", - "element_id": "933bb5199e0009b0cf5982dc70990cd5", + "element_id": "b187d8373b3bbccc14961aea391887c6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "iii" + "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/" }, { "type": "UncategorizedText", - "element_id": "18b2cdcbf43cbcab942c6ffa69abdc51", + "element_id": "52e3a3042d860ebd51bd30e03a3d370f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." + "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a" }, { "type": "UncategorizedText", - "element_id": "c3eba57eb044498e085ee68272270bbb", + "element_id": "af64bcc9f6d36d2c339a592dc2ae75ff", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific" + "text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP." }, { "type": "UncategorizedText", - "element_id": "05c20e21a39fa28bae9b00ea9006f587", + "element_id": "24264aa519b9e325bd189fe67fa3ff8d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" + "text": "v" }, { "type": "UncategorizedText", - "element_id": "24264aa519b9e325bd189fe67fa3ff8d", + "element_id": "c659791e303eb22c5c5d56ecc0b87608", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "v" + "text": "vi" }, { "type": "UncategorizedText", - "element_id": "81be06e67a1b533cb1278b15860c51db", + "element_id": "c3eba57eb044498e085ee68272270bbb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" + "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific" }, { "type": "UncategorizedText", - "element_id": "c659791e303eb22c5c5d56ecc0b87608", + "element_id": "933bb5199e0009b0cf5982dc70990cd5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "vi" + "text": "iii" }, { "type": "UncategorizedText", - "element_id": "69bd2cd5a46ac8850a9e3ea2df80de60", + "element_id": "3835a774ed6bc25ea5d9285964eafab5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" + "text": "ii" }, { "type": "UncategorizedText", - "element_id": "52e3a3042d860ebd51bd30e03a3d370f", + "element_id": "10407d498f2636f50597e71d97cc001a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a" + "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper." }, { "type": "UncategorizedText", - "element_id": "6d22db6c4525ddda0474a31ce3ed67b8", + "element_id": "81be06e67a1b533cb1278b15860c51db", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" + "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" }, { "type": "UncategorizedText", - "element_id": "b187d8373b3bbccc14961aea391887c6", + "element_id": "69bd2cd5a46ac8850a9e3ea2df80de60", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/" + "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" }, { "type": "UncategorizedText", - "element_id": "e140259209e77d2f8c213fffa67e8a30", + "element_id": "18b2cdcbf43cbcab942c6ffa69abdc51", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "understanding/statistics" + "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." }, { "type": "UncategorizedText", - "element_id": "004bd6855483673c349aba9a4e2d3a73", + "element_id": "6d22db6c4525ddda0474a31ce3ed67b8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-" + "text": "global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" }, { "type": "UncategorizedText", - "element_id": "dd72b61820fe1a241b5a4b8e91370a77", + "element_id": "199440a0821e16b612f4697aa2306cb2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "professional/cancer-statistics/risk" + "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712" }, { "type": "UncategorizedText", - "element_id": "d84737e149e0a38cf197d5ae84dffe91", + "element_id": "e140259209e77d2f8c213fffa67e8a30", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/" + "text": "understanding/statistics" }, { "type": "UncategorizedText", - "element_id": "9205539115ff601cff51c3bb98a21400", + "element_id": "dd72b61820fe1a241b5a4b8e91370a77", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "the-full-costs-of-electricity-provision?details=true" + "text": "professional/cancer-statistics/risk" }, { "type": "UncategorizedText", - "element_id": "40bf1390db138ee3d2e9fe0a804aba7a", + "element_id": "9205539115ff601cff51c3bb98a21400", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-" + "text": "the-full-costs-of-electricity-provision?details=true" }, { "type": "UncategorizedText", - "element_id": "05612b96094a7aa4d2aaff0c45388352", + "element_id": "c06ac75f019ceac1ff2baecfc090fd3e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "sheets/detail/climate-change-and-health" + "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" }, { "type": "UncategorizedText", - "element_id": "af64bcc9f6d36d2c339a592dc2ae75ff", + "element_id": "05612b96094a7aa4d2aaff0c45388352", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP." + "text": "sheets/detail/climate-change-and-health" }, { "type": "UncategorizedText", - "element_id": "10407d498f2636f50597e71d97cc001a", + "element_id": "05c20e21a39fa28bae9b00ea9006f587", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper." + "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" }, { "type": "UncategorizedText", @@ -1421,32 +1471,32 @@ }, { "type": "UncategorizedText", - "element_id": "821daa4396c0087d9d5ee9240bc5c85c", + "element_id": "fc5faebaec5a1349ce932f1863bdd842", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org" + "text": "Recalibrating risk © 2021 World Nuclear Association. Registered in England and Wales, company number 01215741" }, { "type": "UncategorizedText", - "element_id": "705da4db5e220010ddfd03d9452855e4", + "element_id": "821daa4396c0087d9d5ee9240bc5c85c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate." + "text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org" }, { "type": "UncategorizedText", - "element_id": "fc5faebaec5a1349ce932f1863bdd842", + "element_id": "705da4db5e220010ddfd03d9452855e4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "Recalibrating risk © 2021 World Nuclear Association. Registered in England and Wales, company number 01215741" + "text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate." } ] \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b009faa948..5190010880 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.11-dev0" # pragma: no cover +__version__ = "0.10.11" # pragma: no cover