From 0a65fc21340cdef10ddd7c96f8d66bdf82b88e4a Mon Sep 17 00:00:00 2001 From: Klaijan Date: Wed, 4 Oct 2023 13:30:23 -0400 Subject: [PATCH 1/2] feat: xlsx subtable extraction (#1585) **Executive Summary** Unstructured is now able to capture subtables, along with other text element types within the `.xlsx` sheet. **Technical Details** - The function now reads the excel *without* header as default - Leverages the connected components search to find subtables within the sheet. This search is based on dfs search - It also handle the overlapping table or text cases - Row with only single cell of data is considered not a table, and therefore passed on the determine the element type as text - In connected elements, it is possible to have table title, header, or footer. We run the count for the first non-single empty rows from top and bottom to determine those text **Result** This table now reads as: image ``` [ { "type": "Title", "element_id": "3315afd97f7f2ebcd450e7c939878429", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TopicPeriodPage
Quarterly revenueNine quarters to 30 June 20231
Group financial performanceFY 22FY 232
Segmental resultsFY 22FY 233
Segmental analysisFY 22FY 234
Cash flowFY 22FY 235
" }, "text": "Financial performance" }, { "type": "Table", "element_id": "17f5d512705be6f8812e5dbb801ba727", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "3315afd97f7f2ebcd450e7c939878429", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TopicPeriodPage
Quarterly revenueNine quarters to 30 June 20231
Group financial performanceFY 22FY 232
Segmental resultsFY 22FY 233
Segmental analysisFY 22FY 234
Cash flowFY 22FY 235
" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nQuarterly revenue\nNine quarters to 30 June 2023\n\n\n1\n\n\nGroup financial performance\nFY 22\nFY 23\n\n2\n\n\nSegmental results\nFY 22\nFY 23\n\n3\n\n\nSegmental analysis\nFY 22\nFY 23\n\n4\n\n\nCash flow\nFY 22\nFY 23\n\n5\n\n\n" }, { "type": "Title", "element_id": "8a9db7161a02b427f8fda883656036e1", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TopicPeriodPage
Mobile customersNine quarters to 30 June 20236
Fixed broadband customersNine quarters to 30 June 20237
Marketable homes passedNine quarters to 30 June 20238
TV customersNine quarters to 30 June 20239
Converged customersNine quarters to 30 June 202310
Mobile churnNine quarters to 30 June 202311
Mobile data usageNine quarters to 30 June 202312
Mobile ARPUNine quarters to 30 June 202313
" }, "text": "Operational metrics" }, { "type": "Table", "element_id": "d5d16f7bf9c7950cd45fae06e12e5847", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "8a9db7161a02b427f8fda883656036e1", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TopicPeriodPage
Mobile customersNine quarters to 30 June 20236
Fixed broadband customersNine quarters to 30 June 20237
Marketable homes passedNine quarters to 30 June 20238
TV customersNine quarters to 30 June 20239
Converged customersNine quarters to 30 June 202310
Mobile churnNine quarters to 30 June 202311
Mobile data usageNine quarters to 30 June 202312
Mobile ARPUNine quarters to 30 June 202313
" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nMobile customers\nNine quarters to 30 June 2023\n\n\n6\n\n\nFixed broadband customers\nNine quarters to 30 June 2023\n\n\n7\n\n\nMarketable homes passed\nNine quarters to 30 June 2023\n\n\n8\n\n\nTV customers\nNine quarters to 30 June 2023\n\n\n9\n\n\nConverged customers\nNine quarters to 30 June 2023\n\n\n10\n\n\nMobile churn\nNine quarters to 30 June 2023\n\n\n11\n\n\nMobile data usage\nNine quarters to 30 June 2023\n\n\n12\n\n\nMobile ARPU\nNine quarters to 30 June 2023\n\n\n13\n\n\n" }, { "type": "Title", "element_id": "f97e9da0e3b879f0a9df979ae260a5f7", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TopicPeriodPage
Average foreign exchange ratesNine quarters to 30 June 202314
Guidance ratesFY 23/2414
" }, "text": "Other" }, { "type": "Table", "element_id": "080e1a745a2a3f2df22b6a08d33d59bb", "metadata": { "filename": "vodafone.xlsx", "file_directory": "example-docs", "last_modified": "2023-10-03T17:51:34", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "parent_id": "f97e9da0e3b879f0a9df979ae260a5f7", "languages": [ "spa", "ita" ], "page_number": 1, "page_name": "Index", "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TopicPeriodPage
Average foreign exchange ratesNine quarters to 30 June 202314
Guidance ratesFY 23/2414
" }, "text": "\n\n\nTopic\nPeriod\n\n\nPage\n\n\nAverage foreign exchange rates\nNine quarters to 30 June 2023\n\n\n14\n\n\nGuidance rates\nFY 23/24\n\n\n14\n\n\n" } ] ``` --- CHANGELOG.md | 3 +- docs/requirements.txt | 4 +- .../2023-half-year-analyses-by-segment.xlsx | Bin 0 -> 38442 bytes example-docs/vodafone.xlsx | Bin 0 -> 12541 bytes requirements/base.txt | 4 +- requirements/build.txt | 4 +- requirements/dev.txt | 233 ++++-------- requirements/extra-paddleocr.txt | 10 +- requirements/extra-pdf-image.txt | 6 +- requirements/extra-pptx.txt | 2 +- requirements/huggingface.txt | 4 +- requirements/ingest-airtable.txt | 2 +- .../ingest-azure-cognitive-search.txt | 2 +- requirements/ingest-azure.txt | 4 +- requirements/ingest-box.txt | 2 +- requirements/ingest-confluence.txt | 2 +- requirements/ingest-discord.txt | 2 +- requirements/ingest-dropbox.txt | 2 +- requirements/ingest-gcs.txt | 2 +- requirements/ingest-github.txt | 15 +- requirements/ingest-gitlab.txt | 2 +- requirements/ingest-google-drive.txt | 2 +- requirements/ingest-jira.txt | 2 +- requirements/ingest-onedrive.txt | 4 +- requirements/ingest-openai.txt | 6 +- requirements/ingest-outlook.txt | 4 +- requirements/ingest-reddit.txt | 2 +- requirements/ingest-s3.txt | 2 +- requirements/ingest-salesforce.txt | 2 +- requirements/ingest-sharepoint.txt | 4 +- requirements/ingest-wikipedia.txt | 2 +- requirements/test.txt | 6 +- test_unstructured/partition/test_auto.py | 49 ++- test_unstructured/partition/test_constants.py | 1 + test_unstructured/partition/xlsx/test_xlsx.py | 98 +++-- .../Shared Documents/stanley-cups.xlsx.json | 68 +++- .../gcs/nested-2/stanley-cups.xlsx.json | 66 +++- .../tests-example.xls.json | 355 +++++++++++++++++- unstructured/partition/xlsx.py | 350 ++++++++++++++++- 39 files changed, 1030 insertions(+), 298 deletions(-) create mode 100644 example-docs/2023-half-year-analyses-by-segment.xlsx create mode 100644 example-docs/vodafone.xlsx diff --git a/CHANGELOG.md b/CHANGELOG.md index 
8a1f7b1110..b9ee29ccd1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ### Enhancements +* **Adds XLSX document level language detection** Enhancing on top of language detection functionality in previous release, we now support language detection within `.xlsx` file type at Element level. * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title. * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. @@ -10,7 +11,7 @@ * **Expose endpoint url for s3 connectors** By allowing for the endpoint url to be explicitly overwritten, this allows for any non-AWS data providers supporting the s3 protocol to be supported (i.e. minio). * **change default `hi_res` model for pdf/image partition to `yolox`** Now partitioning pdf/image using `hi_res` strategy utilizes `yolox_quantized` model isntead of `detectron2_onnx` model. This new default model has better recall for tables and produces more detailed categories for elements. -### Features +* **XLSX can now read subtables within one sheet** Problem: Many .xlsx files are not created to be read as one full table per sheet. There are subtables, text and header along with more information to extract from each sheet. Feature: This `partition_xlsx` can now read subtable(s) within one .xlsx sheet, along with extracting other title and narrative texts. 
Importance: This enhance the power of .xlsx reading to not only one table per sheet, allowing user to capture more data tables from the file, if exists. ### Fixes diff --git a/docs/requirements.txt b/docs/requirements.txt index 99386eb5a7..2373ecfb3e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -18,7 +18,7 @@ certifi==2023.7.22 # -c requirements/constraints.in # -r requirements/build.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -54,7 +54,7 @@ mdurl==0.1.2 # via markdown-it-py myst-parser==2.0.0 # via -r requirements/build.in -packaging==23.1 +packaging==23.2 # via # -c requirements/base.txt # sphinx diff --git a/example-docs/2023-half-year-analyses-by-segment.xlsx b/example-docs/2023-half-year-analyses-by-segment.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d0ce5e673c7c26f1239eafe4af50aca0a4af9b96 GIT binary patch literal 38442 zcmeFYbDJz*mn~YhZQI^u+qP}nRl984wr$(yF59)sy!Cs#``-8T>GKKBjXzdC`Q(gT zk&$b}7<0_IXFzvvX;@X_r>YX2ZNpehb-9U`qlM;ww1w5|azG6*l`@FnV<5#D6cgK)G! z&eJoAV@E=3_5{{UQ0W<}6H{YtfFWisLaN7_H1(?9wkdA-8*9)yX!k z7Gf>nnZb`mRSSN=2v7C3=0{ao!d#cc3Qi**I%|Ym%XE$tG7%*?Ev~ zcVdM{V~OV-o+LQKMtQ~@w|hmPr&2-O(Sf;olUI-kK0z-n>IK4Zi!)kKHFSet-i2e18K2$p0@`N7n5z0s6uE z&JUEKf3U9SXkzU|PxsIG|6~0BVqpKz3n@{%G1z=w686<%*Od0vVWPy4a~YAobbS29B8M zC>nh!J`g};U2#$Q$Q(^|l)v`K8&S1i5JX4%?T_IiraBzR)2?m7uAc0Oq4^*QUqPy! 
ze1sdagC#%BGCU&=!Q}*8!28nm+Qs%urq*K8({8+&m^!XRcqUHcC2n2t{4Z;PYJM?E zf1U5dS=huokr6N+x#nl?>5bLAqM|5ia0YPp@#`f!Xj4~_h!e8kR=TM72}2hXe5txE z{dmDMV?B@2jN0rkq?tbrZ$Ee8e}~LK7=!>V7yv*n1^@v3PlUTQy_=n*m7$%T)j!nO zrE%d{$cFN@limX?#k0*fOaRj}$Byf5wII=eIl67lnQnj*XEY5|AV?zX_B8(1NWVhd z7n9A-=-Tqgt{wC;BO;btMGxLPRVd$1%w0@#T~TIR1{LmuHuus}`NfNAVWT1e8Uu{e zkmkG$o>0+?s?St!xLANfILk9eHhDEwC= z31A2ow%mP8K!x-^`(QK8!5dj{&B>VB32uk%gAf?*cmRpp)0C=!*NieVf|LIucyV@L zEF@-56Y6-@Ngj55fup_95hhjl?l?!6HC26DjKR1I6IQ_DsCl3N{z^S^E}^!9&+G=E9}KGjz<9#;`sny2~2~i zS?E+D^lPRW`UIpf{!&hHmYJ(bi`UP?!Aj(3!YJ}Os*&c@hn^86YM&b28#YRS_rrh}yMcR1c5?+ZFS7#y z!22`*8~5F-HLaK9QigylvLcq?u)3IrI6LpZAc;NRv zeG0{_S-HQSgV%}&Xw?K|nZF7LVP>7HxZJ>c(zPDc!kcMWc0xU-VJS`$0;sEV zEJt+k$-uFJkVaSN<7sp8;b&)tmXNHCGBdW8hGQm=5Kq#RuC4>xif<(=r(Fw}(O+J@ zJzez-RY4&;Hn2iSMfrrtlf5OWhRnEun+*MLmP-Il9#$@Y zRk>BGJXEEZ&|lWD8zTTN+>OCBvkhLrwX1?Pt1VNirK>ENWBj_GUv$)I_RNS>%vd18 zlI);jDhI+%*~LBKes#CGa^^jQx7s(z}aW%PRZ;WAATQ= zE1#7EXW#vGJ;EPW?Ow~RJ}dsqI_Q_1`ew(!eM#hA;o5$~G}1nM_@+Ri>7+zhw14{j zQMY|Mh4mE|Smgk>h#bz_7(-V+YFQhjUlUBU&X;HbFaU1fpwjZMLqznIiV?`(cf)8R zO7oY1B%dddp+L-84ErJ#VWLE(K>=^mt$?@hCgUM!1jW90Im$N?LVg16CZ>K@L;zaz z$rWi|TT&4Ki;g+yi~ij}yYpo`1)Uz5sEmWxhv`fQwIMQV`y3@Yx>SXTWr22%p;^1y z#Nx$Zxkk%!3V#*9UADz!{e8!A0N<0Xwv%eCw^36y~8>S4fd$4T0(Ee zodB3-$G7?blqR>Xwsytiq26%xYyIr7$#5reZ>SST)QQNS0*GEjqD*B za@n9t;9D!ItLJ5BV6XRoWR_qiT^WuaF$VO*E?58%fIrOgAA0QH`Q^W~8Q@R;^%MJl z_tBLkV>QSC6LRzWGj#WTzew*r&jKF%w{9z(D{D+b8kuX|lFjEcaxfV*$_UyL!KFF9gJdhW3i=VCgJ?VOJ-wM&I9>i1-5& zO&IkN!_vyaSBXhw)aEE7HK3DR!GEKP;Cv{xis36jIHVGiAN*K{O!F|W?xkOJW*(nQ zQ>edduObG_-5uW6{EMkcNmM6S#d+1N(b5;ok9Swz2`mb+m znf@SL5GPpidPsJzVvROTN(#+^bZbaoP7@tYAC#?>Y4WK}to_0AkTFmq#*SpF40aRa;PnW zfr?Ett>SV)!=^5_nIO$2d-Jq`*5fvptGIeNHX4Yl{m9P8!E56Br$U52{D!^C2??YT z692Sv|AmZfPA(p}8wo?o3MgttNLH6W;q%|;8&mM&))JsPKqSk=#EA`4ERhfuf^tX| zVAemB$tg?wdfnmDg#l>3i6{zKqlw~*x``G33i#!Dh)s#$cPtKQS1D4=7odMUp`bg+7T<0TwRk`;4$?RIg?-hM(TApsLV1p zV2U6=OLK1ZEgMAN&73GGA;F2p&7ur=+`|yR#xs7P=XrabQ(U0I@e2zLKfrk3=Z+Sz zHmgb~71lM0?*kHT^I)P{wJq|xi8u* 
z{(&|f=4Qss)7Ewe#SC?4=KE(EegDT$r8OS2#g6oeN-Vbs-kz|OBB_kZP#%mlAI-t? z1`IRUjnlzn@8Rj|M-h*Nd1GQZp~1WE8x`zL@*6S#;b8J6m)GZt*ZcGK#qj4W^EXSdpOy>4RC_42kKy^{`Z6DQ{*$()~qYe^EfnO94QNZt)-N>m^gkMQ;v87 zV^CC(wDBmIF(h(eUmZJ)R?% z1h?pz${6zw3hLILDpXlRNaRWYzwYlyUsUn7RL*qEzDhtq2sz?&bocl5Ql0!d;NT8K zkLBc#mp{0-ucfksI{U61RH?(co%p`G)AfPPO^W>Z)M07aDDu;gr{`(`L^;0YiP`## zFj464Uk2L!7Yy}M&{Q2!Y)7bV3E4ZHfV_w)Y5;5PWdw({{4U%Am*&O(3t|YT{QS6t zX%ChBo}fXiQO;2;83I9=2I{{j!*w=Lp?>kRI^a42*Gb#&z5ju|eQn0&WBc7;{P)G~t6m*v*>syDfqsgN2hx1ik zsXM?|?^yh8Pygp+s0V5pay=BT+rWCv3F{C zK2_g;`nqcyjh6zFwSVT^sw5%&>(P!gi=v%<{T=mu;V#*24|&ri@LT?2%s|pT@a$m@ z+Kz6#L)W6?oP?6~^J^RX$|t;d5lKoZk^y3Q6)WtDDYN)GcQf+(UhVeQg{1#X_BM`* zKFjn)6Uzdngga1*9deN~%aQ)GoJ4$Fh@MJDt_NE0IZ0z8)p6q!QA=~JM=Kah?5lQ` z@B4m{k8b+4;2a+h-!~hQo@!hFY?#x=$rZc#d}%uViDHVh<1C|bvt5Rio>JtV%Hwhw z=F;7KMyM%4A>CYkn8mEMv(m^d&)mtP#>mN4tEc=Fs^A=@i^Qq5YZp77#1qE9t7fW5aPA!?a-rnmeElTJYHIuEYa@}OV zVVA9#))bf~elER&|cF+vnOE!}0uhiWP366SW@ zkzBLs>4<@L)>0=m>0zE@oU2is^))0Jm>ZiLYPklt!B&b0ytK@rpBHS4UhdEOAi!zV zt8%1}C<KZ$yL0XL8i#cX(7AS18vm43hPOW%WrBDI> zd%XZfU1cR!FQi1kepqghJx(t~u!BHPuKI1b9hL?Fag6oh|*(p6=*MFU$a!<6!x$HAYsw$pd8KZ5Y-Yg-TR~e_x4`lzp6!{RB~XwF66Tz- zKdgsrLCjg9M+fs1+`;wJt^qlkSSGP~99QWIa*eh%fM`m#c-Cwd!Sc^BHk&%w^&Gn& zEhg`SdXe>uS18-NH#%xiQ;={koMlTRIaPumb$ZW9u3Xrd*_nk{5?NtwO6X@3Sv!bs zE}z(+ETKQHCwxUqnaMcGt!MsP(`q?1@2=y9oVL>$DU#G4{PND@xpGZ#)Vn_(j~!H{ zI1Zx~!WH5X0u~C%kIxS$q$ZFEEsNg%edVXn_5NFUby&-njg|-|1wet~YtDU9D?g_A z-yxg;_2K)9MddA{3Qfg;JNqkN98>^T>VLs4YJslzQTgrTJmr<_UZ#UXs>y4x=lyTp z|MoCQfKuq}PJq&z9a9omKPN+x0~OGbm7ksKi~ASG#-PL%2`kFg6!$8jJ*X(Gwt|RI zL0mWl{{713hJ`+E*ngb+|I$-a%A2-J33@z zWZ}!!IM(#EBHjo){jq!OCR)u>!^HF}?lC-0BTge0J1|5iKsH7c!m8G5a^Zjf%C5vaNH?0MDztgUwz`M_xG^1`uYBPr8Vy;DZe zcBio10VmRnlS~=xh^qUoxt8wWmNd1ssKSpj{+6^_YnOvGg**$Jw(Z#%)Wb1M@N`g39I-dkoFnuElnzJppeBCa&ERL={RVY z`9nFE*8?<@b1U=ge5T5+7WhkS7G-sfqs-0S-aSs%l~B8haC18{p6heGn*91@I^EeZ zRX;`c+T}HWl*$0OBCyy8Fxs)*=BkrfYh)%e#P{{?AsGJ|*UO&-P2!1YipZMYFu=!` zlgH)egFwujKFEt$1TEat*ikDiS*5mjRcFT{+H&*R$^v_b#wJuC;OcZ|=+_f+F5-Y8 
z8Uj`W!IRxl8$31$Cmb?TNYY3W(rheutU*)aC9ggCemy=|j~qsLVHjA_y5^kh%0VZg zL?8Ff>=-xE1A80!gDh2H8^d>rQ|kWr+$#gYsSdTH1^}lPezs7vO&iuoa66#8#8{a7 z;mgEI8n!Z3yFh6dr5JtywKJU0qj<{LlD}>M+jMR2X?ns_=55#s4<}EmqR#2p?3C}n z7TJHV5jp30r8NIkk34YxkJ%y9KiOeXqK(}q0mA5x+=d%}S1f*FQ~18JR;g`#1*PWZ z6>I+q#xMr_m{O_FGcLO<(UHssAvO%J?{n`#+SZk^2wPsXyo>0;Pu0n?#!-Qu`R)E= zuCsVZUG=0&k=584KID%KpKqFVvTiz39}>wDSz&SWG&>9|y#Nz(8JOjV5(RDgaJgF82KhciM2RS(hEkOu3Mdnr`8}a zB&a-a&69*QKH6wf&7!4@@l1EkEbEGu3$oMXbg)X#Ez#f1R8a?*FsS<1OkI->j`C2#| zS{>JxBZt)*Kwe8Vt&t$SY|JaEt@hQghi|;Tyh{f{6?2z=uT! zyz%*#-?GC`BNdsk-$x!&33V`>Bfdw+w!*ym>aZ!8puo@;zOZHxwcSavh)C*fQlQ`x4%+_UY==#qHEv zcPo~vit9aO7>#&&)%5!l{a7{RKI1IQo5Q6+RCJD(A++58E zR$}BSwPhS`BA+VNE-4P!yVfv%s9*tRPVqu1OfXEw=;7x4f`bi{w9reaeWPf7He7QM z7neo=NAJWqY*%K1*0OyI0-#XrL3wAcE5_Ui8;+L;4bz#l^M;oXOd^s8%@DY1r@Z)`rS#ox>9_^^x4kAwS54SAq8~a{1a5|=b-yh;9{pPYMeU*NaCaKEPQgoqUJ%J4#<>&(i|pId z5d4N06~oilEsDX6mDJ>bgAgv-4t09)agY23PC2MHWD^O&;^_%AeKuS?DN#oJYv+;V5)uI1?oXo5@s);^iz2X)H{>1q=Juo3P z{5+FN2QISa?x1DgSSr`vD*%tSefTAN$i&5^emoPTu~6Rd9fW_wwzIDx5vFgZv&jtPjQ5pOT9v<0|cB%ma_~YDGzT*aNGy zUr&7#t2DD5dJZ$fnh*+#_uKAvC2D<$#^4yA_cSsZn|pua)-)i@by~ZQW-b7in2ra-IiO*1wxY?EU!y*z69CcWe$)b@LM5 zE}@DG?*bwh6BiSyJW|3e#ah3^4=1ZM7XjP1v$y-}R*_ixfKCs0}-0aQeq~$&!4w2bwzrP68%m<8#w#Y-=9cvO&n=`3iO>RT60lokJGpM|GN=Pvy(_H&mW}HyxbyQCb+%*w84&Sh&WnqBdha|56oV^ zRVPg1YJHOVI@*iLclkds{oDN#ro16drITto7E=ef5Iz|`UsS7Ptu&-QKmq~42uvW;w*4Z-2x@7r6>RL3y;PXY5fT~GW2!@TrY2nbD#19#a zyF)(H{FX^;!?s+5Xbmt^cGQz}xEYe1se6!B{|0e7S}1HDV}3)!t$}N)_Y%&A=-%}j zZSQqvdg4vL7;iI>atC|gm|~?2@=Knayk)4<7D6I~(mEzssZyN1iNgaJdS9KHkmIlhMV`*4poFZ?CoT?de6nSUUVejM1kAGbFOf*ph#FikDPD_OO- zE7O9dD>HBW&6diWgWAl81E?Md6NCwB2y#d%s9izF*$B&LZ5h^~FbE+MrLiXk4=}RN zzc0*Rs1MnnmJft)oADN{WZvV|YDze$3hm!MtY$k6tEAohi^w-Wcn@p@O9Xo^7O`RW z3Tg<-L?T?>DBAyX$&irPM57NKILBt{j9Es@x;aLFV}2dkU>xKsn2<=l9PZ4 z>jv*vJZRyBX?6Bnw5J;s)9f+ZVuq;g6O|fh_3o#9f{7g+1Xyrv1AoY}SDLtZ35>hv zQaJjO9mW}n5vURf37+F{1s;IxQx0q~FLdYO=z5$vwy~n)WYO?kP9T_b{M1e9!kM@8 
z>B&_rZ3rAO@~xi`kWyk--@ydQqEj_+*$U;o!cw4A+`3629Z1f+k;S+c3B>*=uT{3C};zJK%#JH zDL8@2-K{yD2Jgr!5J|VyAw0cd-etfSEGm+jQ+$a+bw=KbPcq?i%304!H)BKWWHH<8MRhBSqFyB35w^gJXFRlEz z_K+3?`obeMTA4}?in~xI5+RMYB?R)6l`laP7^k85P1#qco~PNsAwpS$#~ZVbr>DeD z(9&oMderaL6xGhTibUP|>ao{k-Kq)x$&`VFT#RKu2{i)fdQrzvmSkZvZXJGx@Qdmi zmzT>97%!>mOFY!7*%i%^_{ghaB937;R9{?wRjO-*%9=1e@33_7#_!`v$t)(@(zmDg zoy8EmA|O~8loiDN+dXgL z+{}R$r)xEdk4hCPBWVIUsa5ZPi9n81rKoN9ET?2bb+2_1K?-dV`4KHFGze!=;wf4w zahu}40jG=t_oQ^7w2Ww!HJ7nTjPa$g37{QogzF>*k;aih>0-xKG6^!9X~^UYM{MJ0 zka8J%$QI`jRpid;_@l+8pyO5xde)^$jAx~vNuWfqsE`_vu+YMuR0&5&ONYNFnZvjw zt%BX~3i7i&yBib4WzHdjR-j0LdCZbT0sDdW{M)zd5Xxj7Y|iRMKnX7wk~}+1^kjnv zmZJC?*%RfHnsnW3Gbrm#eW)&Lr{H0#ciOC%v2H8H0_T``O<4U+tTT34+WUdXJce)R zVmx^@v)b27ot>{tdelauE%OJCCqDXAMmj1OAP+!_<;^04(5%*k^#wW?+&a&eEbv(TV#mK_T<%j#McOKK4rNjAYMKXZ_kyU|x68xa&m5tHKknv9*GZ1iIe z=`&o@;Mc-(>x&;qPqx*O z(z!%VquMqSY$u|mZ}qj)qaDFZ@0w{5l$h6&C^|~7ifZ#Pp2!7F@LOxy<_$=?{hTIk zs^}`;E}{9Coc8qtXN8-t6)MQ;fXPQP&msKLP)$@(@mzjVX9l98-|CMebcHGrQz0lB z88g-pQfC&jr9<>zhipBDMCPP^zDuAC8v|p*NYePgERtWr!_Vm`PDzfjca#_fqq!K$ z%2`3M{5YX#d@#AM+6Aqa>C{@Zjn#Rl!>Uq5tnXWvl#lg`@6D@_z+}LaI14ukrX%{MQ>IYA%~}(k1o6C_jbjFcS0_1mr|yj5 zUcp9x4XesHU!$IHJ`Iqo<3qM&Nsl-o~TkM6WGUTyoWa!K3ZQ?p@qr z{yA;aX{X+Ek(%$j;6EEcExkntK05v(Ad!wu=lXJ-JP8?DwE* z*v$t;7E($URCPowajJwv#S*ee(Bv&KI%`j&+~D*f2eL*;+tH}m^gFM^&8`X)3O9iME58}s$32Uh>+wb9w7IS>7UHa8e9uTFPDKW zg`s~EeCv$lO(Tu?2TBYjhRWgWP+T~_jO(HqU#!}bNKcLmrrIecya_SmuHRDY^tXqo zQo1OD)FJwrIc6IY`rWVtadiQ_lwlPYs2wRi#?k+_kBLy8z-U?(P7ZmuB%GL}$Ow?H zdclB$%W@QT`}Rf9V!mjmpH=E30FM?~M(8n`QN}F<2;kaTTZIxD_83m|F8{%FiItgZ zTc}5Un=3MmdHO?+pBHsKQp@=et0H1_*I=`FsR+@AJvG6lB4#6Y8t+TA*`@f~t5Jr$ zlc;)Xn3ZyNf#3MV?8NM1eKDfxs_cmmu_vmd1$JQiYagxiw}P{_Qi%yZM{iViZ)YUt z1pw_jpzB0a>W)0()_xM6HFUn%OAv1wU+-RqwKaWYlkvWS2agvX7jKzfith^^qxIMS z*o9FutH9y$^M?{*{~yIS%YXQ6I(8cjC?PlGSA6O{ld6agU@iY$|XJ=}Z`wv_cT ztfYw@0f9Y%#cqqS_+(l(sx4slAdu0t=2zNWF-DVC@>8N6!C+!kHYiRnRw?y1uysr{ zicB*J#u(IOSRLe1)2uK$DuF+p469IBt)UokQ~ADHkt?n&!?pdCrm7b$qs^9x^$3b> 
zs{qBqcX!t7=7VTR2Y-rvM#g7$Gxt*6HiNz*vy*m3wRGAc#_x3h@!8x_nE%sfyBY;! z8^HhZ*}CNvIoTT;kZJt>@!4LT)9*BsKtB_4a`0Bl=@WDl-uEwGrAiZb6Wh~eHCuwn z?dh+!;w0P5ct+tKsG>^~U)U>A%pr0?4D+Z?OP#qV2a*^LLj9|G7<@9XFd!{Uthm}*%^48KDOwYci+chh9tTd69jkxBZVx3T?_0#jH1}h)DnPY zidFj>l)h^(3$;;T7($|INx9V?$tiSnhxexvyNkGGbs8XC>HkP`p~0mI!14E)QBxX@2ufBVdx z8t4%WbMQOPVX@dKZ6~KF>)ul`jISha((nJ%+C}OzcAE@8T01~)lMw%23<+Gqf4!jv z*A?ptIIuv+?Ua*6*MIw?4b~ru8p) z5laq}SP^O%nm9QEGO?5RidmLk^4;M&8JxCT5`Ap1?^76G#I|XwiVEA|I>i4QNb)z( zGzJrHV}rt6DSA_L!ft+8jc99_ZKaZyr8dw1iD2L zMQ<7MCWbegv!i><{d=5}fUh+`5;O!Nv$BTU@C;cuGa_J6hiZ^xN?hMHFj7?@NLbY%Dzq>$lBfbX`_Gk~H%sG+lH>2P z!RK(A~qUH70YDtktgy=_pwX~Wzmhf{#FR|wWx=(bq z7x%6aFMpHsO)l3emHrd(giCF&4ZrWZD9jeF^fg&}G(_YP@~Bh=$#HmS zBPVWNBG9lJ;;{(t3fzJNjzvA>Nm@M-7cP{?W#qv-*dRrNs$bB8KP^1D;1n;<^hj@# z1Vo`Oi(ixbyc}=Uu9(Tc;ph499e>QQvcxz-1Rj-3>Kks>!wct@7hNv-*E)jB-|-mfd1N=pnv?OM&MRSanU9CX-PW+@w=RRW?*n;prGBsXrfD zS=$7%TRJ(7+;c}1F>HTf*qM4}KClG2GyvA*T1fjmw91%wdDJF_(2=2cSFrUEK(9sY zpQzRsL&QVyg@Npw_f)h^Z1=m{S?XvgBl}+VG$9iM`+eRot$PkDjfb$2AxcH<)t0hW zIaibq&A9enBhy1i8cAU&{1{L%Uq~VF<52ygw6I~*8+Pue5oS0hnkQ3io6XHoz2Dz zH9N6WA$4YgOk4{D`~~~;EgR=5RL1)Zl&XA|PJt5(T01KzD@QrfEMm-6-it@PN2Y?2 zHtu43o%X?t_KrVMtbsl%o4KEFJ2)zu95*9g`^T^N1_(byJAN2)9)jn$?eqA&p>G#P zRwWzoDF`P&vb#i=^C~bF*fO^Q+FNd|47%Y|KXMC{FKH1gRTj# zk^P1oHYj9SP0lsTGfG@8Y9M)I6#Cr#|JGZ6`+#%F5*4Fs&|)*Bg+ANsa(8Z0O0ely z&8dhM8i+$GYo8QonmnAuzW2kDuGCYRDXPWI^Qe3N`hGEJSvIwi0uYOnUyn+eSzaWl z>jD|RlwXzK6VHz}gwva*s%r*JX-!Om*=LXj9u^ki8d%aEoD(5r_g?8zkZ*@G>kztC-9fq2R;HJy!U z_SDL~4NP6E2Ljw+KeF+HK6}HaBER2z1?yC#ZzZQWoc8f467e;$ z3}8Eq_p7kk^H_R7?M5K{8gAArx7#CkP50N#pgnuPf!#&%iq+0zn4^%gp#ab`7%@yG zEDb-A4dUMT%eLhp**R3qz|3a%*!vTKH#13SAnO8c(+8=enkfrz4EY{K#Ap-#drd1m zfoidk!hBt@6K#ADslouh@&oy1^J7b)bOQ27NB6TeQ-08|6$=;ja&i}#Df`L3LmUi+ zkKU-mKK?6k(Tq3z5!Q=?P#u`FQ!tLtk5-bGZ5y}Uf49L@|BafPO|%}sF98~HN>j5# zzWfQqS8+RFBvw8@6$*%?h;W>TWg&BNjPLKlmEAlz@M;gfI(D2-7IT-|<%AnLJ~qC) zc6(b!zI_yTXZN0uJUz2zPgXxwoY=Mn5_`>Ea{-tTdI&yR#ycKt7K_29FvAolNSeb8 
z$HwAlg~&oKy-9{|b>Ln*9o4%}mE)wwdGP$ae}zfO;xCe`>=EH(6ksqh8BfNA6LJY! zewGs?2bM0}I;JT{T;?ZCuAe5p6^YJkmlMQV9F=bllhY=Nll#oUoXlbEy)uwk7}~IVLd`QFul82|*z`ita0u z89LRrz@&SVARwgqGilj}jp6sTYu#KVF2KX=f6$aNqIX(0uybgFcfvh(@j)PxMg@<` zXK@gvS2E^LY&_bH${rEE-df zgS+(icyn=84!B9^v0W(?e(;jF{0zevYaYFfYdha$$h(E_Y#p+<-Q41W-Y9 zsDHDlQ9aUJW1ea5o|_=h9~5mfa!m#Ub~LwkA5*Ak^SN>j>exCjtOL*7JNn_4H^#hOVxYb8;dQ_BlQylVM1*W0)?M%B7Tt>N@5igV?jLk zs{?$==~I~Jxb}PzvmNYn(|#?y0eL2$pYawjZF$xXMT8J>XbVmTd(*+35y1SF#EIIW z-oMF3rOlW{287S#czk!F%^`hJIabea(WSOJ3fs;sVUVcg>dbd1KPsk!c`E~}b=SN)M-E6xb&X#} z$4!`2xy^rvLYf2=3UC)Bt$vjx)-Pgh&-f#PU1|O?kZn7uL!2hG$bd~HAM#R)-dicNtDUG@I4#xY#!M%LQ1vVFM6X{t6h7U z4~15lHFfaUWXmasTl=}-(YA}N8)$oeO^O~`^C1z=D2L%JY6P;FP>Rxs&@9T!>c)mE zI9fEP*cxplUX59r;)Epv5~qp>6mN=`>nlR`HOhnQ=86yS%Zt!d38gJvdv5uV>Jy@-t;IZp2R~ySt@T=bIXx?_U-8|K8v+5W0Eq^TTATKZz&u&%*fG zcj9brVq@|juzw-+pr85?Jms7kNVG(a1&~TPWU@0_lZwPFMQoT1>ClN! z#Mp{B(#YqTYd$FEwcMph!fq`kaIm+PV5VZc#?l|zI*mZt93baJth-2q6`9C7K~ zmThG!H3MLW5wN0`iZ`upRkfHkN8^9GV7iiR*R5%P0s2fPP0ze??ff%kQZEcU=37m> zw>lBMVY=Lp9XGt1dfWVDTDoX|3ub&5P3mpeyk6Gs4jh3gGb&|%HK$8o${QNR{$oF_ z0`#cP>_MXl4{|TCL?X&9YwXkY!?&FRs@oZ=&!d!Eh>EfYiQnuR6Hu&7-E#H^lef2PB@@`{uh<20XC%gl+lg49j zxE?huC=2~N&Ft1TVT#uEka-NnW3qP>}IEH7gwaJJTi zj4U>#6QUGMoPZSkz|2cFRJiRUiQ9QSCvQJ42DM#U7G1td;=^eVWKnOCNw%9>E@x)n zY?PVpRW_Y2cR}m8OGHS>@E2l-e2Vb^mRo`SF8=Z$2o=^Bx^j*D@Jl&JL|cvO)>#-s zY@0BCq4gvNzz15kH$30fW4mXtBs>x&BF2Zs&R`~TjF$I$h<&_^X_1A_kT1>%LLthv z{l4F=EtBx=*9%}NdB-~*X-uaaJ&2pdVOSFc!lyA2cTblQPJ*{6C#AEawN^@7%0l5NsB)Jd@=li;4JxqiPM!Q7-tbiGvc zVvLH1f*Up+cgGSkZ(+<$H6>A7iN#PU_NWjZb(X=vbi5N_O|-$)u1`1n_&!s3ByAsI zl`i}P7|)joOex~@#2Au^wjhoLnoPhnoI;xsbts#zbioTx-g*^b!{)(ISC3-$(3$bH zauz11o2dB+GJ921>dKpuD#K#VdFvWWRN@TExEM>PCpS~Ii z5!Ytqwdt;BI^7J|1!(#+9Nv@|8d@@Psd)ox^Oed+fM9kjP-2YCJ*$L()nZ#3#>ib? 
zD&>KY&+>8W0hhh$5a3;cKw3K>S*uNC;fGHV-J7ap)zcuJ30kO~Q z2!|)IKn8cnK-cD7XfdP3%OYmzs?xX+fXAdLb18_P4)zGK>cZCscYSdy@@9Wbq)!G^ zfp32+E*qP_W3)3K3#jyyFf%nqDZ}jat(+oe>DGH>u_sJvda9blc&Eq7+Z1lTD$)~r zaa6+$U%BN4v&d~R>X?@BLH3z63=6LtP;K&zp;Sr3ESHdN!pzOU1)w#)dyF8cTNat$ zfjFn%Y$r6Zi7N~$6+Bpk0(;zw^#sP~6A9rxN?|JZjgtw+9E8XUpbjRc!+3GP18*H* zY!B;rc^HD*%86Yt7+ZHQPJSntEMd-xf90;ct(kILdFHkQZ0aw+n4gH61?TB;KxJ;$ z%{(hSINS&OeP;?+RI5m%f41b~H)Cp35q^Tt@KFhhN@H9AH_txjwGg@28f;R7DEJJs z;fbS*YvF;c&PZQ}G#vbCgr5wppL_SMV3BOvcD(pe^k zuGPSUDVLERjt;t!XI{b|Fl|V%*(6p*ryD-w*+EeUUoY1tw;-h0Kp~Y>7Kuv^IFdqx zm*eDKlPkR4wf>%S4=GZQ4aU6Owx9wEI%$6dmYM!GdEm{Ukc`3AKXEqzFujD1QfX(~ z`!*>PrN{lShe=ok5KfR&K8s;j%Sx-uOS}_aJ3tGyo9cq`3KueHUO2o7!BLq8bI&uq z*)AK<);wb z^a)-`@_aMszc1HJPzUzWwf;3|HLKT1sO#C5O4AyqN&b#pk|D)srm{~yO9>0&k#pp! zG`XH$$d@1Ou|_7C*FfA#)q|A5pZA|oD{FY=P+H0F_Mo1gieDvNZ~sqwX8}~l*5!MG z1rHD)xVyW%ySr{B?w#RYzRdUTRK0ptQ`vQm~x6j`)nXoYKzM~=<2J^U4*Ec z$H#k|TmI7P(DEE2^wwrIf4GL?awP_)+eA%@92X(9 z7!d9`;A5~Q97L6W52*>EFM562Sw19`b4u?-ylo40Ao+R5F;Nv{(zQx$XzE&Xw)f^f zLa0TA?lSx=;8N8knZil$d(v22zHzhcX7#ArOFu&3^8P*z4GEOj`r(UZ5?{XXwxWmT zP6k>m)Nsx83YbaejhL793vLYMzS1Oly{keyY)_9Kc7EDNe@IY!Kr9Wtj3k*trRQb@ z4RL8=G>nq7=E(->R`UVsU7Xs&-ZX1OwQZ`#7nGFYYW(=iqDTc+<=8FukHXXeM3%b4 z{g0OsL+N@1Z!8ugkpvf0QK{bga(_PvBR^;RUc6%otjyEhNX5uK*te`-!D>=-Xw?#p zABV?osxIyQust~KbDy9WSQw?tX!?b?%04tB*MaMJ>>NgH7na1*DxIf`6l&W#op2RL zr$$x@bZnqv!cSYp?t<7>8)h;R2i`xm5nt2%$mCQrpfZbs6e?H1KtI8c3oZ0~z43+8 z5k8s?gy2|F1d3PGM@E=vvzr$kG#$oZZ<8lgqt&N-c^QM!dRvM9V1|5Q-%q3f(YEVl0(H~i#-LyBGQePSnbIw?U2d%Ikpigni} zg&n3N5Og?(;T#gYUMIwNd2UIY8ja(v`w^XVV`(`Cc1SKak(6Z2T0(71;q})&ld^45 z8T6?Y^Ye7W8Ql2A(Qxng!lT9EVs>b4Jzw6#xA_QTp*LUdd-Icmb`$%Hkt5T;bWOk) zrKM4jfHXLml93A-M^tdG%;&P?f+QqtOhqctKS)yQWL(LQ544$KD(7<<9+sFd9Xt$v z?T4D&HZU zR?Tf57Enq&9feoO#+$s29;@!2Xtvi8C?}UzZHo^Rc(4QCSuAer;{f%ANzAIbQsMJh zGz#Xnvb?$hy(N`qe~G5@RNcF?ua=iDZAu0nCJ>v2SUD43AgOZOXj&0KsE9*(6vKxPT8c*_5^GW2xpL%!MQ`H%-tA$>UaaA zi9)S|T(@>tB%gSACX`@5yg=qE0Rp1=PU|?#nr?b`_pUt6P%9sO6r0yjvRcVvQ3I%e 
zYDmz6nDBDtwL#_AA&NL35L=t{wwz#&6y^GBR@@MSWCFMowR$UepI1VMvuZH6DTN*d zU$tBu0`-Q(St5?pS*uDKjMta>6N~IL;vl&9C=3Tm<*L@Z@|%O_EC+_W-!8_orNT-w&UG9`%JAg!fJL(Tu$?29?b2y{D_P;r zsMBP@LBE%4-99ktyB^h7lv`g0yHBVx^yLyJpn!L+b+Sqv-|SUc(9Iqyr-i2sHhhZ` zIN2XQ`^0x5wYzu2$;*1xr`w$Io=v@l6jzkGOS_8!#~oporE9hddV?K0W43CKvO-s9 zS|Xs!|LidMFlNXblQQqvFMHgg;O0bbL_6%&v?xJhWSQ8@rSvQR^FrRC9*4)WWAMYl zE$a4c^n{x-s}F?;+_Nn18pb6&rPS0ZDe9je&Qim+BKu^~QP@0!KMq^wxrf)QT{dtM zZZgd3+bckt6Rg)zZZT70T_%$n?sJ-F#!K`0e`dsQHneZH(|eroJl|Xe;-)$i8GXoZ zz2{rC2tNuQfZ&Jifk12%2}*Al4T9jm>V`m!<_*Ga;tsf@AacgJX^E2m7~L>ekKJ_R zWuBBlgTbae?pCopuz^ju4n8I&umutgvLrO1MOIP_Rf6_LMq$}qS`Ck>7)dKBE9sCO z7(?x_^3YrnNpdArkfj0ZGdPT^@65IuH=jpHneoJzFC~1$$c)%XdF# zWJ_wx(im(=En(44tceui;{3XcndD-~6%rCk?*)C77dYN+6Q0f($XL9BFS&thq$wXQ zx0)cC-{>E0n=>Ix(n<&RqD?h$b8k6z-;ubzP$FLgu3i^+%v3hu0FEvfufuf8{MOz+ zqYc_apFo>I=$J{C_|>6S+YH0ND>WwfwbbWfZKfpRRNVnd0VC2~9)cW&x4SZ2mWh%C z5<{TlxtaBq&G(rJ^wb0aO4L-vjk7_xad!%#_w#*&%2O5?D;A4KmLRT?fdzaZon)w( zd88JNCXI)=N#*g=-cc}2l~otrgbFDRes4(aK-uAq8SBedq6Hwy-gvM0zhLrySoARs zcaq$elxcN;*wCUb*It}onQ!J2Y4q^a1>PCowYu^U=NV$sq#X~u%zRsmMUlby^|b6Y z7A7Z6%TgJ)?8HbeR#rK%-Uk&F@Rwz!Qz9%`YF*@(Ycc(*k}LwHbLlN@`OuZaORCY- zbC;kvgKDx-lb`6zw-X%NXBu*jz!x=3TAOgL_%$28Cvn9=n$kJMF^>&xX*CO@F}C(f z79rHWa!=P=hQ2+B&!6!w`Wq$a0zEuY>9mWv-61`ma?i`{ zQBg8%JK(y=szm4^`5dZNYQI9#`;hNYp>FN#IwPar8_iKCcUR4P$E4r8lHNPi@gA&t zwrd!6NUsSD1ksIJSZPicHZgy>O1)kWtchxYrP z66P#fXXZQ#$MWVJ3cC?5!jws`n}94V^RBLZQGxQTORHA1-tSSj0@5Sq?ZB9iAc#@T zn|J+MNOxL7Hk}vBj3l>Z1f(> zcA`Q%FkP9LN>397zKLXl!#nPuT!TD_vL7>V8avCeIHe$arKJQZqUmlgpWC;BKcp87 zALSqE>LF;!WP-UDuw<%p%qGqBmJsFjgrZ^hkfDO4pZ9(pstjlHc?@R6W#(Fb-n6gG z_$8StZxd%A9PZqlpTl5VHJFmaap9b2R(_z_abCVyP^r%3&DlY3L>3;pbA>Hq^jNoE zDk!EvY6cZ;4fe9!i)j{zUds&L-Row1B-d&#CIbV z_^rKYs&+^BT5_|(+l_>?Me1nVDz)drZ(8)KU9O_ToTKteUEc7iQ8$M4%}rRs1n2$<~Mw@q|102qKf%gLnG= zmqi=hGm6570mJPMV0}#R$FTdOLeQV(TK*^*^i;IvXThMMh<4LXT6n=@-v^&LH=;#< zL~%nMalBc^Yak)*Mf^)~W0suj+dX+ve%x71f4$Wq`u!uPgy~mr;UXxf8i0K_4RDOB z%2qv^i+Rgg_m(jssgffy%ye{SG?G)I2~zp>l}Ni#h$GJRU~VDb^s*CYe}SQ+kq#Ql 
z5*xu;4#+Jibc&)u8`Gwn&aDk+@c=P(k|w&Q4B@BWwaM8*ur$PR!N`2TO#_$gBSX^oFze_F%q z4qy#J5?^`mElt3gGv5O#X%GIeM(q!4s3!4;bsDVD9`3rOPQ5`14WvA90Dr)mihZNG%rh&cjYD&=MzH)>(A$mMt4pj4k75l>n4zha?B^@7* zfD;8KIg`#7Uo3YNlr;O=9$uwehH1tgmtYyR zeVh6t_3ioGR8h9Yw*)XG&;tey1VAx0Ln{MWTPtfj8Urg^qaWP>pfvtnLk%!)x<~%Z zN$@)cxx|4<+3%~E4TB7X{;W0L2?b-^0NY50m}FW_#Hat|W>qHkH3?eKvf$Yk!5;F? ztd&KSMf*#OVec|EK6=^-a*G-0v#O@SQc)CHR3daUT|QTc&4!2Xdem5PGo4OWSk6=M z6{7TB61BKA0ig=$o6uGGLzx|2Un(z(+fZ+q9Qa#;n8qqdsg0pYqj~vW>K+l}&*Q}; zhg)tWfyzemgHWNuL=U5K86&s@=H;R>1V%;WPN!0hHevJPq1|HlDCdL+4VE)XCc&&c zS*_SDT)BDM3ozPZxEGpYq39U-JLlHHd1Y$%F9Ho*q4fD&;f+0tRWe@{n53aKz4vXY z)b^coLhKd><8wWveOMFa5)*VF0OszZ@t8?(Ilm}Ap)f`tPxH7?PhDQx;n9Wm0DdIE zvb-#t?sN=h!NDhrO50dUulNd&bHAkWUYVtKkV8HW%)q7HR+(C@f6z|by-#TAvA2N#L;xHe{;y81XKnr4#{q`>jzo@K>v?bQ<`vA)rB;I@?K3;uzYCOQEyHz zPxQD_l+M%#a4NHMyN+ot-T+ShUmYehh$g=Q;4FuLRXd=z|6fK$2RnN!i{C!;A07h) zR23~}wn&TQyZZ9>mGE&Pky?e3K^~(Gl&=F)U(zwvh$~Ju1vA z9_w&v5+;#(+8~w z#yM@+5;=o4q9N4}&2}Creg@4dDX3O=3*SR(1W1x`{CCDom!}G=E;1B)d_glQo_}W3~_2oo2p}5=}q+^CWT?1dSeg4x}I^ zUIk&%?YZSW2-l-h?n7v}{J_!v_sRZ%a{@iK86-~vNbWl?-mvTfk3L>0n? 
z;*flGqWedYxa#}aaad>;Ohs(%ZBBQiTbTi4@-5+dc~%D@G5B_PInggv#A--JFrEp% ziPAam6)mkwWg4#&Ciq5x2XejNpO4o!9JUI+LxDcs{KzQHrMZg!8Ua!YpEQ!ym+>u< zZK82JeF=eBl%fLNr$EEPz{Z7ZzuQT08MaOv7>91QY{-uAJS9*j1JZS|VYD<7=@!L2 zhk*g~og%yWI=fOWSJ3;zh4q}gH@_YuU9n9sSpjP2HC!Md62RpJ#0#wfPp9mSY!!^` z?SDL`dj6e`THnO-K*hrvc^bL(nm3SEDR7AKZ2V)n`lmoKNnlS|o7Kgu`KuMnl_LV1 zM7{ZgmAKmBmgU3uZ4&uys=|4hH8JHdg6bxde*8v3ST98iU|SnNOjv} zxIgo{9vw*xi42~<;+WiR@~CvXT5%hx-+36hPH8NKtiH4LJit&di3cs<@jdo!4tS&Ix%-DxwDU z2;Uja&+H47Jz+$me`wrr>ifc`ov!lw*|QJ#i1!$DTi0x0G3oMG)V{HrhE1f80>%Sh zErolUE?d`R+c%jIQ2QzH?VDims7FNlZ>7FYSI}B#Aznv>D%MFkhv6ZpL3Fwyr7{OI zrh`#?lJm{H`WBN9Sw)bzKF2L2z(6R$Xd!`lxe(yLFJLq=TFr}4oc3L(`FMu5eF zl3>N2R$N~~1xsU`X-ZLp0s~5463ndTD_pr`q>MZQwy&BMYe6SWiH7PY?@UuYK@Ptl zJJNPSU|fNHdGd}N{8SU{9rJ;+g3PIexNVoO6pqS_lyHkRnGU$AVf*xNF*TGJgGr}* za=|IObkURaC2e#i6lpK?m?J7^i?b*Bx~t6fd(*~|Aqlt>@aIK@f!QE2 z2px*kGZFLe&4FxnC(H$X&c#2a4;h^!Z!nQS612j-7iNS`qNczd$2&xZ^oIOS(nX04 zi#LmCZWAR{nM}!i9|~5`69mpIQU+nH-xw672>tFWJ0X|shZEYwbDzl?GGK6&U0)Lv zTjVl7>3F|x3ppjP>g?X^Nz0_M!~WLRS=&;!gLb&%d11|kqI(OLg=>JJ z!g{l*(NY$gXuQ0foI{C$%;jC~BaOVkNx(pE$rWSHTD);~gPmS^=w;jKzsFFZPNZ)4 zWff@L6`5ekjJUUrgy`Q<5*06<&?zwKFOdx0vJ@tgt3{omd~)vaBJ7EaqU) zS7laG85n&waV7RD%Bg#Vg)UZpjM$B`dQ8^0E*T=93v8@`Av8at>|yMI;h4Ocg3>eI zN5|OT@5zT?BreP(j$#l|Dic!F^QYEnrKPH470u@6K(YSb=*ocH4|Nw^c4}|VCEy%Xz^ogD)2nPB~nb5 zqnGiwQ0dF=bwm+LVG&(a`J;hI2bVhykEPI(l(GE-E3MyeaOO%~ZPHoT(iK~oe%#9{ z++5Q2=3`XPN6>{7Dc!9$^n~QclLyO~3ujd*|2`LqC`yU2XUdn(giXNbr%oQlP@%_X zVWA*JE#b^kKO|zFT;0Z%FZls^N-4n3B83+Y&X!S;B%ZKSp-JWaH)yxsuDR+0VSRVP z9pcfK>gyB&5HItIt{!!WmfW(@585M6q-9?OXWV{EYaRVsV}U4n^fw~yVY4> zJx;xqxJsL}q{?2}}SY!>5<6O4v zU^ngyez&;3jA5V|Zm^n7Mf16^+qrt6zq=~c`L1nbdwf|~0}Cj=N_{n)xOKuYi#1QQC?t#kpLTGi`KSSp7gQQSkjq3h_5w!@`^N7mlyN~^`u;*R^$O;i{+#Zlh&XBs!B z!OZnf31173I4DyktTe*SSSJ~*>hxQzR;9D@-kuB@WPA(p7;R>`7rg@6anx$6xN_M( zygI%)_sy9sJi7Pr;9pGRVyzB>K2+OGQkVM{mJmDCbh&<(FtASXZou=dEybhm_#r8b zIe96@ot3%t+QH#;uQQF(Qj_hZBMSqE{d;B5-isTbplPqKu&-L(@omkfZ{j?gZ&i6? 
z?$Wp)KVcl_*J|I2#$WY>j(JV*r=q97O4d0Qf3%Ph!QrvB-hbT*)9j+lUVGfUhjS+s zUp{-_zH2zDvQ5o#GO|PD$+-h#(a^M$bhY{AIMu`WG4T9dk`rd4%~8Zy>|zU3Yi1_@ z&G^eW@*VR*rB|9%E(YBTsuxy?VHrVZVNS^&JAN;UT5Y!0MzZ44*j%nI(|8u0wx@PL zDDpgZMkTL`IV<;W8+jmy*&L5Qb!w^b9JqM)TD`2rYPEKqlxitZGV3?!Of<>Z*pppv zRe0eA3=9Se1%U)liYv>T{(<-TAKD%XfgeGCHQ#Ou$nDJmJb{S+$CD5GACr$Dy#T!v zT?j)618cA^K;R&_jQX^OV+MT&qDG`|N%i_(Ot`}I!(7AE8=Ini_z>Zna0u6l?XvR- zdk_qh3{zTn#wL%7iZ+Kv5Fd{kkBT23@5PCejz+{AKOhS_A&95@aqs20Brhej;oVDM zdjl=bd;Si4{2j&;Do;MBND$j0mpAkfK%FDlzxz!95tr)g`$ zw~5Z|<*;qDh}WQBY^oNl8R|k@!kz*bywUAjI|tf^1@n{DE$K5P1<5tAza9_Ag7q4Q zNB#Rpx)ekc{9D}<7}yh8;7fAy?WPnPZjCv)&F>HMSd|^@Rzq-G=7nEk?V!fEjZhLA z^#+uOLm?iD8>3?H>=`LM4hBol!){AY;;u=ksV$)%^9fJmZiB8??M!Cvh86?`s1ur2 zo8DP&?HF!EyS*z6Iaa6ci(gM^X|ws_pEKC%dZ&Ip6eV`h676J{plq&D8sAi96(+yG zl;ni?dPqZi`c3xvVxtocnY&q4RuT(|i|yEY^98tMU=;Bkl9fxrp5^!2ZLhNFCA@x9PW~b60z9_X>!ZKH05;Jd7Mh+!i4I<}SRe$5 zW@XTjpFJF@`l&8$Xp~>NVwc(t)x}oLBaT$!OLkuvlk{?tRlmn8>%~nE+xcUiL5_zD zu~kk6o7GkmrB?DUkGFS>RoZf+k7v`3y+r(A^5ZT}qui^Cj<>a^nYM?IBU00oNOw1} zL^stQTpfLppTD(zUFa-afJKU1EU%BktL;=!@XH}wCKS9;G+r>uzLP~ z)6*G#_w?UA{dZ6QU(nM5qJZD`^xyaN-}m(YVfS>^AS>|P|JLlC>3?%i*XaYy-rrl+ z=BT`0N!H5AzbvffZ#O#ZPVOphZb_mk=r7jujH-H@A<$KnNippASoa9!neezKlzwt! 
zu$)5k;26Eej5f6FTyLx2laQL3R2w6lpfWB zS}{H;?@YChRyy-83F8#8M@iLNz6rM9&yld$y)Ci|+`rIXCPAjUkZ`)+tCsH-u;O;@ zI%My(fNGU|4Cw{~=5Z%34b^DU>Et;nxS_Jhf{MEb%kfOf^Zh;&-RO>IIXQSM_BoVc zqC@ zDk>_E!Sjy0>dlLBjd>3;&bs?rYqM7ay4UL!BEV9y`e2z8IO2u_M9PaLZtdHyjDwGk zTTzfQ^);T4`Q2Kpp6@2JD%IJ|5RIx!S*M+El>XZgu1{y*XsdQQ9PPdm+Dj{4z1@u&JYLcCXim^0c}#)bv6Nr?kdd0V zN0k#|lXdOP2r4X~`+xuVz>N3n>HB{_O2GW1tN)G?{Eia*juQNi68w%5{Eia*juQNi z68!&)60rPlUT6G{68w%5{11&1r00%Z5CS3yKi;$i4_Ju+l93Gb%nbyL^-L^(B;VOb z$(i~5nRI5mIvX#xNEZxdpmfu8{~=usnK3A${QlP06mfA-8~8mSW2oP&D=)1Dx@9YS zk&iCD5gxdpFvD^>rhP}`;SAg>)wfWHg|vT7#gvt3RK|v=A&4bY$xBW)k&j_?g9ER@ z)R5-f5;ce3v&FCAP33iRiC|b1Qnh7v@@;67-@4biFi>P26frni2(B?&d?kb22Ucm7 zqX0+;jH(Zly$DC$lFT$cI4Yy|crSP22qH_m;+KdvLKkt-@RPBv^ujPaB#x#>j3Y$` zt1tVUCbZx#~E4QXn|Nw&(k zE}%~0;dKlbY#FODvRx_%i52EehhYt&v`_Fr>J`RZgeYXAoA z2mGS`i{*bN+yA%B4-?W3)$v249)ofAj86 zaG-31t}c4tS!KzgS)}s;IXqGwNhs%zV!x9LCf)JZssno-HrF3A6jFdkVgvGPpS)@%bozUV(8;k%=@o)D=erglFe?8U^W7NOKBVrWXnQLE}~?*SdKQ z<>}_@qD(<%+Q96TnFC^&SPF5}LBn*+&(ZW@>}pQc1bYrzR`w4TyjOaK0#>*H9OpgqU-vY!$`rKbe(Gx3zwi9%r1I8mjN%!`J-pN z0Hy+HpuvFvnWjK^K+)({ZIu8Y$N_x)QBv*in}&cDApg|TUPsZz+Q?4p$G|Tu@pr&$ z?G__o0ALl+^#J?$%XR>`b^PH>KYi$V!;Sz5J}N*lw=}?u+yMjTUp5?ob_{U8`5_h5 zx3V(()n0w+wWZhqdj$pn0GjTAjWobvpgMo*MmqF+*eisYVCo)>3-qBzt2lJZk$_MGy(X6zG% zgzcA1@vhpijV3<$nWwl7l`!!sl|zCt!ujzX3jpE1v_OixZ!K z)vEsn_#{<)4ty^4dji&}{Ttwu@b5YB?`pXx91u{CIuOvmXy=|c|GN(9ug#0J{?hzU zRnqg;e;1|vwe=F9PU+Lm|5Kv!9Ps?%{S#tT|ED?rNy7hg==0RQC#a;^KY{*C{{;FoJ>fa@dF=D)^r4*o3G`GU=Ie?tGZYRtGTJH1Q=iv00e{s_?ZCAV)#RTeEUBoFvM>F literal 0 HcmV?d00001 diff --git a/example-docs/vodafone.xlsx b/example-docs/vodafone.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..4467a9301d55f503ffb6a5f61a86a89158198a43 GIT binary patch literal 12541 zcmbVy1ymf{wr%6??h+tKaF^f^G!QhnyEpFc1PJZ~w;;jY-7UBUhv1qRofMJr!l4ps@h}06f4UT1-7ZUG?)a_=q1806+p@1N49<){e|f zf2|<^=-?B-f6C*!EPGgxLr%RBw>%Hfh|KyaSASXK$p3_?%@JWfJW?&<2-g<{H-12NZ}to)@sM zSSzrSxgrO&H$prUB>Afcs>id#OJg~;ahINS!8Y& 
zD9W0~gGP3WV$@`#9YQ{x7n%hq&jW*CeWuKS37_l6ycBzYU` z2{@qvXkc&LteIWyfL4Zfc2@u7%m;<8x8O8yJ7u6;Z@}g#RN`DT5xqh^A|`D3GDoF~ zX%6zNew)byyzrd3xpef-IPPp`6~B1f*;4@ssklzlX5M%_Hqq*SXL~plI#P6tmo%XB zIRFqp{oa98D^RU3lWb~7cP~(tZf4SP-FbF%s6h``XER2^VFLm#@uI}sJj&kjK1Q7w zik<|;u)1e#m>x*}-KrF=n{L>1kiq0KHRC4U?_p{p;x<(dNihB`jnKoDSEE&>RGYNM ztH^2qlhxCJS5uqXZh{2FhZ!Cyt$fKn3H_X0WYN6a$gbwTd28S$((gfn7}X+O2M4JH z+z$}`9i+dzf?q@RzjO;@;BEmu=+qtY%dMu1rVQOjM}%PQY4Zjl_j_Aq zDMR=ixa{7jTK4Q%weP16irFb8Ft{fPzuDKK^n}CWhO<-r^!5 zoZ4fp4uttvs+eDJ8C~{kFo|JWJBC{88{)+w-tmnB)$j$8-J~668xBwRKAWZXIo`i} zO8v?h63R=^ZnzfzW}AIr=+2u_<1LHF#nxdT+V7!?w4I1Vg#`d+RR932zeD9@ZenA? z{BzCvPdRn0t>c)^gYA7$80_=6787EMq6rmQ&dJV2cz91f8)XA-i3>mp7p^ z+^19F@3*kEFy%oAX^U1A7gz(|a;zmEbdrqd?>D0gqz37=crnD*cWCJxTD$bO^J#Nj z++Xu=-@S^CN6$9<#?_~S9mpaV8=H0E9Ot;4OWVhO|3bvcC|gZX7yUESMkxi=5#MaI zP>3Xv`r|k2x(D$T=g7fP(*2m@jCNOKzc41-3L)wWXmO&-KqRKw7=j6g{_mUPP}fHN zGXU%mAf~rqmV3VolnQo>QA=Lrc@bAT>r+)AS<_zpx8h*3Ik%3NxO+1>^WFEHg8N-D zUeCF~XqFGfaHpPTwn&vCR&kT^r-*6$_)-}kC)&rnem2eCg{lv&NBTiJ0G9gL4{%; z0>%7h)`o0F<_>{$Zj<#aj`s(R6fefef;wE=zE~!$S6IH(hTd(GO|_erNDR8DZJTOY z4$^R*dkcIzR4NJlbxS`mKlCQQqE8DQ3(^mns>87(BrJtM<>VrMPGM~TkeZL7+?B(4 zmh*lvAC@#f zA$1Bka&pDWIZnIwHdTOPt|~|0VdqaO5y73&jvhD|u1D7xx)j9jgU()yj4>5BHcR9` zbCqZ_p)}LZc+t*OF|G6U{EgLfTc^4M&p32OvIlEM<8v41cEi}U zwXMKR!HJ|Y5Oc=WBc9FDQtWd^?>9K_a3j7)RyX@<#o5jQa2mtQ8W=+Qa>ed_Y`xe+|`Zd-oq%@D%E~*BOS*|^=8JvL+XvqP4U3R^&Qk> z9m*Hd(5NnV?kBxl!qF8RAJ$#I8&6n&IAt_ks2@B{D5Flml z^%igBG87@VtUAn`_84a%x^7IPS`(=T?V_eU_MG0)iPeZiPxs{o`~VU`DE1fG*PmW# zABc*+Xc`U1aIDtA^a)IwN1GlAKTRArk8Qpcnh~ZdAEqmViB7v>>&QM*?ZuWA8Pc^CUJ1}$7xvehs1mO+ zT9;v3ogrGrYn7xhBlnmPXNj43nT)9SefgV}`a`f~?N&N*jA#+F^rp*)absNa(??nk z)T$A1(xhj5e8P<5M8MNE6~@=OFjjV9<(5o)P0#P9?gH|R(*;4j3*E==_SWBiczeK4 zmZ*KAXOf{@$K)1@YRey5E%|;Q3Y>>IQCrDuWpZ^TgKueE-jx^%iNhyFhYB+Y`}VaL zAClD%KLo!g4PWr@pXPhv{qTj6#iBmZ^uvci^s@BggRg@6dDT`LJ{CDFdJnph5|$BS zFTfDrQ0}sHCDzhimHY~EKn-&>@f~A3yxWQz{&#W{Yx#6k;e#U3$+*;yAi?PiqRo5K z@F!~j#q+*B-*?-|o*aUUW4mKm9CX3wOtEVJ;3+!yCFa?%gyU#NVfn4a&qAvbx? 
zAbTCjVZ-)>k*Ss+dMNC*oE)Ar|LNC&tBd%AHsuXD$9C25ADVMXTfMC zBw-Lr{OC~mF66LM$HK_5OW@|}z|C2Lo166!FKJU*$UN!DDS1*IJ4p8h{QgSe#G_*Z zA^-qXTmS(6|4H)>Xtd|Q8AI97@~$!CUSDdpug|*funN`2 zv?YA?nW+_0tBkzZDR3JXN}MS2_A$qGYP!NxZC~a%Za-|;d_NJosJ&YnquRcB6I!*7 zqI}HVXYkZ`xR`wC;Jnhh+p#ltI>7w(sKC5ENl$aqG>w^5J#wVjW{OlU*+p%K8SOHA zux(p}JD==4oASB1?$k`@%+3V#=2ooUJEh@q)^6vGCeHFiA4R2S^_U)4kkBNkJQkB`_;)eZ9Xt!`Z<&^{CfxVhmlis#iJk6mv|ULv0`=k>>YhI!4Dt$|YYaZG zQAHZy+!b{d$=A4RtL_YZ_AMPfDqA^eCAw7RFFw)KqdlvHs6ujM>w1aROCu#0o2G9Q zFbdrb4Ft5ttkAUf=F`*EgyugMu5oSy9;^l-pyy+0!sKI9$T_<_0N*(C5rPo50hI7V zDpIy+L!+^b=5VvGX+~UD?zez-Zl_tSdkZ~WLE4T`;r>*@N~V(BBBqcAkVCJgKp+e2 znDJ1k>8w8OiSfIm-a$da?t2O+f$jLT+a(KRDgybf+nK6!j*=y*ve5MF6yX$MpzQ0l zV-JDL)Ncm^#5V(@A6#fc+$8yXYN?v*p{NMpb>5Z>my)&$#>{0cMuKlF-|V2TV>Qz5 zX45K%?l@+V+anbp6TeLk8DT(d(_lh{--MnGBp~NaJIo!)=Fj4P#V7&O5Q#XN3mJt^ z@N?G|4@dZ&zit939as2Wgm%npC`!M<&S_P*ma@qs`rd?I0kqJD5k$LW98rxN-K2Ad ztPn*tcoPT=VPkVThGrc#<0PB|y3iUrR)rNTqtR-bkQ5V?mck{ZX3F@NEgS*Wl$Yw0 z;Apc|b9`;*YbjouwwA>pw5aDIub=foQH-J$NubYfvxOM)gC|$|7{0PyKZg!kf)Ihi z`ZiU;l3m9Jy)O(?yhPLk4cdcz63U-6tQ^kcUOiYbHUo9FI^#UWVvJXz+#NO7SYW31 zAcK3RELRaHxRh4K1k0eCxD*nH2H81;rH0;R*?o4l`HEZyTWM)4o!Xxf>aP~#7NSyl zXxYC_Vr0AJu+`aHqC54D&FA3114?=wLT6#3lK8VjVF*)wv0!&NGlkrgYYmSUsChAw z^8$_kqUwdhDJ=*8#iti%X5^4zR9wL{sHpP7xkEwEn96xYup(~+U4k~{BzK8ZNkEer z#q8)1;+W~LvEaOEHYSWky`}YCuz8%Y_22A|8dHrGdKVrVwB-Y25I=!e)&IA_!ckrh zSws>U#4l7%CTwvgGknU|Rk}*>)WM42)uK}>&kr^T1DYvu>D{`8S#nys?l{U?X~vI+!pDCt>cdg~zO_N<>VhEiFqsnBi%8^DHK$=*&4|k;4F# zCGuq@GyEOcy_y)3wYL%o+Hc7tFsQ?n5D8JJz5t`?VVmXF1oSqG!AxHcUaI-a47pIsF z91f*)dSy7(!cW6a-2o-;FnqjWsX93f4EcHr8E+V076=rOj|j7YW1G*VQHWYCCz4dc z?J&p5OiKgTWij6~QxRIEW9PIKoFts=Z!8RrE^X5+=Up_z1YbirI`gwvsw`!mIq z6!pmoezpfW(@DoIRYmz~5(szZ+6Z_;$LrMFg#4jQjL-OY493hY_7ey))IPr3MQ00* zI>*{IdhRe3R5+W4K&8bIGnLc)GE{fCs}_Bo9&}@QGB$+B-p)? 
zF^48zChmcq-NqH-2&wlJ?ndhWtm>-Kb(*(w~V&srh0U9YOpeYJu}n zw1wFqTfGvb+`jy^X79quyg8=M%Bj(=@O9#Pq=AxK(7J1ffpW}`+f_p+&$bieMu)xi z^us!y*&3av=@WZ*yjNoDc6vUgId=@^!$!|9(2`e|q1T5Cbtp1hchYTior5q)ozq(r zOxP(SN5({pwvGyl4+~pV_xEO=v;sgcSs@9X7s5PyeUH@2phWMSk87P_x7lSJ&gy8bhi~rzme-}~yhr#`0 z3FSjf3%GuU9paiznX#JhjQ+7uM|_|D8bTD92Ahr~-P+csfsKS3z1QIt+HpYYLc<+3 zc!Cz zpDJ>vn)^>v^2GH&4lm{Dh9Za)2}u8e3}`!$$|MTZMllU$!do{xoGvcrI) zo-Je7WvHnIDQ>?%bh-SRF&S96pp?#{imwxbF>^uViPO;Ia#EJt*wic)l05V_L|NDR z>aH-g-+({#V#?gfxd{uzuvsBgbnC+$bKjOiwRIG^wl_npcl&I9}C zOE}~`@Y+0VuxN^fG9t-~(0}%8NIVB6Z^J0@c<+5#07`_*y5-F{W`SL@3geGawG7@t z-`VJv!*fr-u9C{Fs0>H2kzs{9?;c=KVA~>S#%pU$GqgVdaa_Z!(MXmU*ZvyI4H=PN zWTh^8-5Q774pDjYB^AY@T;9|UPrA0~7h8jDA?gT?#^C1!GyA1O)B^f@TXMI9SiYafDCUDz zhRn_v92hr?(bHsHSFSW^h_CK%NNnn&6S|nyB!L2AGi4n7ssuepZV`n&7|@q_Rz^;_ zG;b8Y50IGkU`5AY9ReTxH>*^&h$g8{uh(KAv;Id(jDQib!$mik=$=j#OFKh4a z^J?lC_go6Rb zjd;yAFJ9kL;`w2|)!aD~rMX=54Ije_+!f(LYe31t`$bXXw>!R{jVO_Dc@HDq1L&@bd!v7H@MKr!H zDA000*2WcNQMZXW$g^eE%NR`Ue!1@@IVandRQ%1)<-HPv=ALjNkhb{j6B61FGiauq zPgFfnLQ^nwL6F_##R4i+Qu+SK7m1T~)QmJ2{rk<7F>yJgnud+rRj+TN68P|f0j{%{ zBP3IGFYrsKUk1;;wFuR8Oc~=GFcX+Jq^_4l435Dkfq&bBB7DdRV6vn$kl=+wmBFbO zLFZix!RN7ohN7xZQp_d~(f$;j2g`^^A7zu+GJ+qA2kq(vA_a-Kg{uKu&?@pSHI*)X zdMA<0L038W4{h#fh&GE5ppEeJ1_jVl=RR7Lp0obd^~}72p3v)WcUuVR?}&J&Tj0ER z=+MxkA)$qp&PLym*5-C%HP^xy$D9Mrip?0EgKGg{lX1hFUL*cu*6hB#c2hg z2(-hdUoy_theuPJ9(L_iH(2>Ge@U=^V?1IKren;)6 zj@V~>TvDNBsXru-WKV|CnA*A8W9^^HR)|tM%n02B(O!Pbw%3)F<2ckaTuv+TJ{Ie3 zz&hr8Ym_sAtv-}kJ})@oCH5N5b&be`X2Tit@*!PQSlZx4%~tEF*v){kuiOns zO=Whwk4G8^uORjKL8AfY<+Ui>Q=qc2>o`1T&dlJK;13kOC8M*JmZVbLJ6$SSG&@T8cr_bBaV;PkiW` zm)*LNxJ{q%f&85e)T4U_mw=fN*8eb}f0Kc~dC;$v;D2F4BY({#O5FNLq_m~Z1&47Q zO9t5%_y|iCSyZ{licfWLZ}bNsC9^x8^2*J;KJF&Pjz51ZJjS5Eoj~?s2C|~Z-P5mP zl><-HnCT-&dhGjyMk@VrUSf>+w4vc%Z*p~}_!zZaHpvenrRlRlEu~9F52nH`hSfgU z==Wb>xmqC~dv0cou`^qnwSb^d$PbJskO}XUaXWc62|qj>S&?SIS`%qM&e^X0?)^J< z|L|$B=XZzzFbn)S&FE-u05maHb^`vzhyH0^s4Yp)ZblfnbN!a?b88I}K^m+KyL@6q 
z5C`p~{N~h9(@=PC@@n?<`GG?VrEdSeUu1%K_15L7_2M1_bAuQf5peB&mI<@MC2|;1 z8A?OVk!hWq!$%#pmj$;Fl<6%}jzqkp)tdrP8pA$G&J85;z;5|#ev^nd)pGmq>DA0P3 zil)|}wzb4|JZpK%pn}vh_@Rqb5_msmFIIrakL2#UA=ITO)Xx;HUwLGn7XDrq*ne1} zzaaRrr)Wiw_}inAINsp(?S{D+F;?yx5ly4#hC4j&IM5AGLVdeWgYQVoYzzMTUgBJ!n$XCEa#i`uI7gItobHiUs9h~%^vci)*x<0 z`$B2_)w*(dg-LO;BXW|Ys=KHZ(c@Q#gc=I|t&+EikaL*&FKN+Ucqs6Z*gNw}ymuS{ zqM2Ky5tAb3M@GjhbHE1e$B$P)9UjhHm{v!lk~BGC=ty%)4GY6!rzZK=%ZI)~XwYa>MY>ZWC|@Q5o#spDphB&pc#hRF7CuFHr(u8n6kA+) z-L4i*;+$BLiLKrvucA2e0@Up}@q<4BZWv@^N|&4$D-zF0PGjfxE7Kv@?*#7>O}^h* zy=|(3zdT1}*zn98Pz;jK6k_%6kNf=bI;?@GkDI(m(KY!aDLOTShzvz(6^K&G)R?0C zhs>+OBK&6{86BPVBT0Srt~yT{vLqfcOD3dTJc!-=lGvbWF=3BR<5#AvahOQ;DhLdoJ|5w!Dm74Ynx~@2il6WXNYwG9 zc&OV=F$A&eN>r0R)W?~v1@nHs(OluvoKC;^7J;O9J>bdQ-0aHX9!b#WuW}I(MC6?; zFo)QfAfSXMC_-TtW{Kh%-kP6yv*}3AZNcod<}zL7-@F+fBBNBQi$Zx;hpiIu)=D>0 znmxOxe>`aZRtavK$(%#Pu+-&rQ)W0lRSTinHi&m`tU~#;>*-wyD8P2OX^|*&{_vIT zhgEV);UTXa6D<(6Q%8yT!pr4-8Dl0tSH5Vcl^Umu2u)P)#NPC_obQL`d>QpL@vn<{ zw+WT+woof~nA=gv-->6=-2kN>r@a(|pKQb#m1i1o*WY=I071pqg-jCBv5aqu!iCPuy zAOFtERsNk)0quQZ$wAP}(%5)eEa{$fix|~wLP`s}u>1jUn%AWgOYSdqggts-Koy;w z$4<1cjK?Wgi=efuL60~yZ|+zcM{fLSlh?!E*`5hr_ln}zlEz$8q$nwKH_bE5P{j9K zxZ5p+ZF1(5Gp#uB^ldbIH(btalsqSjAwdTS$5Y2|wnGdqY~N&?Nxe=vC|8BW{p1|x zW|wc#wmcPzzK9l>=Q0J%Jeb>gEW2Pi5b1TDZe2TWluucpMiMq-i#<-I|9szIdmLO@ z+1%Sm^q}c-bu(8D#a?2aXJ!t;UTXGcqG7SW2iU&qL}|Xx_UdM&I4zqC?!vu*qsp@_ zSZmmWuUA21+$n6mTD!&Uvw@2?r7QXNHqRi%{B%*v-o`uj!pnnQ%$mu=*g^WM~ zsqxg?x0q6_xNuGUte^Q`USbX!NV{G+8|yDDOgz5)h%+ceyn(fem`}#el3}3VIM{g> z?f;Des>cR4L|wnD2pO5V>j+PuNqn0CAA(?l06(i!Jhm^}zmGym;UjkESMnF1X~?nJ zS=(Y`-AZ!KWfV@!8*Z1+&J^V3*VW4J>U|!;#i%Er=cm(&?C0m_@3?A}FdjXPRD;YZ z{_&CNdWtHdnrhOj>Jy{XJbYjBt1G-W15DCgkNANPn~4nScI%#3KNefivfI7ypC0$J zJD#Us?KuGI9C6*maoJgDbzux7lX$guA;qG;&h)WT^?(jPpN&wS2viKjtXAot(yjGd zBuzT5Ud%juO!bxF?Iqq3d1gPhm#En;-Z_W1UO|a+-MrV_MFO4Mnk}+|;td*yf)*dY zB_ZO-7(_cu`tqS(*^Q!Vu?Bu`AC0cxWci?Bb7?33%xiV$JTBj!KYFvR(Kqp#uWGroc_TnC8*>Zouo_h7b74DSqQi79MB&uuQ+Q=WSf_sa(nSE>BBS~2 
zOW(+hUD`AU^(ooI!SkScI8mXLF_MXwK2|e|-2424GuDjxxh9n9 zY3phnHWLL;WR%ClN~E8uh5d~jPJMK-E%xd!^dp=*cs;kS4*BBq(1^CVcvtGRTE9Q0 zTHmIzm9@#UruHp;As%6#50P`T=?3Oy@z!{+tJHAfO(;yCq$#(5p-wBH-pW-`pKkYF zVy^`u`IHOTHLTq9+GNe$c9WIjn5ekN>5g))E!;Kqn9l{{4b5zm5c>A+@DV(f4yP>Q zJMp*#Wb-^1EXKXIgi#w)ScSrtU&+R|Zra~iOCy9cGi z3*kuIK{w2qHwku`$9f;z*kh=DR5K_IT*H+?S-2BBpyf>_dBlC4{O%qV?=Z7+kVzz` z3!`Z0YU+xB-9uLIkbutXyQ`vb3cM`FF2!%RQ?4-=b5#BD%(c03S-A6SS?mRUtX;nS z-IW$f$Ljv=mG2hMf3&MyjvUo(zB}z`2W>h_83f*?B9Zmn$a^<`;||3gSqXA{Sf>|5 zE&oBm;(8h!a~XhlYh%1VdzSE8m?CgzG-Qac=0>(jMSDj^&#`PGNhtW7<2e8dhjiG} z_b3VKshM5Jts_iNRC7?$aU z8b>zTJq_Q;mJ7scqQLbq+U#2dg!gU6gwJuNk2UcDQPt{#JT9-!h1T=ijp)=GGO6j; z=@^kOk92U^oUNobw}b{X+;0`oQ=JdGkAK8N;zsVx-z1pGw@&xvGnF)evg2n7;ApNB zd=C}a1yJ>pqivn%Ce%x#nqA#n#AB6nEvAB|8I-@X2c@itJ(aMf^%Cxjzuw^QyfieE z?43iK8<~GNK-M{PdbjzN9b{rQS7WbvFeF#A^GrYf?4kR;=w#2jV~?|aQ_k&uiy@Qp zLffu#7QyqOf_L2vhTx-g zuU)c(JgHs>R#k%jnbWnS#$LiC^dD3DuFIR8pTNTb7I1a+uO3)S%+A)y#MVhq#ogY- zQRkmV1LcWBw&1}}$fmV2UTKiU2vFKaK09o9a zj^}KG9^1Hq1WAJEfr z$&2L^V8cJk5r~kqr)8L@Ral^IyR*1al*FOtf9J2nP>^IF+%I&RdK26Or`Nl+|Hv|- z^}(prc}V$?Mb0Y$nis0T2BINO$9P`IG>Kee;tS0DP06A`0Y!)XMt5@kv^!L;Ny^!l zfgzn!EOp(;tMXM0`y5 zqq(`7>1~{o1HZ!3*1-5-nJBHdedW#~?;#L(v|Pc%DVB{*B`RP%@r9Chwz4StaCp!z zNx>4`<6c(JmwR;_f|nR@N(kjr;Hnyz$@aZK>3d5#l>A%ho$8)-+*!cA1MEDwU=;4V zq~qS`jew@`3HR9E<%dW}kF#5YFQa2$+&qD%(s~G5#og!ik8u1M9-bbJLNR%hL&>v3 zozkaZDIf%-Aoy|qcl|W@=iffRsHy)n{9R!Ds}(*tYyLxS{HOWv65U_Tali`Bf1CeH zu=}U=?~>SGt+&9|e+Xp%H2z&G`l~TKc$fn={s-acpE$qkD1XIa2eTG1&Y!BvKLLJM z&HM@wg7j}M{zF6aC(7^hH@~8Y;{1v7UuSdv1pIwYl42M%8>m8{EuqIp9sH?WPe34r1%@*-{aapO@HU3znacd|1|xR mo&E{%dw2RPfCdBLzjdyPvM^wcE_m1t{+2)i0D_Ev9sM6!b65-j literal 0 HcmV?d00001 diff --git a/requirements/base.txt b/requirements/base.txt index cfc1b241da..3679dd89b2 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -12,7 +12,7 @@ certifi==2023.7.22 # requests chardet==5.2.0 # via -r requirements/base.in -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via requests click==8.1.7 # via nltk @@ 
-40,7 +40,7 @@ numpy==1.24.4 # via # -c requirements/constraints.in # -r requirements/base.in -packaging==23.1 +packaging==23.2 # via marshmallow python-iso639==2023.6.15 # via -r requirements/base.in diff --git a/requirements/build.txt b/requirements/build.txt index 99386eb5a7..2373ecfb3e 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -18,7 +18,7 @@ certifi==2023.7.22 # -c requirements/constraints.in # -r requirements/build.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -54,7 +54,7 @@ mdurl==0.1.2 # via markdown-it-py myst-parser==2.0.0 # via -r requirements/build.in -packaging==23.1 +packaging==23.2 # via # -c requirements/base.txt # sphinx diff --git a/requirements/dev.txt b/requirements/dev.txt index b90b4776d6..e83859d7dd 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -13,21 +13,18 @@ appnope==0.1.3 # ipykernel # ipython argon2-cffi==23.1.0 - # via jupyter-server + # via + # jupyter-server + # nbclassic + # notebook argon2-cffi-bindings==21.2.0 # via argon2-cffi -arrow==1.2.3 - # via isoduration asttokens==2.4.0 # via stack-data -async-lru==2.0.4 - # via jupyterlab attrs==23.1.0 # via # jsonschema # referencing -babel==2.12.1 - # via jupyterlab-server backcall==0.2.0 # via ipython beautifulsoup4==4.12.2 @@ -36,23 +33,10 @@ beautifulsoup4==4.12.2 # nbconvert bleach==6.0.0 # via nbconvert -build==1.0.3 - # via pip-tools -certifi==2023.7.22 - # via - # -c requirements/base.txt - # -c requirements/constraints.in - # -c requirements/test.txt - # requests cffi==1.16.0 # via argon2-cffi-bindings cfgv==3.4.0 # via pre-commit -charset-normalizer==3.2.0 - # via - # -c requirements/base.txt - # -c requirements/test.txt - # requests click==8.1.7 # via # -c requirements/base.txt @@ -70,47 +54,40 @@ defusedxml==0.7.1 # via nbconvert distlib==0.3.7 # via virtualenv +entrypoints==0.4 + # via + # jupyter-client + # nbconvert exceptiongroup==1.1.3 # via # -c 
requirements/test.txt # anyio -executing==1.2.0 +executing==2.0.0 # via stack-data -fastjsonschema==2.18.0 +fastjsonschema==2.18.1 # via nbformat filelock==3.12.4 # via virtualenv -fqdn==1.5.1 - # via jsonschema -identify==2.5.29 +identify==2.5.30 # via pre-commit idna==3.4 # via # -c requirements/base.txt # -c requirements/test.txt # anyio - # jsonschema - # requests -importlib-metadata==6.8.0 - # via - # build - # jupyter-client - # jupyter-lsp - # jupyterlab - # jupyterlab-server - # nbconvert importlib-resources==6.1.0 # via # jsonschema # jsonschema-specifications - # jupyterlab -ipykernel==6.25.2 + # notebook +ipykernel==6.11.0 # via # jupyter # jupyter-console - # jupyterlab + # nbclassic + # notebook # qtconsole -ipython==8.12.2 +ipython==8.12.3 # via # -c requirements/constraints.in # -r requirements/dev.in @@ -118,74 +95,55 @@ ipython==8.12.2 # ipywidgets # jupyter-console ipython-genutils==0.2.0 - # via qtconsole + # via + # jupyter-server + # nbclassic + # notebook + # qtconsole ipywidgets==8.1.1 # via jupyter -isoduration==20.11.0 - # via jsonschema jedi==0.19.0 # via ipython jinja2==3.1.2 # via # jupyter-server - # jupyterlab - # jupyterlab-server + # nbclassic # nbconvert -json5==0.9.14 - # via jupyterlab-server -jsonpointer==2.4 - # via jsonschema -jsonschema[format-nongpl]==4.19.1 - # via - # jupyter-events - # jupyterlab-server - # nbformat + # notebook +jsonschema==4.19.1 + # via nbformat jsonschema-specifications==2023.7.1 # via jsonschema jupyter==1.0.0 # via -r requirements/dev.in -jupyter-client==8.3.1 +jupyter-client==7.4.9 # via # ipykernel # jupyter-console # jupyter-server + # nbclassic # nbclient + # notebook # qtconsole -jupyter-console==6.6.3 +jupyter-console==6.4.4 # via jupyter jupyter-core==5.3.2 # via # -c requirements/constraints.in # ipykernel # jupyter-client - # jupyter-console # jupyter-server - # jupyterlab - # nbclient + # nbclassic # nbconvert # nbformat + # notebook # qtconsole -jupyter-events==0.7.0 - # via jupyter-server 
-jupyter-lsp==2.2.0 - # via jupyterlab -jupyter-server==2.7.3 +jupyter-server==1.13.1 # via - # jupyter-lsp - # jupyterlab - # jupyterlab-server - # notebook + # nbclassic # notebook-shim -jupyter-server-terminals==0.4.4 - # via jupyter-server -jupyterlab==4.0.6 - # via notebook jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.25.0 - # via - # jupyterlab - # notebook jupyterlab-widgets==3.0.9 # via ipywidgets markupsafe==2.1.3 @@ -196,52 +154,53 @@ matplotlib-inline==0.1.6 # via # ipykernel # ipython -mistune==3.0.1 +mistune==0.8.4 # via nbconvert -nbclient==0.8.0 +nbclassic==1.0.0 + # via notebook +nbclient==0.5.13 # via nbconvert -nbconvert==7.8.0 +nbconvert==6.4.5 # via # jupyter # jupyter-server + # nbclassic + # notebook nbformat==5.9.2 # via # jupyter-server + # nbclassic # nbclient # nbconvert + # notebook nest-asyncio==1.5.8 - # via ipykernel + # via + # ipykernel + # jupyter-client + # nbclassic + # nbclient + # notebook nodeenv==1.8.0 # via pre-commit -notebook==7.0.4 +notebook==6.5.6 # via jupyter notebook-shim==0.2.3 # via - # jupyterlab + # nbclassic # notebook -overrides==7.4.0 - # via jupyter-server -packaging==23.1 - # via - # -c requirements/base.txt - # -c requirements/test.txt - # build - # ipykernel - # jupyter-server - # jupyterlab - # jupyterlab-server - # nbconvert - # qtconsole - # qtpy pandocfilters==1.5.0 # via nbconvert parso==0.8.3 # via jedi +pep517==0.13.0 + # via + # build + # pip-tools pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==7.3.0 +pip-tools==6.6.2 # via -r requirements/dev.in pkgutil-resolve-name==1.3.10 # via jsonschema @@ -253,7 +212,10 @@ platformdirs==3.10.0 pre-commit==3.4.0 # via -r requirements/dev.in prometheus-client==0.17.1 - # via jupyter-server + # via + # jupyter-server + # nbclassic + # notebook prompt-toolkit==3.0.39 # via # ipython @@ -274,57 +236,38 @@ pygments==2.16.1 # jupyter-console # nbconvert # qtconsole -pyproject-hooks==1.0.0 - # via build 
python-dateutil==2.8.2 # via # -c requirements/test.txt - # arrow # jupyter-client -python-json-logger==2.0.7 - # via jupyter-events -pytz==2023.3.post1 - # via babel pyyaml==6.0.1 # via # -c requirements/test.txt - # jupyter-events # pre-commit -pyzmq==25.1.1 +pyzmq==24.0.1 # via - # ipykernel # jupyter-client - # jupyter-console # jupyter-server + # nbclassic + # notebook # qtconsole -qtconsole==5.4.4 +qtconsole==5.2.2 # via jupyter -qtpy==2.4.0 +qtpy==1.11.3 # via qtconsole referencing==0.30.2 # via # jsonschema # jsonschema-specifications - # jupyter-events -requests==2.31.0 - # via - # -c requirements/base.txt - # -c requirements/test.txt - # jupyterlab-server -rfc3339-validator==0.1.4 - # via - # jsonschema - # jupyter-events -rfc3986-validator==0.1.1 - # via - # jsonschema - # jupyter-events rpds-py==0.10.3 # via # jsonschema # referencing send2trash==1.8.2 - # via jupyter-server + # via + # jupyter-server + # nbclassic + # notebook six==1.16.0 # via # -c requirements/base.txt @@ -332,34 +275,32 @@ six==1.16.0 # asttokens # bleach # python-dateutil - # rfc3339-validator sniffio==1.3.0 # via anyio soupsieve==2.5 # via # -c requirements/base.txt # beautifulsoup4 -stack-data==0.6.2 +stack-data==0.6.3 # via ipython terminado==0.17.1 # via # jupyter-server - # jupyter-server-terminals -tinycss2==1.2.1 + # nbclassic + # notebook +testpath==0.6.0 # via nbconvert tomli==2.0.1 # via # -c requirements/test.txt - # build - # jupyterlab + # pep517 # pip-tools - # pyproject-hooks tornado==6.3.3 # via # ipykernel # jupyter-client # jupyter-server - # jupyterlab + # nbclassic # notebook # terminado traitlets==5.10.1 @@ -369,40 +310,26 @@ traitlets==5.10.1 # ipython # ipywidgets # jupyter-client - # jupyter-console # jupyter-core - # jupyter-events # jupyter-server - # jupyterlab # matplotlib-inline + # nbclassic # nbclient # nbconvert # nbformat + # notebook # qtconsole typing-extensions==4.8.0 # via # -c requirements/base.txt # -c requirements/test.txt - # async-lru # 
ipython -uri-template==1.3.0 - # via jsonschema -urllib3==1.26.16 - # via - # -c requirements/base.txt - # -c requirements/constraints.in - # -c requirements/test.txt - # requests virtualenv==20.24.5 # via pre-commit -wcwidth==0.2.7 +wcwidth==0.2.8 # via prompt-toolkit -webcolors==1.13 - # via jsonschema webencodings==0.5.1 - # via - # bleach - # tinycss2 + # via bleach websocket-client==1.6.3 # via jupyter-server wheel==0.41.2 @@ -412,9 +339,7 @@ wheel==0.41.2 widgetsnbextension==4.0.9 # via ipywidgets zipp==3.17.0 - # via - # importlib-metadata - # importlib-resources + # via importlib-resources # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 1f028530d2..7a3fc605d0 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -19,7 +19,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -39,13 +39,13 @@ cython==3.0.2 # via unstructured-paddleocr et-xmlfile==1.1.0 # via openpyxl -flask==2.3.3 +flask==3.0.0 # via # flask-babel # visualdl flask-babel==3.1.0 # via visualdl -fonttools==4.42.1 +fonttools==4.43.0 # via matplotlib future==0.18.3 # via bce-python-sdk @@ -122,7 +122,7 @@ opencv-python==4.8.0.76 # unstructured-paddleocr openpyxl==3.1.2 # via unstructured-paddleocr -packaging==23.1 +packaging==23.2 # via # -c requirements/base.txt # matplotlib @@ -218,7 +218,7 @@ urllib3==1.26.16 # requests visualdl==2.5.3 # via unstructured-paddleocr -werkzeug==2.3.7 +werkzeug==3.0.0 # via flask zipp==3.17.0 # via diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 679ffef54c..140eaa0a24 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -13,7 +13,7 @@ certifi==2023.7.22 # requests cffi==1.16.0 # via cryptography 
-charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # pdfminer-six @@ -35,7 +35,7 @@ filelock==3.12.4 # transformers flatbuffers==23.5.26 # via onnxruntime -fonttools==4.42.1 +fonttools==4.43.0 # via matplotlib fsspec==2023.9.1 # via @@ -98,7 +98,7 @@ opencv-python==4.8.0.76 # -c requirements/constraints.in # layoutparser # unstructured-inference -packaging==23.1 +packaging==23.2 # via # -c requirements/base.txt # huggingface-hub diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 9e30fd6f95..a0d861f1e5 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -10,5 +10,5 @@ pillow==10.0.1 # via python-pptx python-pptx==0.6.21 # via -r requirements/extra-pptx.in -xlsxwriter==3.1.5 +xlsxwriter==3.1.6 # via python-pptx diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 00ba71293a..2fc6f0efb9 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -53,7 +53,7 @@ numpy==1.24.4 # -c requirements/base.txt # -c requirements/constraints.in # transformers -packaging==23.1 +packaging==23.2 # via # -c requirements/base.txt # huggingface-hub diff --git a/requirements/ingest-airtable.txt b/requirements/ingest-airtable.txt index db7e92a6a1..26744992b7 100644 --- a/requirements/ingest-airtable.txt +++ b/requirements/ingest-airtable.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-azure-cognitive-search.txt b/requirements/ingest-azure-cognitive-search.txt index 19e918acb6..763e3a14c1 100644 --- a/requirements/ingest-azure-cognitive-search.txt +++ 
b/requirements/ingest-azure-cognitive-search.txt @@ -18,7 +18,7 @@ certifi==2023.7.22 # -c requirements/constraints.in # msrest # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index e682d29422..55370e769f 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -34,7 +34,7 @@ cffi==1.16.0 # via # azure-datalake-store # cryptography -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # aiohttp @@ -61,7 +61,7 @@ idna==3.4 # yarl isodate==0.6.1 # via azure-storage-blob -msal==1.24.0 +msal==1.24.1 # via # azure-datalake-store # azure-identity diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index 79268b6b3d..5d61bfc721 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -17,7 +17,7 @@ certifi==2023.7.22 # requests cffi==1.16.0 # via cryptography -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt index cbcf462cf4..0a36bb3cfa 100644 --- a/requirements/ingest-confluence.txt +++ b/requirements/ingest-confluence.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-discord.txt b/requirements/ingest-discord.txt index 72d0250723..0b0800082e 100644 --- a/requirements/ingest-discord.txt +++ b/requirements/ingest-discord.txt @@ -12,7 +12,7 @@ async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via aiohttp -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # aiohttp diff --git a/requirements/ingest-dropbox.txt b/requirements/ingest-dropbox.txt index f9b094be42..56c77ff37e 100644 --- 
a/requirements/ingest-dropbox.txt +++ b/requirements/ingest-dropbox.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 4f6d048137..463dcbddff 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -25,7 +25,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # aiohttp diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index ad5ac2a7a0..1c649274c5 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -13,7 +13,7 @@ cffi==1.16.0 # via # cryptography # pynacl -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -27,20 +27,31 @@ idna==3.4 # requests pycparser==2.21 # via cffi -pygithub==1.59.1 +pygithub==2.1.1 # via -r requirements/ingest-github.in pyjwt[crypto]==2.8.0 # via pygithub pynacl==1.5.0 # via pygithub +python-dateutil==2.8.2 + # via pygithub requests==2.31.0 # via # -c requirements/base.txt # pygithub +six==1.16.0 + # via + # -c requirements/base.txt + # python-dateutil +typing-extensions==4.8.0 + # via + # -c requirements/base.txt + # pygithub urllib3==1.26.16 # via # -c requirements/base.txt # -c requirements/constraints.in + # pygithub # requests wrapt==1.15.0 # via deprecated diff --git a/requirements/ingest-gitlab.txt b/requirements/ingest-gitlab.txt index dbff64042c..4d45eeda5c 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git 
a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 9f90bcc9ca..ace1ff7fa2 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-jira.txt b/requirements/ingest-jira.txt index adb8173b2c..e53d3dc493 100644 --- a/requirements/ingest-jira.txt +++ b/requirements/ingest-jira.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index 2d9627f1d4..99e838c70b 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -17,7 +17,7 @@ certifi==2023.7.22 # requests cffi==1.16.0 # via cryptography -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -29,7 +29,7 @@ idna==3.4 # via # -c requirements/base.txt # requests -msal==1.24.0 +msal==1.24.1 # via # -r requirements/ingest-onedrive.in # office365-rest-python-client diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index d7846c0a08..d0562f7058 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -25,7 +25,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # aiohttp @@ -50,7 +50,7 @@ jsonpatch==1.33 # via langchain jsonpointer==2.4 # via jsonpatch -langchain==0.0.304 +langchain==0.0.305 # via -r requirements/ingest-openai.in langsmith==0.0.41 # via langchain @@ -76,7 +76,7 @@ numpy==1.24.4 # numexpr openai==0.28.1 # via -r 
requirements/ingest-openai.in -packaging==23.1 +packaging==23.2 # via # -c requirements/base.txt # marshmallow diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index ccef36d349..9ca3f43a72 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # requests cffi==1.16.0 # via cryptography -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -23,7 +23,7 @@ idna==3.4 # via # -c requirements/base.txt # requests -msal==1.24.0 +msal==1.24.1 # via # -r requirements/ingest-outlook.in # office365-rest-python-client diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index 8c0b53b240..7e19fdb9f4 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index d15a09eba9..b86dfe415b 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -20,7 +20,7 @@ attrs==23.1.0 # via aiohttp botocore==1.31.17 # via aiobotocore -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # aiohttp diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt index a6c31b1014..817921dd71 100644 --- a/requirements/ingest-salesforce.txt +++ b/requirements/ingest-salesforce.txt @@ -13,7 +13,7 @@ certifi==2023.7.22 # requests cffi==1.16.0 # via cryptography -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt index 99d1efbfde..97cae3dd91 100644 --- a/requirements/ingest-sharepoint.txt +++ b/requirements/ingest-sharepoint.txt @@ -11,7 +11,7 @@ 
certifi==2023.7.22 # requests cffi==1.16.0 # via cryptography -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -23,7 +23,7 @@ idna==3.4 # via # -c requirements/base.txt # requests -msal==1.24.0 +msal==1.24.1 # via # -r requirements/ingest-sharepoint.in # office365-rest-python-client diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index e391f0156e..ec1add403a 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -13,7 +13,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests diff --git a/requirements/test.txt b/requirements/test.txt index 98d40fd188..b4e48463db 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -13,7 +13,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -charset-normalizer==3.2.0 +charset-normalizer==3.3.0 # via # -c requirements/base.txt # requests @@ -32,7 +32,7 @@ flake8==6.1.0 # via -r requirements/test.in freezegun==1.2.2 # via -r requirements/test.in -grpcio==1.58.0 +grpcio==1.59.0 # via -r requirements/test.in idna==3.4 # via @@ -61,7 +61,7 @@ mypy-extensions==1.0.0 # -c requirements/base.txt # black # mypy -packaging==23.1 +packaging==23.2 # via # -c requirements/base.txt # black diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index e8404537c1..a141f26ee0 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -8,7 +8,7 @@ import docx import pytest -from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT +from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT, EXPECTED_TITLE from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import 
clean_extra_whitespace from unstructured.documents.elements import ( @@ -708,36 +708,51 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook. def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"): elements = partition(filename=filename, include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert len(elements) == 2 + assert sum(isinstance(element, Table) for element in elements) == 2 + assert sum(isinstance(element, Title) for element in elements) == 2 + assert len(elements) == 4 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_TABLE - assert elements[0].metadata.page_number == 1 - assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT + assert elements[1].metadata.text_as_html == EXPECTED_TABLE + assert elements[1].metadata.page_number == 1 + assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: elements = partition(file=f, include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert len(elements) == 2 + assert sum(isinstance(element, Table) for element in elements) == 2 + assert sum(isinstance(element, Title) for element in elements) == 2 + assert len(elements) == 4 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_TABLE - assert elements[0].metadata.page_number == 1 - assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT + assert elements[1].metadata.text_as_html == EXPECTED_TABLE + 
assert elements[1].metadata.page_number == 1 + assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE -EXPECTED_XLS_TEXT_LEN = 507 +EXPECTED_XLS_TEXT_LEN = 550 -EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MA What C datatypes are 8 bits? (assume i386)" +EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What" EXPECTED_XLS_TABLE = ( """ + + + + + + + + + + + @@ -814,8 +829,8 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx" def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"): elements = partition(filename=filename, include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert len(elements) == 3 + assert sum(isinstance(element, Table) for element in elements) == 2 + assert len(elements) == 18 assert clean_extra_whitespace(elements[0].text)[:45] == EXPECTED_XLS_INITIAL_45_CLEAN_TEXT # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py index f9f04baad0..3dd978d86a 100644 --- a/test_unstructured/partition/test_constants.py +++ b/test_unstructured/partition/test_constants.py @@ -23,6 +23,7 @@
MCWhat is 2+2?4correct3incorrect
MA What C datatypes are 8 bits? (assume i386)
""" +EXPECTED_TITLE = "Stanley Cups" EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" diff --git a/test_unstructured/partition/xlsx/test_xlsx.py b/test_unstructured/partition/xlsx/test_xlsx.py index a541fe5566..b3e564bc9d 100644 --- a/test_unstructured/partition/xlsx/test_xlsx.py +++ b/test_unstructured/partition/xlsx/test_xlsx.py @@ -1,6 +1,8 @@ -from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT +import pytest + +from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT, EXPECTED_TITLE from unstructured.cleaners.core import clean_extra_whitespace -from unstructured.documents.elements import Table +from unstructured.documents.elements import Table, Text, Title from unstructured.partition.json import partition_json from unstructured.partition.xlsx import partition_xlsx from unstructured.staging.base import elements_to_json @@ -13,20 +15,21 @@ def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"): elements = partition_xlsx(filename=filename, include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert len(elements) == 2 + assert sum(isinstance(element, Table) for element in elements) == 2 + assert len(elements) == 4 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_TABLE - assert elements[0].metadata.page_number == 1 - assert elements[0].metadata.filetype == EXPECTED_FILETYPE - assert elements[0].metadata.page_name == EXCEPTED_PAGE_NAME - assert elements[0].metadata.filename == "stanley-cups.xlsx" + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT + assert elements[1].metadata.text_as_html == EXPECTED_TABLE + assert elements[1].metadata.page_number == 1 + assert elements[1].metadata.filetype == EXPECTED_FILETYPE + assert elements[1].metadata.page_name 
== EXCEPTED_PAGE_NAME + assert elements[1].metadata.filename == "stanley-cups.xlsx" def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"): elements = partition_xlsx(filename=filename, include_header=False) - assert all(isinstance(element, Table) for element in elements) + assert sum(isinstance(element, Text) for element in elements) == 1 assert len(elements) == 1 assert clean_extra_whitespace(elements[0].text) == "🤠😅" @@ -36,14 +39,16 @@ def test_partition_xlsx_from_filename_with_metadata_filename( ): elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert sum(isinstance(element, Table) for element in elements) == 2 + assert sum(isinstance(element, Title) for element in elements) == 2 + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT assert elements[0].metadata.filename == "test" def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"): elements = partition_xlsx(filename=filename, include_header=True) - assert all(isinstance(element, Table) for element in elements) + assert sum(isinstance(element, Table) for element in elements) == 2 assert len(elements) == 2 assert ( clean_extra_whitespace(elements[0].text) @@ -56,30 +61,31 @@ def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: elements = partition_xlsx(file=f, include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert len(elements) == 2 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_TABLE - assert elements[0].metadata.page_number == 1 - assert elements[0].metadata.filetype == EXPECTED_FILETYPE - assert 
elements[0].metadata.page_name == EXCEPTED_PAGE_NAME - assert elements[0].metadata.filename is None + assert sum(isinstance(element, Table) for element in elements) == 2 + assert len(elements) == 4 + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT + assert elements[1].metadata.text_as_html == EXPECTED_TABLE + assert elements[1].metadata.page_number == 1 + assert elements[1].metadata.filetype == EXPECTED_FILETYPE + assert elements[1].metadata.page_name == EXCEPTED_PAGE_NAME + assert elements[1].metadata.filename is None def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: elements = partition_xlsx(file=f, metadata_filename="test", include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.filename == "test" + assert sum(isinstance(element, Table) for element in elements) == 2 + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT + assert elements[1].metadata.filename == "test" def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: elements = partition_xlsx(file=f, include_header=True) - assert all(isinstance(element, Table) for element in elements) + assert sum(isinstance(element, Table) for element in elements) == 2 assert len(elements) == 2 assert ( clean_extra_whitespace(elements[0].text) @@ -91,25 +97,27 @@ def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cup def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"): elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert len(elements) == 2 + assert sum(isinstance(element, Table) 
for element in elements) == 2 + assert len(elements) == 4 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT - assert elements[0].metadata.text_as_html is None - assert elements[0].metadata.page_number is None - assert elements[0].metadata.filetype is None - assert elements[0].metadata.page_name is None - assert elements[0].metadata.filename is None + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT + assert elements[1].metadata.text_as_html is None + assert elements[1].metadata.page_number is None + assert elements[1].metadata.filetype is None + assert elements[1].metadata.page_name is None + assert elements[1].metadata.filename is None def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: elements = partition_xlsx(file=f, include_metadata=False, include_header=False) - assert all(isinstance(element, Table) for element in elements) - assert len(elements) == 2 + assert sum(isinstance(element, Table) for element in elements) == 2 + assert sum(isinstance(element, Title) for element in elements) == 2 + assert len(elements) == 4 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TITLE + assert clean_extra_whitespace(elements[1].text) == EXPECTED_TEXT assert elements[0].metadata.text_as_html is None assert elements[0].metadata.page_number is None assert elements[0].metadata.filetype is None @@ -205,3 +213,19 @@ def test_partition_xlsx_with_json(filename="example-docs/stanley-cups.xlsx"): for i in range(len(elements)): assert elements[i] == test_elements[i] + + +@pytest.mark.skip("Needs to fix language detection for table. 
Currently detected as 'tur'") +def test_partition_xlsx_metadata_language_from_filename(filename="example-docs/stanley-cups.xlsx"): + elements = partition_xlsx(filename=filename, include_header=False) + + assert sum(isinstance(element, Table) for element in elements) == 2 + assert len(elements) == 4 + + assert elements[0].metadata.languages == ["eng"] + + +def test_partition_xlsx_subtables(filename="example-docs/vodafone.xlsx"): + elements = partition_xlsx(filename) + assert sum(isinstance(element, Table) for element in elements) == 3 + assert len(elements) == 6 diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json index 596ff5e799..99079673a7 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.xlsx.json @@ -1,7 +1,32 @@ [ + { + "type": "Title", + "element_id": "c37e2cb941a2e20a9f728fbea5f9e400", + "metadata": { + "data_source": { + "url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d", + "version": 1, + "record_locator": { + "server_path": "/Shared Documents/stanley-cups.xlsx", + "site_url": "https://unstructuredio.sharepoint.com" + }, + "date_created": "2023-06-16T05:05:05", + "date_modified": "2023-06-16T05:05:05" + }, + "filename": "stanley-cups.xlsx", + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], + "page_number": 1, + "page_name": "Stanley Cups", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + }, + "text": "Stanley Cups" + }, { "type": "Table", - "element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6", + "element_id": "c00fc0e5ac303c40f9089791e5e485b1", "metadata": { "data_source": { "url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d", @@ -15,15 +40,43 @@ }, "filename": "stanley-cups.xlsx", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], "page_number": 1, "page_name": "Stanley Cups", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley CupsUnnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + }, + "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + }, + { + "type": "Title", + "element_id": "98656277bdadc9ef7d1a9e1bc969579b", + "metadata": { + "data_source": { + "url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d", + "version": 1, + "record_locator": { + "server_path": "/Shared Documents/stanley-cups.xlsx", + "site_url": "https://unstructuredio.sharepoint.com" + }, + "date_created": "2023-06-16T05:05:05", + "date_modified": "2023-06-16T05:05:05" + }, + "filename": "stanley-cups.xlsx", + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], + "page_number": 2, + "page_name": "Stanley Cups Since 67", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" }, - "text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + "text": "Stanley Cups Since 67" }, { "type": "Table", - "element_id": "0699dddf33814117e04654068f5182f6", + "element_id": "31421b5cd94fedb10dc82738503b4505", "metadata": { "data_source": { "url": "https://unstructuredio.sharepoint.com/Shared Documents/stanley-cups.xlsx?d=wb9956a338079432191ea609def07394d", @@ -37,10 +90,13 @@ }, "filename": "stanley-cups.xlsx", "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], "page_number": 2, "page_name": "Stanley Cups Since 67", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley Cups Since 67Unnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" }, - "text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" + "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json index c26b2cda9d..79cebfc45b 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json @@ -1,7 +1,31 @@ [ + { + "type": "Title", + "element_id": "c37e2cb941a2e20a9f728fbea5f9e400", + "metadata": { + "data_source": { + "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", + "version": "COul9MuE0/8CEAE=", + "record_locator": { + "protocol": "gs", + "remote_file_path": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx" + }, + "date_created": "2023-06-20T23:48:24.973000+00:00", + "date_modified": "2023-06-20T23:48:24.973000+00:00" + }, + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], + "page_number": 1, + "page_name": "Stanley Cups", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + }, + "text": "Stanley Cups" + }, { "type": "Table", - "element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6", + "element_id": "c00fc0e5ac303c40f9089791e5e485b1", "metadata": { "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", @@ -14,15 +38,42 @@ "date_modified": "2023-06-20T23:48:24.973000+00:00" }, "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], "page_number": 1, "page_name": "Stanley Cups", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley CupsUnnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + }, + "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + }, + { + "type": "Title", + "element_id": "98656277bdadc9ef7d1a9e1bc969579b", + "metadata": { + "data_source": { + "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", + "version": "COul9MuE0/8CEAE=", + "record_locator": { + "protocol": "gs", + "remote_file_path": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx" + }, + "date_created": "2023-06-20T23:48:24.973000+00:00", + "date_modified": "2023-06-20T23:48:24.973000+00:00" + }, + "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], + "page_number": 2, + "page_name": "Stanley Cups Since 67", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" }, - "text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + "text": "Stanley Cups Since 67" }, { "type": "Table", - "element_id": "0699dddf33814117e04654068f5182f6", + "element_id": "31421b5cd94fedb10dc82738503b4505", "metadata": { "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", @@ -35,10 +86,13 @@ "date_modified": "2023-06-20T23:48:24.973000+00:00" }, "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "languages": [ + "tur" + ], "page_number": 2, "page_name": "Stanley Cups Since 67", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley Cups Since 67Unnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" }, - "text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" + "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json index ee6e938c41..65d3b51c49 100644 --- a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json +++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json @@ -1,7 +1,7 @@ [ { "type": "Table", - "element_id": "0e2d044a26942328e2b8647574232e7f", + "element_id": "a5c9668a6055bca2865ea5e6d16ea1e0", "metadata": { "data_source": { "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", @@ -14,15 +14,64 @@ }, "filename": "tests-example.xls", "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], "page_number": 1, "page_name": "Example Test", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
MCWhat is 2+2?4correct3incorrectUnnamed: 6Unnamed: 7Unnamed: 8
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.λΛαγΓφΦ
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
MCWhat is 2+2?4correct3incorrect
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.λΛαγΓφΦ
" }, - "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\nUnnamed: 6\nUnnamed: 7\nUnnamed: 8\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n" + "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n" + }, + { + "type": "Title", + "element_id": "1d34c23ff08573afa07b42842b41277a", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 2, + "page_name": "Format Abbr." 
+ }, + "text": "http://www.cmu.edu/blackboard" + }, + { + "type": "Title", + "element_id": "05440c6ca94cb55f6d185d8bd92ce9d6", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 2, + "page_name": "Format Abbr." + }, + "text": "Question Format Abbreviations" }, { "type": "Table", - "element_id": "5c56dd4c5b649b873ebd848312e66753", + "element_id": "e39c724f1b09a4c3286b6368538e05fc", "metadata": { "data_source": { "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", @@ -35,15 +84,18 @@ }, "filename": "tests-example.xls", "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], "page_number": 2, "page_name": "Format Abbr.", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0Unnamed: 1
http://www.cmu.edu/blackboard
Question Format Abbreviations
AbbreviationQuestion Type
MCMultiple Choice
MAMultiple Answer
TFTrue/False
ESSEssay
ORDOrdering
MATMatching
FIBFill in the Blank
FILFile response
NUMNumeric Response
SRShort response
OPOpinion
FIB_PLUSMultiple Fill in the Blank
JUMBLED_SENTENCEJumbled Sentence
QUIZ_BOWLQuiz Bowl
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AbbreviationQuestion Type
MCMultiple Choice
MAMultiple Answer
TFTrue/False
ESSEssay
ORDOrdering
MATMatching
FIBFill in the Blank
FILFile response
NUMNumeric Response
SRShort response
OPOpinion
FIB_PLUSMultiple Fill in the Blank
JUMBLED_SENTENCEJumbled Sentence
QUIZ_BOWLQuiz Bowl
" }, - "text": "\n\n\nUnnamed: 0\nUnnamed: 1\n\n\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\n\n\nQuestion Format Abbreviations\n\n\n\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n" + "text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n" }, { - "type": "Table", - "element_id": "f48657c4eb70d98975e567248d0ef4bb", + "type": "Title", + "element_id": "1d34c23ff08573afa07b42842b41277a", "metadata": { "data_source": { "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", @@ -56,10 +108,291 @@ }, "filename": "tests-example.xls", "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], "page_number": 3, - "page_name": "Readme", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0
http://www.cmu.edu/blackboard
File Information
Source
http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls
Version
1.0 (January 2012)
Contact
bb-help@andrew.cmu.edu
About
This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions
" + "page_name": "Readme" + }, + "text": "http://www.cmu.edu/blackboard" + }, + { + "type": "Title", + "element_id": "85ada878f2345c23b8a74a931d2e20a4", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "File Information" + }, + { + "type": "Title", + "element_id": "0e570ca6fabe24f94e52c1833f3ffd25", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "Source" + }, + { + "type": "Title", + "element_id": "4cf4ff5597274d0c1ce8ae5a17ead4df", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + 
"languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls" + }, + { + "type": "Title", + "element_id": "4cf4ff5597274d0c1ce8ae5a17ead4df", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel" + }, + "text": "http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls" + }, + { + "type": "Title", + "element_id": "dd167905de0defcaf72de673ee44c074", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "Version" + }, + { + "type": "UncategorizedText", + "element_id": "5f9d7b40d332fef76efdd0a97bcb8617", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": 
"tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "1.0 (January 2012)" + }, + { + "type": "UncategorizedText", + "element_id": "5f9d7b40d332fef76efdd0a97bcb8617", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel" + }, + "text": "1.0 (January 2012)" + }, + { + "type": "Title", + "element_id": "2b5c3d26721ae9c350cf3009318b626f", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "Contact" + }, + { + "type": "Title", + "element_id": "53d2273ac70fc31640cc45af840dbd42", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + 
"filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "bb-help@andrew.cmu.edu" + }, + { + "type": "Title", + "element_id": "53d2273ac70fc31640cc45af840dbd42", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel" + }, + "text": "bb-help@andrew.cmu.edu" + }, + { + "type": "Title", + "element_id": "4efca0d10c5feb8e9b35eb1d994f2905", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "About" + }, + { + "type": "NarrativeText", + "element_id": "4c9720f1540cc84d33e30e09aca8c077", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": 
"application/vnd.ms-excel", + "languages": [ + "eng" + ], + "page_number": 3, + "page_name": "Readme" + }, + "text": "This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions" + }, + { + "type": "NarrativeText", + "element_id": "4c9720f1540cc84d33e30e09aca8c077", + "metadata": { + "data_source": { + "url": "/drives/b!3vfYDk3GHEaRbo1pkhLPIRXZrzTLHCtCm5WV6KY1m_0-lOjrjQaAS6X30Pv_E4VX/root:/utic-test-ingest-fixtures/tests-example.xls", + "record_locator": { + "user_pname": "devops@unstructuredio.onmicrosoft.com", + "server_relative_path": "utic-test-ingest-fixtures/tests-example.xls" + }, + "date_created": "2023-08-24T03:00:43", + "date_modified": "2023-08-24T03:00:43" + }, + "filename": "tests-example.xls", + "filetype": "application/vnd.ms-excel" }, - "text": "\n\n\nUnnamed: 0\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\nFile Information\n\n\n\n\n\n\n\n\nSource\n\n\nhttp://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n\n\n\n\n\n\n\n\nVersion\n\n\n1.0 (January 2012)\n\n\n\n\n\n\n\n\nContact\n\n\nbb-help@andrew.cmu.edu\n\n\n\n\n\n\n\n\nAbout\n\n\nThis is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n\n\n" + "text": "This is an example and template for preparing Blackboard tests offline. 
See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions" } ] \ No newline at end of file diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index ebffd6cdf9..871c002b3a 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -1,14 +1,20 @@ from tempfile import SpooledTemporaryFile -from typing import IO, BinaryIO, List, Optional, Union, cast +from typing import IO, Any, BinaryIO, Dict, List, Optional, Tuple, Union, cast +import numpy as np import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring from unstructured.chunking.title import add_chunking_strategy +from unstructured.cleaners.core import clean_bullets from unstructured.documents.elements import ( Element, ElementMetadata, + ListItem, + NarrativeText, Table, + Text, + Title, process_metadata, ) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype @@ -18,6 +24,13 @@ get_last_modified_date_from_file, spooled_to_bytes_io_if_needed, ) +from unstructured.partition.lang import detect_languages +from unstructured.partition.text_type import ( + is_bulleted_text, + is_possible_narrative_text, + is_possible_numbered_list, + is_possible_title, +) @process_metadata() @@ -28,8 +41,10 @@ def partition_xlsx( file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, + languages: List[str] = ["auto"], metadata_last_modified: Optional[str] = None, - include_header: bool = True, + include_header: bool = False, + find_subtable: bool = True, **kwargs, ) -> List[Element]: """Partitions Microsoft Excel Documents in .xlsx format into its document elements. @@ -42,43 +57,340 @@ def partition_xlsx( A file-like object using "rb" mode --> open(filename, "rb"). include_metadata Determines whether or not metadata is included in the output. + languages + The list of languages present in the document. 
metadata_last_modified The day of the last modification include_header Determines whether or not header info info is included in text and medatada.text_as_html """ exactly_one(filename=filename, file=file) + if not isinstance(languages, list): + raise TypeError( + 'The language parameter must be a list of language codes as strings, ex. ["eng"]', + ) last_modification_date = None + header = 0 if include_header else None + if filename: - sheets = pd.read_excel(filename, sheet_name=None) + sheets = pd.read_excel(filename, sheet_name=None, header=header) last_modification_date = get_last_modified_date(filename) elif file: f = spooled_to_bytes_io_if_needed( cast(Union[BinaryIO, SpooledTemporaryFile], file), ) - sheets = pd.read_excel(f, sheet_name=None) + sheets = pd.read_excel(f, sheet_name=None, header=header) last_modification_date = get_last_modified_date_from_file(file) elements: List[Element] = [] page_number = 0 - for sheet_name, table in sheets.items(): + for sheet_name, sheet in sheets.items(): page_number += 1 - html_text = table.to_html(index=False, header=include_header, na_rep="") - text = soupparser_fromstring(html_text).text_content() - - if include_metadata: - metadata = ElementMetadata( - text_as_html=html_text, - page_name=sheet_name, - page_number=page_number, - filename=metadata_filename or filename, - last_modified=metadata_last_modified or last_modification_date, - ) + if not find_subtable: + html_text = sheet.to_html(index=False, header=include_header, na_rep="") + text = soupparser_fromstring(html_text).text_content() + + if include_metadata: + metadata = ElementMetadata( + text_as_html=html_text, + page_name=sheet_name, + page_number=page_number, + filename=metadata_filename or filename, + last_modified=metadata_last_modified or last_modification_date, + ) + else: + metadata = ElementMetadata() + + table = Table(text=text, metadata=metadata) + elements.append(table) else: - metadata = ElementMetadata() + _connected_components = 
_get_connected_components(sheet) + for _connected_component, _min_max_coords in _connected_components: + min_x, min_y, max_x, max_y = _min_max_coords + + subtable = sheet.iloc[min_x : max_x + 1, min_y : max_y + 1] # noqa: E203 + single_non_empty_rows, single_non_empty_row_contents = _single_non_empty_rows( + subtable, + ) + ( + front_non_consecutive, + last_non_consecutive, + ) = _find_first_and_last_non_consecutive_row( + single_non_empty_rows, + subtable.shape, + ) + + metadata = _get_metadata( + include_metadata, + sheet_name, + page_number, + metadata_filename or filename, + metadata_last_modified or last_modification_date, + ) + + # NOTE(klaijan) - need to explicitly define the condition to avoid the case of 0 + if front_non_consecutive is not None and last_non_consecutive is not None: + first_row = int(front_non_consecutive - max_x) + last_row = int(max_x - last_non_consecutive) + subtable = _get_sub_subtable(subtable, (first_row, last_row)) - table = Table(text=text, metadata=metadata) - elements.append(table) + if front_non_consecutive is not None: + for content in single_non_empty_row_contents[: front_non_consecutive + 1]: + languages = detect_languages(str(content), languages) + element = _check_content_element_type(str(content)) + element.metadata = metadata + element.metadata.languages = languages + elements.append(element) + + if subtable is not None and len(subtable) == 1: + element = _check_content_element_type(str(subtable.iloc[0].values[0])) + elements.append(element) + + elif subtable is not None: + # parse subtables as html + html_text = subtable.to_html(index=False, header=include_header, na_rep="") + text = soupparser_fromstring(html_text).text_content() + languages = detect_languages(text, languages) + subtable = Table(text=text) + subtable.metadata = metadata + subtable.metadata.text_as_html = html_text + subtable.metadata.languages = languages + elements.append(subtable) + + if front_non_consecutive is not None and last_non_consecutive is 
not None: + for content in single_non_empty_row_contents[ + front_non_consecutive + 1 : # noqa: E203 + ]: + languages = detect_languages(str(content), languages) + element = _check_content_element_type(str(content)) + element.metadata = metadata + element.metadata.languages = languages + elements.append(element) return elements + + +def _get_connected_components( + sheet: pd.DataFrame, + filter: bool = True, +): + """ + Identify connected components of non-empty cells in an excel sheet. + + Args: + sheet: an excel sheet read in DataFrame. + filter (bool, optional): If True (default), filters out overlapping components + to return distinct components. + + Returns: + A list of tuples, each containing: + - A list of tuples representing the connected component's cell coordinates. + - A tuple with the min and max x and y coordinates bounding the connected component. + + Note: + This function performs a depth-first search (DFS) to identify connected components of + non-empty cells in the sheet. If 'filter' is set to True, it also filters out + overlapping components to return distinct components. 
+ """ + max_row, max_col = sheet.shape + visited = set() + connected_components = [] + + def dfs(row, col, component): + if ( + row < 0 + or row >= sheet.shape[0] + or col < 0 + or col >= sheet.shape[1] + or (row, col) in visited + ): + return + visited.add((row, col)) + + if not pd.isna(sheet.iat[row, col]): + component.append((row, col)) + + # Explore neighboring cells + dfs(row - 1, col, component) # Above + dfs(row + 1, col, component) # Below + dfs(row, col - 1, component) # Left + dfs(row, col + 1, component) # Right + + for row in range(max_row): + for col in range(max_col): + if (row, col) not in visited and not pd.isna(sheet.iat[row, col]): + component: List[dict] = [] + dfs(row, col, component) + min_x, min_y, max_x, max_y = _find_min_max_coord(component) + connected_components.append( + { + "component": component, + "min_x": min_x, + "min_y": min_y, + "max_x": max_x, + "max_y": max_y, + }, + ) + if filter: + connected_components = _filter_overlapping_tables(connected_components) + return [ + ( + connected_component["component"], + ( + connected_component["min_x"], + connected_component["min_y"], + connected_component["max_x"], + connected_component["max_y"], + ), + ) + for connected_component in connected_components + ] + + +def _filter_overlapping_tables( + connected_components: List[Dict[Any, Any]], +) -> List[Dict[Any, Any]]: + """ + Filter out overlapping connected components to return distinct components. 
+ """ + sorted_components = sorted(connected_components, key=lambda x: x["min_x"]) + merged_components: List[dict] = [] + current_component = None + for component in sorted_components: + if current_component is None: + current_component = component + else: + # Check if component overlaps with the current_component + if component["min_x"] <= current_component["max_x"]: + # Merge the components and update min_x, max_x + current_component["component"].extend(component["component"]) + current_component["min_x"] = min(current_component["min_x"], component["min_x"]) + current_component["max_x"] = max(current_component["max_x"], component["max_x"]) + current_component["min_y"] = min(current_component["min_y"], component["min_y"]) + current_component["max_y"] = max(current_component["max_y"], component["max_y"]) + else: + # No overlap, add the current_component to the merged list + merged_components.append(current_component) + # Update the current_component + current_component = component + # Append the last current_component to the merged list + if current_component is not None: + merged_components.append(current_component) + return merged_components + + +def _find_min_max_coord( + connected_component: List[Dict[Any, Any]], +) -> Tuple[Union[int, float], Union[int, float], Union[int, float], Union[int, float]]: + """ + Find the minimum and maximum coordinates (bounding box) of a connected component. + """ + min_x, min_y, max_x, max_y = float("inf"), float("inf"), float("-inf"), float("-inf") + for _x, _y in connected_component: + if _x < min_x: + min_x = _x + if _y < min_y: + min_y = _y + if _x > max_x: + max_x = _x + if _y > max_y: + max_y = _y + return min_x, min_y, max_x, max_y + + +def _get_sub_subtable(subtable: pd.DataFrame, first_and_last_row: Tuple[int, int]) -> pd.DataFrame: + """ + Extract a sub-subtable from a given subtable based on the first and last row range. 
+ """ + # TODO(klaijan) - to further check for sub subtable, we could check whether + # two consecutive rows contains full row of cells. + # if yes, it might not be a header. We should check the length. + first_row, last_row = first_and_last_row + if last_row == first_row: + return None + return subtable.iloc[first_row : last_row + 1] # noqa: E203 + + +def _find_first_and_last_non_consecutive_row( + row_indices: List[int], + table_shape: Tuple[int, int], +) -> Tuple[Optional[int], Optional[int]]: + """ + Find the indices of the first and last non-consecutive rows in a list of row indices. + """ + # If the table is a single column with one or more rows + table_rows, table_cols = table_shape + if len(row_indices) == 1 or (len(row_indices) == table_rows and table_cols == 1): + return row_indices[0], row_indices[0] + + arr = np.array(row_indices) + front_non_consecutive = next( + (i for i, (x, y) in enumerate(zip(arr, arr[1:])) if x + 1 != y), + None, + ) + reversed_arr = arr[::-1] # Reverse the array + last_non_consecutive = next( + (i for i, (x, y) in enumerate(zip(reversed_arr, reversed_arr[1:])) if x - 1 != y), + None, + ) + return front_non_consecutive, last_non_consecutive + + +def _single_non_empty_rows(subtable) -> Tuple[List[int], List[str]]: + """ + Identify single non-empty rows in a subtable and extract their row indices and contents. + """ + single_non_empty_rows = [] + single_non_empty_row_contents = [] + for index, row in subtable.iterrows(): + if row.count() == 1: + single_non_empty_rows.append(index) + single_non_empty_row_contents.append(row.dropna().iloc[0]) + return single_non_empty_rows, single_non_empty_row_contents + + +def _check_content_element_type(text: str) -> Element: + """ + Classify the type of content element based on its text. 
+ """ + if is_bulleted_text(text): + return ListItem( + text=clean_bullets(text), + ) + elif is_possible_numbered_list(text): + return ListItem( + text=text, + ) + elif is_possible_narrative_text(text): + return NarrativeText( + text=text, + ) + elif is_possible_title(text): + return Title( + text=text, + ) + else: + return Text( + text=text, + ) + + +def _get_metadata( + include_metadata: bool = True, + sheet_name: Optional[str] = None, + page_number: Optional[int] = -1, + filename: Optional[str] = None, + last_modification_date: Union[str, None] = None, +) -> ElementMetadata: + """Returns metadata depending on `include_metadata` flag""" + if include_metadata: + metadata = ElementMetadata( + page_name=sheet_name, + page_number=page_number, + filename=filename, + last_modified=last_modification_date, + ) + else: + metadata = ElementMetadata() + return metadata From 9960ce5f00d6b2a518bfbe5cb7533bcc9f0bc333 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Wed, 4 Oct 2023 15:14:21 -0700 Subject: [PATCH 2/2] fix: chunking fails with detection_class_prob in metadata (#1637) --- CHANGELOG.md | 3 +- test_unstructured/chunking/test_title.py | 40 ++++++++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/chunking/title.py | 9 +++++- 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9ee29ccd1..e3b73ba615 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev10 +## 0.10.19-dev11 ### Enhancements @@ -20,6 +20,7 @@ Problem: Under certain circumstances, text immediately after some HTML tags will be misssing from partition result. Fix: Updated code to deal with these cases. Importance: This will ensure the correctness when partitioning HTML and Markdown documents. 
+* **Fixes chunking when `detection_class_prob` appears in Element metadata** Problem: when `detection_class_prob` appears in Element metadata, Elements will only be combined by chunk_by_title if they have the same `detection_class_prob` value (which is rare). This is unlikely a case we ever need to support and most often results in no chunking. Fix: `detection_class_prob` is included in the chunking list of metadata keys excluded for similarity comparison. Importance: This change allows `chunk_by_title` to operate as intended for documents which include `detection_class_prob` metadata in their Elements. ## 0.10.18 diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index bc8bdcc6b0..1e5bc88ef8 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -288,6 +288,46 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars( ) +def test_chunk_by_title_drops_detection_class_prob(): + elements = [ + Title( + "A Great Day", + metadata=ElementMetadata( + detection_class_prob=0.5, + ), + ), + Text( + "Today is a great day.", + metadata=ElementMetadata( + detection_class_prob=0.62, + ), + ), + Text( + "It is sunny outside.", + metadata=ElementMetadata( + detection_class_prob=0.73, + ), + ), + Title( + "An Okay Day", + metadata=ElementMetadata( + detection_class_prob=0.84, + ), + ), + Text( + "Today is an okay day.", + metadata=ElementMetadata( + detection_class_prob=0.95, + ), + ), + ] + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) + assert str(chunks[0]) == str( + CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."), + ) + assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day.")) + + def test_chunk_by_title_drops_extra_metadata(): elements = [ Title( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3d63527b85..5af4c987f0 100644 --- a/unstructured/__version__.py +++ 
b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev10" # pragma: no cover +__version__ = "0.10.19-dev11" # pragma: no cover diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index 0c5bde799c..a41ae82534 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -197,7 +197,14 @@ def _drop_extra_metadata( metadata_dict: Dict[str, Any], include_pages: bool = True, ) -> Dict[str, Any]: - keys_to_drop = ["element_id", "type", "coordinates", "parent_id", "category_depth"] + keys_to_drop = [ + "element_id", + "type", + "coordinates", + "parent_id", + "category_depth", + "detection_class_prob", + ] if not include_pages and "page_number" in metadata_dict: keys_to_drop.append("page_number")