From 55315cf645ea10caae21be5947d2f5774b4ab076 Mon Sep 17 00:00:00 2001 From: Newel H <37004249+newelh@users.noreply.github.com> Date: Wed, 27 Sep 2023 11:32:46 -0400 Subject: [PATCH 01/31] Feat: Native hierarchies for docx element types (#1505) Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents. Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number). Hierarchy detection is improved by determining category depth via the following: 1. Check if the paragraph item has an indentation level (ilvl) xpath - these are typically on list bullet/numbers. Return the indentation level if it exists. 2. Check whether the name of the paragraph style contains any category depth information (e.g. Heading 1 vs Heading 2 or List Bullet vs List Bullet 2). Return the category depth if found, else default to depth of 0. 3. Check the paragraph ilvl via the paragraph's style name. Outside of the paragraph's metadata, docx stores default ilvls for various style names, which requires a complex lookup. This check is not yet implemented, because the above methods cover most use cases; a stub is in place for it. 
--- Co-authored-by: Steve Canny --- CHANGELOG.md | 3 +- example-docs/category-level.docx | Bin 0 -> 11220 bytes test_unstructured/partition/docx/test_docx.py | 81 +++++++++++++++++- test_unstructured/partition/pptx/test_pptx.py | 2 +- typings/docx/document.pyi | 2 +- unstructured/__version__.py | 2 +- unstructured/partition/common.py | 3 +- unstructured/partition/doc.py | 4 +- unstructured/partition/docx.py | 50 +++++++++++ unstructured/partition/pptx.py | 2 +- 10 files changed, 137 insertions(+), 12 deletions(-) create mode 100644 example-docs/category-level.docx diff --git a/CHANGELOG.md b/CHANGELOG.md index 81a04a1890..68fd83f0db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev9 +## 0.10.17-dev10 ### Enhancements @@ -9,6 +9,7 @@ * **Add document level language detection functionality.** Introduces the "auto" default for the languages param, which then detects the languages present in the document using the `langdetect` package. Adds the document languages as ISO 639-3 codes to the element metadata. Implemented only for the partition_text function to start. * **PPTX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except that shapes enclosed in a group-shape are now included, as many levels deep as required (a group-shape can itself contain a group-shape). * **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to Azure Cognitive Search index. +* **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number). 
### Features diff --git a/example-docs/category-level.docx b/example-docs/category-level.docx new file mode 100644 index 0000000000000000000000000000000000000000..2611c2cd2987828674806e444ba68b017afaa4ad GIT binary patch literal 11220 zcmaia1z223v-SW%0|a-1y9U?b?(XjH?(QzZH8_I=cMGmTLx7-zyGwu{_LI%-{qNqI zr_VEedfv9`sqU(N<)t9rpaEcDU;wJQTxx*dlwYb`!>Xl;Wl6x!@qBTys7fWudNUD;Xnde1AR{E*ZnyyUzG*v9WZ~ zGhDzMH<=+4m6W^a112v84Fht#7T$ds{uDR>ApgHk5c10}jO~o%9qsI$7>w*3P3YZi ztYZ{)>;f6ld{(OoeZtQ32Uz`w8*f>X$P60~fi!A|P(>CMyIp!<0!?={OGyf+nR z8|o<&V9DDJ;__>ELts7oSy99Snbvy~355vgoOD)Tuv`J6QJ3_li8Y!%QRQTmeFEI` z;^w`^fvyAMfwiXGrKQ)Eg6fW16_Ge^RRYLN^y&e4_fPO@`sT6-{lI)0F)`=Hu|;#L zoAN_=$I$#>xZrObr{Lb-aO6`u@A8bQlF`{IVA+cxILqTQ;(CiM+R67nH$DTpfH?~* zrSzCbgD#7+f5yq@8ZRH`HLIy?qET%7Jp9R@;-qYU-8)=bUFh*=J@>40#jVQhE#Aw3Qz*2|)5-2M% z)@N709-3R5t$-6_^$qH$zz-hltQoan^>6y`*v`LPN&=8uuFPhQsU=3 zv=iTg*O?=SS6&kBbNc6|hcuJ(eXsGR%3^;9+*4-39BC6Re>Q?S z*$v4=`FN289><0ee3YHxeH5xhT12*NVW6P`&>4X+%#VVyBqnpvV$C(VKp_0hYw2Jg2Z2f_g{n&3SjW z&-y5ZXuNuI`_qod%cjuRYzgyqTwAU+SKDCWNMWcIH1d&t)rqk8*B|J8=#LHyH zab(U?u|_RgFSA@Ba8{;#R8K#}E?=wx-MP;nx_|WJd24JO*ikz#vuv*AclfJ;6}RR1 zV~OiWG?~*#Lt(;7?1=`&Heg2K9WS0K?-KsFk1T!Q;W6G_713l^fA+v_zz7gqDeZ$O zgH=Fh;hoogYJ;_T3D7gG5woXhTr^YV=4^h%eufRN77AO(qEIOpe-+!s`JA!hXWU}R zURzLZq1Xcz$4}}SO9ur`f~}FfMEwZ;n>Q@76AJP)dY8Yf&;8Vh_)^Q*Nn86hGwO!P z!X~6m+fmf@8@*z2d!emXh(GNTvnj5BCzt3u~ig!bj! 
zZuQp@%ot&%Ye=aNMK>U&+qZ}${ry3MHc36Wh7{v8^m=ciK@T^0n&H1!Zj4N*gqrG9{_i66D(I%U3#^4w3Nar8;~7AOQc)d9P>C4gi*KR zyL)wx>!a%w1kxYSF(u5KAtK)Y&}XdWM|N(+D;w#p{UA(V0k7O3&7clKFAB1RZcT@a zwZI|f{`zZk?>>d}V)xq#L3LFPL7&;I z>Jl>T7U(0#?KlbQxN)z?q|aj*PbUEp{#CNZUGZ09xhwZlp4m{FUD!#Vrl;A1V#%oD zn81V)37`@twFXxFKGnKf*=c?Z+_h7&kE=sNZ8=tssLbPSY>Ko zfIS9MrOe$FSmQ%l*vFDdmUuw-L(xHu7q5PR*zY{XB@hvj_^LV=A3Ojh#d;DT7$qBT zIsC1U$gctCr@xdT+;V8;ju8%4A4^el@tn`SHl z2One_2ZsqPe#sOyuE%D3>u^UTk8+pKg1F(;N_HYc_VrE0R;svhOnX6&~r!|5^^ahA|nD6;Kw zfGnv!weH>&X)6X@1rP`cx{ zWF{;{;g4~Z9CVC~4W!seWUL|Vv7qe*)KvGYCkc4*ON_ecwg?@y5zcr68{N=BA5u%C~W8$mU0-&|NluC1PA0;}w+UfjJ z0)kxQtyy=!%f4Z6l2a7yeGEJj+AMgcfVfp3Rk}7l;%cZLO$6o-+_lpW+22P z;##ebuJkP}@;mvHmN5W&iPeE}xO|cp8@s}RQj_1{(N`td4#>5uLqWY2m;%h(oQRc; z$v_F~H_d@n&WfjZ4}*)ekkNF|yAy*gW~hUU+k#w6k7YZ*^j^0R!0=|!8z=yP0p*{z zkaz#tLY$mEtWBI=H`}9ZEvKbcPnVg!}vNk_6u|-x(coYYCY6D=jKRyjQp?SvfNQ6XBLKUteLcNdk!4AKD zvbJ~gIy*LAE5eR~z!*u(MbLHUy4~#4^Y!!0HDw@@<24~DImw-Xs-V^CB0yB~>nBNl zB46N9U8OTYsg?GTJ`R&(f}M}J^vIz0MnfEgh8F7!k8qba5LE(12Imeb)7PzfVy`QjaU# zoNPTuK9=Dv4D#!E`?Z!9JBLtla7tE;`bVf z7bwm20WM#WhETsm4=%Q!%3aav=}J5YKwn7vIIwYGp@q2V*`g4k7CkPmUDnfPU5IN( z60SKMg`G*h`4v_^qcaj@bC-)bP%;9pw_SVh<>+m$O)ecYIr&k~jJjsF>B9`T_Z(FI zlM8(#H2d~*xqji7h0H^>ti7R2(T8v)7Q^LmajS6##4iii$oKq7OhYB@65pJoeP^hW=+@b<>iOBVD5Yi>}Fp$%!%&b^lyHk zCLYeKaWGFcqrqnl>@FkOrbyHuGc(@oOA&v#NeOi)xM188uB>LEH&uUhf(Y8n#lez( z9_!Y>r+{J|FKsRTjC`dmL9$h1-w;o306>d7(G$@rK z!R+XdjfybB`94&G4)VVx9&S2FkG-quf#O}#Bw*rqh}l=u*6G~R>#PIQU6#NWAeyrjSIje zmKg5M!(YtcIPBbrT0-8KW+GsrXS^6mx19c-vOHC*rp)(9_&rQZrQ&M>^$#sSp zljV5wNG9rpF*;S>-JLZ%4f(~+)1*eC5w^-*<$lf;(`>M6JYLDmjE_hYZJInGCKrse zg>>-n@&T=kG{`c$3{C;~Mrf0evFny$e;vH*yv3D)oK!)<>q2U{&NBBJ>NXA3PoxrO zA|x&|DGrAm#Dj3SoO?g0KMQBp+BoU6LhT?IB>jn z&+ZB>Huqg)Xj=K!7zlJ%9tXP59v$WWO5wpCF-{M2##aX$_N)OQ3vUU$f zVB^`pMrn;+N46fu{<(^Z93NqzuPlj$l>!eXS`-kF05|mvmyJ{TY{JE={%qv}hxH95 z532B-d3rB*msSoz6l4bVTS}69uSY*eoMszYMpWiq75X{Td?Q-SD!pba1m=!&>n2)1 zs6KCn)UnJ?H{QVknZ4hUP0u(v>Ihtn?EGCPVMT_i!hDpzSaL{yvapxc66~P*vF_M) 
zMtTchp&Pob;PyodJVcKAkJ$`{$%QdY`-A$2;Y1pTu95aT%ep1KK|K6+OAWF^ZqsUl znzG5b0bofwc!HRU$ogEJSkm#p8p>kf-oe09%GX$sftZy0Sm6R)Q^gkk=w@u{X&MDS zT~B|fn9bQ~1lhZoSo`H|zD&Hy4evo9#Wv06Kwl&0^uU*4$hf6;a|-2lr_}|sw|QJE zh6A>%bnb`OX_Q}-7}t$UoMUOY3+TE#FT@hqjupBbpS+x|-eBF7WQ$GkIi&~8%x)gD zyZ9))E#Uy^l9)|Dew~}3G&&ia=!?Ps=CNE$!u5{dRk$Q!$TAiu@i2M9DCy;O{1=j` zLi8+T`$F{k!5G!4pC?zZFAkW2-@A@A}Z)hgDI$OpiUWrL$D{_9l; zjB{QuBo@7VKJQEFkjfK`d`WOz@9@j{=3eU-d_yfC(4zF0D((3gnf|Z#(24(NKLc?O69V9l z9A1*n%K)_hG6=yp|LHTFvi>nzX?)!3V z*siZ2`>vGU*y6o{Lh3QYCReQ$sIpUUa|zT6w6OodwwuvhNiV4|gUo1)Yic<_66LXE zuEQLrv&<&C0B`WFDQ1jS=zEXNro;H@)J)Jt=as>rFV_!kJvbPfV|5a|C0H9<*oH61 zM{XG7vb*Iu*;6W_++al|=s}qCv4S+#Vl5H~eeg65UU@%Y)ys%_BrN?MKH(H(6@^l= zm#ue)r{JK}D7L+ua1BhF1#Hr@3Uf*!DA+3YmBG`nlIj;p)@J^S zFu^JmW?g1_ltwoP6T65qJ3^;b9<`lHH#g@rn!3-_Y9hri*JNF$p#J(WXtExrXQkJ& zO#Hzszw)Jsyv$)SQN>cbahCe^Z3@mOy2YXa6Db+R&DX*eXtNiwm%j`$zvr4KK|bzqb5G^_s8EFN_Ey@qk;tuYGX@b1>b8r&R{- z`La&wW!$N=j-~8ire3`E-bH9tMM+RevhB z92?*Lo|-C=o~g;qXvVMSPSNp&00q|QGGCfV0(X=Xw`trC`|)ORHFj#bpF#^Q%k4Im zNZC|zj~MKns+GCQi}F<@&T(uRKy^j$Hhk{#)ht0i8AW7k&oh0dkLtIeyOU|V_Z)hD zKzzT0qWer5A&#qXwt z7UHQDB;bdyhbJpHE^muy1myuzHVr4RiM(y~aemFCRe{7~a*o!r);EttQaaiub zcN;uxx}35_De7b@u$)Uyn)&N{y3QUB1!cW$7?MY2>k3*>DB9 zx9M;@pZ6ZMv7O2Mz|L&=?2$1%R4JN~QJ0R@ZgY7UVq`?2vGXSJZtc^m62cfzmPwRr zIBiH`pet+3&(hvHyTj(Ee@5#Sv-M@r6mU4o$I!>7mL11UYfdKuQZABWWGn)ee7T5)_r}y?E_jrw?7@$)>wZ%syx=^XDv0 z+(`?fd(uuaS$Xl`0BojW*SGqz3LRXNoNr%N36;Ljrs}*a@8E5faLkmfQprxSt6HkI zQF+qbVdwBp$vR*)Iy1yw8gXC|ZE0H(5;;4eJ<95Cz5e7o)5E+AEJ&`b{weEwl)Plt z$I@bb^zB1+!@CL3q%xtZWs4-yuQ&zl=9lAy+t4(usZ?#_a^8=hJ9lW!4DO~FKLHqY z`_wwHjN8goctguvltz~Zy!#^8(7FN}7no-ra=p-hV4Gdt6sgw~Gz@;esj_K=iF&W< zH|CC|Tep#<A2^W(Qp>9LyjwHH6ob}Rp z3i}3+E_YZV25-IrUMQqFhe)aYGZ^zlvG@fgr$e0pu=aAU&godZ;bwy{05FPCW`61C8v?q2z9aSp#o}$??b!x;{rII18w?zHyYwJvcyk?L0znQ zB#PU1tr4W9MqA4Ta75$s9g#stcYUnD)7(CHV=P6`JHX3^Fcw169KG z!kZ>DwYaCjqGAX|-|blR4m_%C zbBhm(cTLd`MlzwT7;Tok#L~23m++5#m7#B07%@h%oX9M~uV?d?K$)33gv`ha2+d%@ 
z@#Nn_%#CbtIAV!#scW#yk|n=OC1ZT=1?qBh7i}uj2*!ZP_293%n8kKW7cfP`DHNN? z4`9KG)-zGp3Kan2FcTfDPi1$`G>G3`@S5?OkpdR9G!qYzK3 z2R~rY$apA@r#8%5^hs!ktGgh_bcp2?TT{zv*GJ^xZE7BwBoeo8N|oFU7cwG5(iGIP zH?AD``qRu%zO5}PzY8+hj??;sy5JTD#%@ic{*qx%P!=e^i{|CekG(Vd-nIKI;3IyM zulJ*Gy&qvp%FO5B5(1Y_bq=W(RV?v#O&navw zz{l$PvhG8_A{h~bXBwOGm3uu4rS!RH$xO+@C$%5SJ(so9Qnf^H6Nu#amgdmJV!%uI zb(`p$*YJCbGWdKuwE zPae<}jtfKjSX9thY<|ExU`cal&KISgEA$8ScIj#IXEeg(!#0!EUVOZ!8Qt8IrKuGe zG?t+c3sQz-385RVbh>Nt_VZ2+e8ruZM5Z-Kqq^QMxvN<719Mdrhww8+WS|%MEP7Gf-hw_>b zw6;>8qhqCPeL5I^q;8@phLd%F*qr`-Z~@fdchE~Xj!DL3`WZsT($CYy$CJQ(2=i*s z=WVQhQO#R2=~?h&mnUm5>RQKU5j1c#MM5`&VkF5T&K046hV^5H9s*qC~Mr2R}@rFEd%HdoXM4+!M1oK@VdT9x{W}hlnbNSV8 z`6(W`hg;IU&&i9`3sEmWTF@DE4$BbLfQ5(h1J!@5#lKK^vkn$7fPc-~|C>Lt4Q5CJ zOgpSN3~FDo&9$_ccZr|v{0I+RBO#^uju6-qmgJo|ZJv2=<~?1$3Io4xm#5497HBVN zRWA1A2gxjm4Lw?yeLtS)a3pUo*yH8-^eQzqy)p05~e8RweciQ zxtRhWAgd5exIg9*>y-MlmxQ6L2ZhUWDgx}pYTqoAylH|}soHbpQu_usL#iw%TOzW| z3y4*Ada%+xmZcy*PLM1j5Qy0Rh(QlcBu%(5ZDtIgFL+2Kn5c6j&ksvNQpGE*iAz`{ zac|x|C&*2S;hSCt1Lr5zxagJF0IOPd(oKR*bq&=`oq{A~+74Tc_f>L$a6n{>ek~z# zt0x*Nu?HJ72a$+e(+vh&X?VGaIpWwR!F1MkwXaTHR55mddW)X$bNFF=EkS^L!B%u) zY4~GSj7T`RO|Cr|Eb&QvJ;N_RqM{oge|)Z;-72PqI6mQC5|xS#toYU=M;{45Wkic5 zRna<%*u=TMBp>ZZ8j= zrG3KO@h302F(My7RF0M?={56G@jFIkXla6QYM~U{kRfHdMB5T+>+)pW;Y?Q$lo4<@ zLgD-;oH_9NjK2CB8GYb@AV*KL!|5zVR)a~S+HS=DKqW?Q7s=;dLJEjQdATAW(OWZqBSD9v#*_eY!4*Qkp!;9*xv;Vo_XuTcp;UQhgTa zR?7&diyW2&HI2EJlgW&-PA=$~c2D(5&;6#CSI1P^W3kcD1L3BZhc1CKI8(X+DKvqD zXFiPK`$A0UkGaM~QJ^n&CmWj+gzQ$1EA4gGCM+vh8p~C}?I#JA&4{VNX=zCNm%RTZ z!Xr_-i6Hg~00UHWsL=U@(95L1!I^b6a@uFqG76k0 zqWhOH75N2TPQxc>xPzgzs_GH>bbF+0`i)tit|`rFT26k;B8|gy#5-^rvGEb5?N-qh z?;EPKGibp8O&ytW1tl78Zu%NRN@+E^+Dx7O@i-oY`NDb1*%S@CO~XuhM3F5w;Rde$ zEKv1;tnl3tLlxY-Vl>t(vEl_spQQ^L)0G2N?Lk%=muir*X3mbf&Lk7J z&BM!V#~F@}?i@$>U7uO|+U3UBEw2nlDL!cO8wMG>)Fi-~x!t7d__|dS8KjG;v@O`< z99b5v*L$Kj%~Yx3_%`V5DL-nKESzV7W-#l zF5A*LhHsDEn~~QBbcRTnJFa_NCpeDpsN#kV8WM4~pFz6|;N!A@A3)-X2_MV^v{Jr- 
zzAPn^M1jH07V@ReT$r%(VC!t)M0qN+`I&8P=YRC;id8$;+ch5Wcjw}6e@Upj!mA+?l??QHg&_eF* z%y~Yt6BFOQI}a&HW=dK9i&!E^I#%rdavqs^QJ%aM7&sc>PyXWX#JyMk;$P)AHsjxc zzjMi69o|2M=%uOu!See%{CC>ZE8*-<8GJd<{ulml`q|&nzmtnz7u=ts_2xJ9pG2v@ zSMWR6=9Po=r!c=9+Wv`=^!J*6zjFQSJea&p_y1t({2l&#MB(*b_D?B=|2O>4TiU;a ze|Ka5+BOW%f55+mApQ>j-FtZT5&o1Z+<(LW;`#m^|NA-julTh0|Hl8t str: + filename = str(tmp_path / "mock_document.docx") + print(f"filename = {filename}") + mock_document.save(filename) + return filename + + @pytest.fixture() def expected_elements(): return [ @@ -85,11 +96,12 @@ def expected_emphasized_text_tags(): return ["b", "i", "b", "i"] -def test_partition_docx_from_filename(mock_document, expected_elements, tmpdir): - filename = os.path.join(tmpdir.dirname, "mock_document.docx") - mock_document.save(filename) +def test_partition_docx_from_filename( + mock_document_filename: str, + expected_elements: List[Element], +): + elements = partition_docx(filename=mock_document_filename) - elements = partition_docx(filename=filename) assert elements == expected_elements assert elements[0].metadata.page_number is None for element in elements: @@ -416,3 +428,64 @@ def test_add_chunking_strategy_on_partition_docx(filename="example-docs/handbook chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks + + +def test_parse_category_depth_by_style(): + partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None) + + # Category depths are 0-indexed and relative to the category type + # Title, list item, bullet, narrative text, etc. 
+ test_cases = [ + (0, "Call me Ishmael."), + (0, "A Heading 1"), + (0, "Whenever I find myself growing grim"), + (0, "A top level list item"), + (1, "Next level"), + (1, "Same"), + (0, "Second top-level list item"), + (0, "whenever I find myself involuntarily"), + (0, ""), # Empty paragraph + (1, "A Heading 2"), + (0, "This is my substitute for pistol and ball"), + (0, "Another Heading 1"), + (0, "There now is your insular city"), + ] + + paragraphs = partitioner._document.paragraphs + for idx, (depth, text) in enumerate(test_cases): + paragraph = paragraphs[idx] + actual_depth = partitioner._parse_category_depth_by_style(paragraph) + assert text in paragraph.text, f"paragraph[{[idx]}].text does not contain {text}" + assert ( + actual_depth == depth + ), f"expected paragraph[{idx}] to have depth=={depth}, got {actual_depth}" + + +def test_parse_category_depth_by_style_name(): + partitioner = _DocxPartitioner(None, None, None, False, None) + + test_cases = [ + (0, "Heading 1"), + (1, "Heading 2"), + (2, "Heading 3"), + (1, "Subtitle"), + (0, "List"), + (1, "List 2"), + (2, "List 3"), + (0, "List Bullet"), + (1, "List Bullet 2"), + (2, "List Bullet 3"), + (0, "List Number"), + (1, "List Number 2"), + (2, "List Number 3"), + ] + + for idx, (depth, text) in enumerate(test_cases): + assert ( + partitioner._parse_category_depth_by_style_name(text) == depth + ), f"test case {test_cases[idx]} failed" + + +def test_parse_category_depth_by_style_ilvl(): + partitioner = _DocxPartitioner(None, None, None, False, None) + assert partitioner._parse_category_depth_by_style_ilvl() == 0 diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py index e4c0e75426..3540c020e7 100644 --- a/test_unstructured/partition/pptx/test_pptx.py +++ b/test_unstructured/partition/pptx/test_pptx.py @@ -112,7 +112,7 @@ def it_recurses_into_group_shapes(self): elements = cast( Iterator[Text], _PptxPartitioner( - 
get_test_file_path("group-shapes-nested.pptx") + get_test_file_path("group-shapes-nested.pptx"), )._iter_presentation_elements(), ) diff --git a/typings/docx/document.pyi b/typings/docx/document.pyi index 779b6513dd..964b357300 100644 --- a/typings/docx/document.pyi +++ b/typings/docx/document.pyi @@ -11,7 +11,7 @@ from docx.text.paragraph import Paragraph class Document(BlockItemContainer): def add_paragraph( - self, text: str = "", style: Union[_ParagraphStyle, str, None] = None + self, text: str = "", style: Union[_ParagraphStyle, str, None] = None, ) -> Paragraph: ... @property def element(self) -> CT_Document: ... diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 7100ed29da..2eb7cd1430 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev9" # pragma: no cover +__version__ = "0.10.17-dev10" # pragma: no cover diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index a328896edc..319be80608 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -460,7 +460,8 @@ def convert_to_bytes( def convert_ms_office_table_to_text( - table: Union["docxtable", "pptxtable"], as_html: bool = True + table: Union["docxtable", "pptxtable"], + as_html: bool = True, ) -> str: """ Convert a table object from a Word document to an HTML table string using the tabulate library. 
diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py index 18ae998c67..5b3283cd18 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -1,6 +1,6 @@ import os import tempfile -from typing import IO, List, Optional +from typing import IO, Any, List, Optional from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata @@ -26,7 +26,7 @@ def partition_doc( metadata_last_modified: Optional[str] = None, libre_office_filter: Optional[str] = "MS Word 2007 XML", chunking_strategy: Optional[str] = None, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Partitions Microsoft Word Documents in .doc format into its document elements. diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 7aefbd9be4..506e1e6459 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -411,6 +411,7 @@ def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]: metadata=ElementMetadata( filename=self._metadata_filename, header_footer_type=header_footer_type, + category_depth=0, ), ) @@ -438,6 +439,7 @@ def iter_header(header: _Header, header_footer_type: str) -> Iterator[Header]: metadata=ElementMetadata( filename=self._metadata_filename, header_footer_type=header_footer_type, + category_depth=0, # -- headers are always at the root level ), ) @@ -554,12 +556,14 @@ def _paragraph_emphasis(self, paragraph: Paragraph) -> Tuple[List[str], List[str def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata: """ElementMetadata object describing `paragraph`.""" emphasized_text_contents, emphasized_text_tags = self._paragraph_emphasis(paragraph) + category_depth = self._parse_category_depth_by_style(paragraph) return ElementMetadata( filename=self._metadata_filename, page_number=self._page_number, last_modified=self._last_modified, emphasized_text_contents=emphasized_text_contents or None, 
emphasized_text_tags=emphasized_text_tags or None, + category_depth=category_depth, ) def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]: @@ -634,6 +638,52 @@ def _table_emphasis(self, table: DocxTable) -> Tuple[List[str], List[str]]: iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table)) return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2]) + def _parse_category_depth_by_style(self, paragraph: Paragraph) -> int: + """Determine category depth from paragraph metadata""" + + # Determine category depth from paragraph ilvl xpath + xpath = paragraph._element.xpath("./w:pPr/w:numPr/w:ilvl/@w:val") + if xpath: + return int(xpath[0]) + + # Determine category depth from style name + style_name = (paragraph.style and paragraph.style.name) or "Normal" + depth = self._parse_category_depth_by_style_name(style_name) + + if depth > 0: + return depth + else: + # Check if category depth can be determined from style ilvl + return self._parse_category_depth_by_style_ilvl() + + def _parse_category_depth_by_style_name(self, style_name: str) -> int: + """Parse category-depth from the style-name of `paragraph`. + + Category depth is 0-indexed and relative to the other element types in the document. 
+ """ + + def _extract_number(suffix: str) -> int: + return int(suffix.split()[-1]) - 1 if suffix.split()[-1].isdigit() else 0 + + # Heading styles + if style_name.startswith("Heading"): + return _extract_number(style_name) + + if style_name == "Subtitle": + return 1 + + # List styles + list_prefixes = ["List", "List Bullet", "List Continue", "List Number"] + if any(style_name.startswith(prefix) for prefix in list_prefixes): + return _extract_number(style_name) + + # Other styles + return 0 + + def _parse_category_depth_by_style_ilvl(self) -> int: + # TODO(newelh) Parsing category depth by style ilvl is not yet implemented + return 0 + class _SectBlockItemIterator: """Generates the block-items in a section. diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index cdd0a0c1ec..9ba4766dcd 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -95,7 +95,7 @@ def partition_pptx( include_slide_notes, metadata_filename, metadata_last_modified, - ) + ), ) From d26d591d6ab6cde8e067189e99a41ef7c22d1a46 Mon Sep 17 00:00:00 2001 From: Klaijan Date: Wed, 27 Sep 2023 13:43:32 -0400 Subject: [PATCH 02/31] feat: get embedded url, associate text and start index for pdf (#1539) **Executive Summary** Adds PDF functionality to capture hyperlink (external or internal) for pdf fast strategy along with associate text. **Technical Details** - `pdfminer` associates `annotation` (links and uris) with bounding box rather than text. Therefore, the link and text matching is not a perfect pair but rather a logic-based and calculation matching from bounding box overlapping. - There is no word-level bounding box. Only character-level (access using `LTChar`). Thus in order to get to word-level, there is a window slicing through the text. The words are captured in alphanumeric and non-alphanumeric separately, meaning it will split the word if contains both, on the first encounter of non-alphanumeric.) 
- The bounding box calculation uses the start and stop coordinates of the corresponding word, determined as described above. The calculation simply uses the distance between two points. The result now contains `links` in `metadata` as shown below: ``` "links": [ { "text": "link", "url": "https://github.com/Unstructured-IO/unstructured", "start_index": 12 }, { "text": "email", "url": "mailto:unstructuredai@earlygrowth.com", "start_index": 30 }, { "text": "phone number", "url": "tel:6505124019", "start_index": 49 } ] ``` --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: Klaijan --- CHANGELOG.md | 1 + example-docs/embedded-link.pdf | Bin 0 -> 15804 bytes example-docs/emphasis-text.pdf | Bin 0 -> 16576 bytes requirements/base.in | 1 + requirements/base.txt | 4 + requirements/extra-csv.txt | 1 + requirements/extra-paddleocr.txt | 1 + requirements/extra-pdf-image.txt | 1 + requirements/extra-xlsx.txt | 1 + requirements/huggingface.txt | 1 + requirements/ingest-delta-table.txt | 1 + requirements/ingest-openai.txt | 3 +- .../partition/pdf-image/test_pdf.py | 42 +- .../898538f2-26e1-4de7-81e6-354045d4d007.json | 28 - ...iomedical-Data-Scientists-2-pages.pdf.json | 75 +- .../biomed-api/65/11/main.PMC6312790.pdf.json | 682 +++++++++++---- .../biomed-api/75/29/main.PMC6312793.pdf.json | 496 ++++++++--- .../07/07/sbaa031.073.PMC7234218.pdf.json | 42 +- .../2023-Jan-economic-outlook.pdf.json | 822 +++++++++++------- .../small-pdf-set/Silent-Giant-(1).pdf.json | 588 ++++++++----- .../recalibrating-risk-report.pdf.json | 435 ++++++--- unstructured/cleaners/core.py | 46 + unstructured/documents/elements.py | 2 + unstructured/documents/html.py | 5 +- unstructured/partition/pdf.py | 298 ++++++- 25 files changed, 2552 insertions(+), 1024 deletions(-) create mode 100644 example-docs/embedded-link.pdf create mode 100644 example-docs/emphasis-text.pdf delete mode 100644 
test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 68fd83f0db..4569f5f12e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### Features +* **Adds `links` metadata in `partition_pdf` for `fast` strategy.** Problem: PDF files contain rich information and hyperlink that Unstructured did not captured earlier. Feature: `partition_pdf` now can capture embedded links within the file along with its associated text and page number. Importance: Providing depth in extracted elements give user a better understanding and richer context of documents. This also enables user to map to other elements within the document if the hyperlink is refered internally. * **Adds the embedding module to be able to embed Elements** Problem: Many NLP applications require the ability to represent parts of documents in a semantic way. Until now, Unstructured did not have text embedding ability within the core library. Feature: This embedding module is able to track embeddings related data with a class, embed a list of elements, and return an updated list of Elements with the *embeddings* property. The module is also able to embed query strings. Importance: Ability to embed documents or parts of documents will enable users to make use of these semantic representations in different NLP applications, such as search, retrieval, and retrieval augmented generation. 
### Fixes diff --git a/example-docs/embedded-link.pdf b/example-docs/embedded-link.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4aa0d213242bce609cb01ddafde367453985fe8f GIT binary patch literal 15804 zcmdVBcRZE<`#4S{dz6UKNs$(3AF?-*y+`J;9pV_7DI*~i8HLP@Y=x}sO+r@4mK7zk z^1TmvMPBdTzsLLY*N-~KecjLdTF>iwUDthG&nKILn8Z~GCybP>X5#C_?2B*D-__KS z!a!h&rhbaua{E1RO{cRc0HU!>UQe7kiK=N?JaV;`C%tIjb}%VBsHLn zr`lSgqVm7_N|p-0Xhz67m;3YA!|bl8C3~fMcj%zSZ0^6%+`ifW@w3$FFJacXb))#| z>KY%j%ZDEgd`Dm~>JK`?B)n!g5$#SlsA5S5@)6prOhpglquxy93Nx{aG1JzccGvvK zSk(%{2Jum&i%9gb^8FQ{YF1(n54x8Sve(ii>#EuhdF~1pk#T%4aYYEo)SJH)g(^kt-;NQEwk=EI_4^@ zx3-d>hs7ipG=4J*b9p&?PnjdJ>V`s7K3ICnD&4BtJ}MB*r`O3$bxpY$`H41y`#an8 zs26j|%?LHbV6tnme0`D}yKIcdqi&K@jWeLJ!PgBuUqx4{RxP$mZBD%RfoUWPKxAdU zEib9K-MN|aHa1qg2O(HCOs8x#NXE)hYe|Tdb3ql&dg3Lv&&pN*W+PZ!&rZ<;&D-g^^~ko6cY-uAgH{q4_?3Yo0Ti&Zxw#~I5N zQ>^Kf@O{KcvOn8qN*tBQ7agIDmLj%j)w`aspyVI^m6X}I`dj6+&vmx;Qf>j+Y0!N> zR@0?Oglqz<>FjI62$A5KYqppbtSE%4C4q*~zQOK(N>j|oYl37pZ7p48lW zRr`!N;F^k>k%;&O*4IpaDMsf7XjcPjJd;7etwo`e)pBVcK$IrVFJ5%z2*NodUk%2N zKQ)g^7UoVPJaOuEM&Cn(MPzn**L^W;P`YX%Z~crvyNHf_#?P1A?^#w-IO|@VfL&X& z9_YE6`>4Rr*hYG(>m_2Iq_)cE*BqL0qCX zRyKCZwgyIM5Y7vT+CxByUuXbd0MOq7tEfOgNF11yOGO2sx;NrCEgWqeEo~6b&P@o8 zLnROdCpqx7D!D z`mq{`(~{l0YuH2bm0F>$Qc+BNVldE>Pc+x89yvvgPpN_bcErSlQ1XY^k*>`${-~%_sk;1?1;*pjn!A4#l59p@|jjuoDL)L_BSi>;q%f6;WQ@o_Us<)DY5!@=~+R&}{o)sLRi(^)qBT4=^LL z!mP@ZE9Ak(c5De(7!rrQttyWk4>^W?P73$p1RuGgPr-BtW=k*x!t=f(MNRO8fK%#- zxe#HVus|;!sW7A$-|?P>Enb(G%R553yGUCi9YXcD6t-mKcO->KeJf?a_(NXi#CI2| zPbLVbhtMTaMTvZ%KCMN3TU1zqo0gzf)IWs!k z{Mb7XoM;Q*-J2%zNT&fsFW&1reL1{L#FACbpTwL`K08MLc3|Xs5uE^KVHN$z=|#LN zK6+J;iG>=72|6S+h)*wxiG*mJ$EP`K8OoY;BuzXzgfi)LagbE#t#JY(zj%YlQ*?C< zV}6=}BZg`Q9}KU9yCaPv)nw@>ktWnIueH0$hTXX-8W&y)I+7r1o(5g5QPU6B*UpK^ zCFnJn<7J|(f6Fk+Wb09MN(YKTWDyop^#qpOnXRT6VPA4Cy);DoX}( zCQp{(;4^g<7eYw12rchYoqYm2Z*^XUnvr;lc*~on3a;ptDvxFLrM_LGV|X$5VXtth zG!whrV_Iq7^YlvRFGdx3sYqL}>az2*mz{mwc+FVS?v$p`P5G?zaUeZ5sz-tfVI1tV 
z^4FQfMf&7;t``g47P>8(tVEHD>C#A#d=sN3Unb^$-JMJFF)ZR+#8_kI$iFxt@H@ZKn* zR*|jrk=LWb z{P5s(Syh{E+U}Ti>vZ`pum!t?fkmIiMmJ|hQ2UE^r`YxwJ^1$crr%7?8-JIgb&K!S zGyF4;8KW7?85tN=86)6z2!?1(bYt{H^c1|lz)Xwm6`|II*3_$;;P9x078Mh=`~aT# z%&hqrS33lf1~d3GjZ*V9OcUku+fL{{;?zb77wQx$JBi54aA|PqWh;J(7f_7Lj7zGJ ztkCvsdixI4uOA@%v@$jRL41WltU=<4(Yf@{jE+l*T8X@M+(xf&Jy;SL(`L`{kCA>P z9n^S3xwmUbMgL8EK|8tSq!~Lr15r|+kn>Sf^QFXx=RDPX4Q`3IpRQdd;}6gXh@l&# z)4MW%MfvLNl{1NTx!Q|$&6+DPpn<@3m>f-ezExSLg1Q!zd(?{iE_vC40o;{23tqA z9M=cG^(;=U_iV@$audEJnI=sllps_)CO`%yIeyfEu(}ezvgGa=uJ?N9b-L(|h-`{F z`tXq;$aPPy+n?gR0)NRqce_Z{*~8h1@T#G>m}pIjRB*%l2FZXRIvF}AiAKp<@ovc* z;;!Nm;xZ8$S2Xj-B?7eqK`H!nXHK7VJ@06jw)%ch#&2wHlyq!?6Q=Vdf3Dx|)kIoz zdsc{ErRJ?mET_*f4^waZpQuWkkj+}wD|VV7?0H<`xAADI?($U7lI{=mW+-8Vcb<2L zSVWDkzmas7^hxQlBl5zpJ~`y+ef1ho3kwNLya-F{NNY9sF8EQIz zS2a8lEc&L&yn8|QM1oL9s4R!Qrb$WU!=JL>r2@}CWxJ+8@H-2l}@IaU#oE%Kc9e?_6Rw5QGDKmXOWa$hP%Q@**z8U#mJ{xQevX<&wY?f#a zLZ2SSJQ(Uye6E;Nu5XtAWwK7BSVVE8>}s1frB$ufNvrumll)AJfe6-{9p4R`M<0yJ zvweItJFMxhzBbsh;yquMypp`(^vRLf>Eqz%H{Ud(3^i}|z03Nfn>o?)HQ{SviTYf* znt}E$eJb-{!`7ai_(24<#{^ht znMyqyRuwEWg{^B1K90PsZl6tPq*Ec16QJ=^P5a%+C$q;K6}a&=j8EKq|Rq4z@Ogl znmShCO~Ls4n5>+vp=Pg$?3+u)c4-4;8=I;T8Kzl0y1ec&?q&17TR|UKN@Abr&3TxA ze)Yw=?6YnD)|0W!hvi4zr`B~hxQ15VxKBLXjyX*bK(xNVS?;=h^CSOf_o1zhV`)@3 z1zTLWx6HPT=Ef2(kjh_*O&84E>i*GVl2*1n_q<2HJ^#LGuy2tl*LL_?wtV%`_O$lP z5(N@7l9`h2fop+-F1gFKi_L)%ont-~7k|E94e!W^^KjZIGc|0iS+wk!jS{>*lv>WS z{$a&>%6NLb)5-ix%8Lg>8Rdd2UR&`SM$7e~pXcO%$cM)T2|BrvY<_iSw>Fs^UiJRA zaX4S!%S(Z5Sz1(7#K0bH4BAOnl|a9fS6mjnm%Q#K^~zW~2P3SC9U84k z%PyCKDQNc{0H6Tf&)*J}zoL>g$5{R+RImf6_U7w9iuyOM;QLYSsQ0g29oE{e2BEui z1>KeVpw|BDr2oU;?(81W8gf5>J5>J4-#=RW7gVSNsQ#;g{EaK_1E_G;_*bqDYi*Z3 z*sdMJc5URK)^H>Kigb^^UyI>@)_@n#eT!j-%wKWssqJ5JK@Q-;@%mR>f931oBKWVk z4k&K_Vgg>K_X)V;0shL=0Yo^50^4;Ou-!!u+g%d~zk^8T>{wm|a)o{o@aMd3B|5U%h6=56ze-8lyhPkml z&ea15fJn1%dSQn>h^PV9z{8f)Y=gT1A{P*q&;TwVx*Uj4 zq`#uezTmW1G1_AR#HFFB1uz4YnSjsVp$}&=d&&|9!G1+45gR*WpoFXq+{FC?)ur9A zvs0b|s5=;9cWPg|GpvL*0?JV+D6Wph34;L91_I&cgu_q>5E2U4CH>9T4*orIzfc-s 
z&24}<1H?;JCF$J{*skZ@_uQU?<1oWwZLe{0IXO9T8ra%ep*f9gtS|l(l5w)_p$DY- z{QZza|z$|n_T}3mk=nNlLv|8 zk{kX@^E^mS=nmgJ2bkTVvPbkkYaYI9u)j6G1He7Ufkj(gLn6QkfF>Bi^IPXQ_V*AA zgW$ic_x~mhAjGa6At0P!Bw$b|F!z4d?@-wz`WI<-9pXM|_RO8b+5qU1*j#hK0OegE z({r#x8ylDlp$+V;ZkyWKIAP5W7}oxo24viIu>ZFi10?z-BOJnsF# zPKEx>yx_ZTexHncDb&tdu?H55sX5ln!4M}9m+C%wuS&~v?U(FBLg`!;Hq}&&SGeLBRg|jtc@Khd^f@362|~tDRT_q z@@^OyDcM*XV17d+%>gL8+Gn`+AqPR(Sebb+xz#YIK zcR#@(1OmD9|Ibl-*SP!OKQN(oBJkdbgI5R?ZdSk`9w<(dKYsx~L5kpk;rsQBXD(i;7Jjn89}Ojl?MRCWr|s|$eNFmT?z0u+TH-Qf^uYoehb z;3h7j?lX5BU{4dfT}}vy8v+L?{pxwbk$_tNYqOzM)7?tD$Dv~NtMaDDXd#Eco7-lH z)B{@a(1dc1#wW&&518wbMos=Ne~9Ml^xg{LzWE;Q<@Z|RGPx!6rTJrW>y89--&a0L z&mXqDmI=9H9>E@&Yryb?p%(nSmYPslF=@>xb9$@IHHkG%lZJN9D2(chd&KIX>%y1K z1)iy!dM=;hPnsC1M?Sm}TWRn)E-;m{AR5rEpX~H*@ZKh zwsy=YkKJ%1*=4QmMJI)okDg1FGObfQVWtb7`pJE9J&SHa8`c9TNnI1yD20K7FBo4* z_uzh!3Q>s6LZz3toi^6P_c=d4`B^=gc??U}wa57uSEXW@du z!`YEH;}sp<#wjMHzg-)mZi$%WIx<9h+R&WN1cRNE&8J^#{uy(_A*EiAqpZ@|c(T3i zXbLLwD3x;nglbD$zg{sM69moxgP%IbNdMr;&#COE?2et<7GPJK@rV%!~UZgeZ7yhbN z$2f*LEv9<4Bf3vY45o6s5Zk3Y6 z*B@{vt6fnI0ndOn!R z^7(t0{(RrBWkzvXQJePmy?he_wd?NKr6>x4+o=+QH!j)TXQb5(jjtbt*I}BkFr~E$ zorr?xZn7j3Ms#w*7#^O!iM$rb71e*kC}l%74kg(dXOUmvo;dG2@n`~((eBc)p87rF zKHkx~r3Lw1xf9y)k@%>Hd-8sfx{IzTQ1R?^s_U7|tl9fiDwZXUpBhXp7AuOW5n7i5 zT8Rh{ry^q+m!0uPD7j84h+c825NtYcZ+?7OG|q+!pg4&1;?-cV5%C=T)gFB9Cn(?!`ygr zR9xG%!Gk}CUM@2Q7%A959R5m$mT?Q^g77hj{XPdj%qSf@tPn%_6B?Y?@H&qW8U zh(a~*NAV1GgLDE^e!p2&NnR2+_CXkg?5K^qLnLhR~md|-;cFYwoedYE_)w${gQG%m5^_UW*c8QskdjQhC!A2 zo@I%{jMVcvi;_8|&|CrrS270P5a9bSA8E>*)!0u|KPas>g>;DoqMq_bDwvXJ#4Jj~ zH7m*Pd55-tv7f#%Btj!UfakJdI98S}!w?wGzFb8we?c^1hTK;-%l0h;=a~iy7G{!r zs+)@FE1ilA52#We_jRan_H|@<$7Gqw3w*4opnh|4wh->MWVe(YZHwlRj;Hf_ggNE?&C5v~J@6 z#ZrepyYps}?j0sI9n~q_UUMHR^Q_xMWRtJPqc`5(St}^M=tG!^&)J&QT~JK+Y*Ouq z*VtXb0v8>#{D7ra_2;$0wKreVY>n90a!;ai$5+<6Z<{TpWY#{-F&hk;>=p0Z$e36~ z>Ya=mD!nA&J{@w6?xWvAHu%ZolZBuAK6G`Hx-ECfcdG_bShRLnRbw14znP3Hxg)s5 z#luAm;HpKxq%Km!HmV}0P&3Ye;2WqYJOdd2)-xR+~ 
zDYY`b+Qoyz2Jf!#$1rcz@R85WpR^fV6Ix8pI*Ba!==erP2yM2wvDqoO5&4W z_Bq-R)XG?Kh30_%?ZBt?tjdu_F3s}wjaTcVZa2%5S-h1t&Zv~7FtK=E#-X7zc=lG{ zAW1#N$?`i*%F8)~$*rk`ApQ`jF{RSImm(~_k@2QXjNZ8=gmn$?PaU6&I7+DbBlb!g zi{@)8J90a^uNSBtGSwAtA7{-`-*_6`rRgE_;#MR-MkT>N;ojwrm%X_sg4mN2l=-8^ zmrfG-KBBzW(;4KhO(Slw#5R$Xr=rB37r(IS{Ze+>$z|fEyt$nE$y~rB(J4%b+b`P? zJ)ah(Udxcy@}=d9mE_bbrxjhk7S<9L?wHtUOht4palUS7?L8R7b6#t z8ELI2f}KnHTRoBsnN{w-cdhgnSOmX?&Xt_LLNpv>AfrOf;E2qxlNWH>LsCu5i;9` zrgjdJTMgQRF_B?Rf!f~~YDG}*2puh$@QRL0dYi?|aev7qm9#-lvFI|*&j25nnO8{# zW*!8YK*Nno!cNJoKD;nEZeg-H;%IO2v+LcvG#^=LYi7_=jSkPHh!r-&e9;ambKez~ z`LjO-dqqiIlPK9WRUNMB68lK-t%FFUNT6iJC+xV!q2rgvr5o}?EJJ!gj{W@G51P0@ zh9m+*!B+y*rqu`2UVdndf!=iteBI@849VWPpafB@x(@SeqP-(s*k`Stv`W&@(Z zGGwxeLG9ti+%xW{G`|gK={6G=xRB;23;DnUpxJM}r8)T~JfLrto6}HgBWZaPeKwVh zoIy!6;R%xZBKi23lZg7Fz~qCVa~TMkjrrW@gH`PJmMF8{euGy& z*7@J+oy#E2vt;%icPRaiZFSXn9i^CJvca2TLDqTb_zAvFdu*awWXoz+#RfKKc>E+j zl4bH@#$3GG7mDy=O)!c~8(sPFO61d{_WAl4x|Y_v#nJhmbQ8GgUTqJFOvrzxOy68bsgyMHXElZHbFqk1PgPVb*`MA@ z3McqhcXmPW*|4h<&pU+ORKwc@c`5(UFw|4>rSxeAz5DzwpUYHuWB5m+OQpbsqIB=Q5$OU_GTQ0rRr+96P;fW?}NYb2OYl=dN($WkCw&&w=+F zIiv?Lt`D@0DbD3S&Xa2yp2K5)aGkz>PF_7@v}iBZBWCgF!v1 zEanSqPn6m7jb^L)3yJmN9Z2xS+QxKc&bpG(^4BvT#kfDi)bF{UXcT!>?U1uwb)VX| zb@(fPTe(Uht=!Sm!xOG`ll|3ccov@hFs8?OX+LB8t+gY)s*i8>v^cgn+C{2O`9#yo zy0vyA)L%WWJkzMHI%56OJtx(?C?M|wW#OpkvE^ImOtu0Sy-(a(79ya6KZgxppjUQ< zSfH#h9eFUaJebU7-h6$Smc9(|_uu|8#W5z{o%tEBkpVZ#Ji;}JiF!3rtYvO(Zz)LE z64>UFMDV$} z2!uxqQdBFHUwh!DdPI6#a595-TcAa;a4L(@-f=2Jy!uQ;Hlr>4^_;ARsnZJ%#_jqM zZc1X5&TAKw?yN`mzc}W!X2pcgHsri=O+e-)B@MCAJ|H>YRlAga0j3_w!=lEs)$B!( z+%{%tb!5A{R`XQxqIOO#@kYr9getSC;MFnzJD}TdC{_5y1^mQTt>LAjl5F#n98^S; zm2de@MIIygrmxRJ-g3|DHW<7g2pSYNUy~qBrWaWyH=?zaX$?&C_8%iwrzM&W>1m36 zMQ|oh1d(Fl=W9&oB-KCoXuCQ|YnZemV)~;4IHuH{wJ-d1FgEzRAAe}lz`*J6{^^|8 zN(E2EFenkxS&$KUIoEZD$`h6MRTLHuMx~CA6676|gg8aCpTIH>@Y#V;G)~vA^YAE8 z$_%A45tjtgc^>B;T?lvb&#sm{YxI%1GBD>vU$gh4uXJx{SO*kU2AYbWwJ!TPvJD>V z?EgOf_Fc!F_cG;$_xT8VJbNP(OH)}M>3_ED)@tntfBfQ6?DJ|{wNtUTP2!hCuADH_ 
zey7bHMZn?x10|)gl6WuPs(>SlXp#|Exi}u-VA}& ztw~7>_ryXiuq>PuCrdX6=ZoJic2INl7Nt$#Z;FlaHV}xoZ!M-L#Su;|l1;Dr55AD~ z$Z(W?rl)n1D#m$bLG>1;Jr`?+FX#tf=W|}|yaD#ff~p8hCvCbS8?AU241Smp4{_t|ihJWh`YB4vqd?^;jfWMhrJ)Xl(SV_Hy3{ zy^qOfdqYq;QTHMFegtFb0AqB|fW#X%a{`L`n0F9uSOiJeFrE`V^CH_ zgE_#(hd01isxPYRsg$+%&r*E)`VO(Y-d2MslB`6s7e@-Om4snmbQoApf!AxvU zgv(6H#qky0-EKAACd*SpemK0NV1K=7w+Omh0R5*a@*k_CqW`Try4&{PQn69RnD1=c zhyUte?5l|arL2EyqPV)&{*vgy65;-$^{-MGuI%_{A*C+X>c`d11#&7kso-J zNeWB9SUEA)1uyJ9Z|vpg-deN_7k{UP*RXo`Wt0;)|2p@mX!p&&bH}1bmTxmhL#-~l zkK7FtCNX+Drjq6Q`g1KOabEn?5WV2|mQHF#O?@o&^+XY@V7Sh55&LY&)mDxIC5d5- z1Zd6k1Abs*jVko zy2N9_v$a=rKqJo~(awuffs(;>pMv~`=P*dxP`Gbj+h+UP>Qk8?ZsQOs%i)-Mp_HS#@(rY)u-8T z;)KuWUCU!0LcSMXNR~N+kML?VTy_5P0@2%~lBHj?R+$|V1M8%E5WB?)MZheb*JSUx zw+78?Fkro^BHfB+Z)HivJoKxFc=BVONLX-fn<$2kB{-Ei6+gEbE=f2|pgkY2`kATa zQuf_^$%4YME=G?4=ZghR8DSKaDZbp=Rg_-QqgD*dH{;1@UpS^q8d^SkLQ|}@pv6iO z%v9)qu}^YsGVn$>I42<0Yn$jZvTJ&UEuH2FEscN@?MTh@8>*uH+I6ur$N>*S(^P93 zv)WKzU5;>tFe?G0+!C-iA=3*LGeeQj`85yApK8gf%hocTf`tp9Sy3j^%n!xhg1;vj zcH}&%cac%fEa2QokDrZDkQ{Z$Ir=%}A?a!vX~@IN#r{b`h6=egk45__S}&oEFUNGC zRaNsoG=-RC8XK$jgq~auJpvId58xqesZ_r2H+ZQ!)$8~h^RLl4-jCB?8Aj*%Kjw~& zw19nO;=b5U!<&bcp)%(Sdzc1ws_BX(WpW{Um2v!DwHBR}%EP2}uIm@`@4aU{!(n&s zj{2B_%k&)%ejSqJS0@#vb^TBmGK`gcV{ z(@iUPlfgmlv8v|Qsj7h1fGSsGZljHpF}GgnULKVtZoM&~oTd_d!U$W#iw-GlV-au} z+o~HcRcr~K%sTy}CsA-Z>qXRIX9aX;_BktaEYOhQKwBH4jg8U9SFM0eRu_Ry1a>yY z4n{!t9_Voz$t#1@(02B?4m$+K2?yzLh&Y&A83X+%2_#bIB1qA}zz*vI0uGYwoKSJb zUL@rb15oA%;86!69}H3WmbKJWvP(0fRuGT43-6;2$vVhK=#x zhwR%4;B1FB0s31AC@J`#9}p6O0CsPffc9YAK<{YhE(o&+L%_KY!N6c#Uk&&3dmIFe zJEC$B2IB^LT7SThD8!%mfKDS4(DT7(0YMMJkhm7sALEcvz+wIg19KnZ55a@mP> z#vji@kcV^thr$oxgTrt;A^&(5j_cq40Rzn95DZZCAs8-L{rN06@{q3JJh4Pax$pSBA| z!omLnJ8aKLTvPN<8py+QiTW2?L%|Q}4+6#QlR1bF0);{UTtVoceYL`0sUwUuq$fE72+<$fb9T$MsNf-91VC66ErsuHw0n= zg&P_en?Mjo5MwaX04_lK--qmG&-PfrBkp)72psTpq-<>B@)8G*{T(=?_iJO+?jgY6 zr}_Xl{m-#J-1ezM2LcrgaOVX{f9;l0vcckfE6@i2EP^wPZVbs;=mvY@g1{}fWk$D#UvmIQHU5ADGV3j1HKVRF$f%v6ct5? 
zfRSKsF*uw@SRBa>g`vbCA_6{ARQ6{3eqhNiZt?@ zjmLBJc+U5Ep3nQ&%l0#~=bkld*1Ff)GuL(9_mW;kQko6S&W%Cex%zo^tMJRSkv{$+`6*=h|>%MPxQZ5^Ey)jjJCW5^>ZnWR(HbWY) z2QROKs1l}D*{}VI-s%<5(lZ0a*MehCpplxX&A+Gc)iAcJo7lr2QI4EpBWwKB z`(5ZPCQ;0t!_4y$m-PYrI8Y`5g}KQ+HvVQP=7APbqXdW2){@ZG+Er{rSdwf(L`xS) zOL=MWB%E9a9D64}ix8)|QrLb3X&Tiz`C0A}`z2Sis*;ra=S$Z-k1u?o9tW|GR3#in zwWVzs7LMkB_z?I8b3FK`0Z{jL20X4}Vs*WBv2b()aRI0AfjI72xH`GJm|3`jc)lAX zoE+VN^R6J+00IT>TA16Ih&y?K^f&xZM0EX<3`@pXIG0-Y5PG)KrZXiA2 zW|Gn%4s{DJH;@5{L&C}4$wkfC#LNN&n-3DMU=ZZGdSPK;RseUrHmj?HK~R_(gF{`N z8^m{A0{9Bt^2b*m5dV*_5D*yFqI39=AI)k@jx7s79KWc z7WZVt|D1hXunzs>{QP-NW~O+mt4$Gw4!7{OjNsFP=nu(!&_VA6WC@W}!{r{SAcO=l zct|2k-^63qG*@qfmkFgcle&rdIMDP1`e$5~?sip4snyviJM_IW$d|eF%O0Kisr9-K zQ&xL^9~$9QE&`F>xM{;b;>ppzxx3Y_k54erd5l1U3x7i!zHQOc5=rJn5@GzJMJO%p zg=}}#(JuQV^QLLm*D4II@2~vhX?Z2+P~b46CZK{38HiUAlGLiYNV%?%`VW(txkA8P zL^#UsNSk@GqMB6Z(&-oTqL6a*pU^K@<@usow5ZFa z!>t}Ojg6f=RfOZbnPxP&&X8jt&Lwq|bT|An0vuXWCx5fcjmT4e>nu300;`n>q;NUY z0=y4TY*s(@TAm$LshygFS&};LE{1t24j)!y#c&cos&Gn?f~VJUNVAULDWllg(RsgS zefoN8p6a36rBJWWxyAv*_HdS1RF=-X{dOrw3lZEAoo)tYRco>aT(tv==;~sr@*W88 zGcsKD+|w^qA@+s|ol97Zn33Dt_O9^RcI+&lc+ax~o%&yg@E@lQ?4D~16cCF9+>#^Y zwvbvaC@fHr92~^DkveRCd5n9t+{SP7wy%TrX?7d&LGULH)lW(2LE+X-!GZzWaqQ+8 zLH-kq{v=RFRMLl;b~mlvjo|!(`Bm!WPsAjiqWdA9bv_qyU^#dlIG0^*|#I$$Dz7C!{7;E=R{yN#-@10?Tk1F zf(v>ii;I|o$S#XuBZ~A&On4FwLkv6#@A25y8E!nlX9P(p5bBJgkEGRx?Tm@_NJbPR zq+Nj%elEZUEpQ)~FjuTNj_5f~n)rKMa$PiU2{9GEn}}T!;c>VLqNgJaFGy}bF3yoq z5jwo(M%IXC@c8T*>IjGj(@8favV$)N#qs*3mA5@ZRvK2E0&qqYc6J z?$kGBei<_<)+|TCpp-&Ggn)!PeB~n1jmmfQ`7~tWWBK0i>Z|Wo-}z|qk&{w}_xbr);zrjY z^{19YbPIY5y;Y`FidC}Js{VT@Z#a!lxbKnXM=7k$EMI`B3U|FD;X)723aakH-t3ZF}*K+HGQ3@r`lQC{kTW@L zcTnAuzABPGr>uOZkZn}>`D}?$nc0gfZL2(`sv!b{1a>`Mu{!-aHBWJ61rBWvqYBl% z9AVY$vh3%tWnSy~_qUCJJ{m`gWwgJ@iO6|vl4X*&Xm+bOzGU=vo^GB%H=kMU{fGnM zWj%(<@JzV`x#+&TYLnx0>c)-3)x%hJYt{@rC6K0`+{zgpol@!d&-gn8dwuh~Gfr7B zg(9^hGl^!2j97PA)!4RJsdbojkhLAEH!IINK6a+t8JPL?gxD^dRSgx)uGN$kvCVtD z-tiMiH>jK?YEo(%KDu#4d(LnUMimP63T?S@Jn81a)D+}5fw%Q(v3}?vY8jRB0prUt 
zxx)EOogwBS?4dkNEaH9QaW)c{(ff<_6fW`X=Il{MUd}n=8q>+kx?5+1cRxh0dvrQ< z&Q1zVUDlto;3(k4%S84h6(+8e4PzRHrB-OgYgN=G9GmXh?iC8(5e^rQ7A9!*Xf5&M zI5D|Ixb!%i{qkXd{p`cJG7=wBDf%aj=Sb2>nyA86DbgRO83ch zNlnPym3knRDy5LB&8kziA|0h01u76Cq9!MNKx1aFE~70VBe8a!8#J|s4Uo+qq#v#duS7@SyK#3Q8! zy$TwYOzkuXHt_b%9CJsFeycH8ndFF(C}+Futq-mYXM{HV<}OE3i*O7@ z27UN0tuM{Cmvb30ly7GhiYB}6~_Q80#D$FW2q+WvKD*3cRxdVB)XqZL1 zT6$BaOlCOhG-}r8BZ?(tCqb1o7JQ7%^AUDsmKXnR9M-*b~38t$d=KoE5G2x*^9x!a&7)N=*#)< za2$qrVYn24#&Qx8;wG*Z<{$uj!T_wnpWyXpEd3p`tGT(jo4KjGSXkV1a&iN4D1ZRI z^d0s7dDYKU>*uSa>@6HX9BKgWbux2zfPsDv6%hCjoUZ_Ybntf&4!#E608alINB?Lq z46pxVr7G98{lOActUxfn0q_N&fxpxPtNRBY|KIB2`njH;Vg2`NU~I z1_KSUF?WUa^}DNo?jnp;{Ms}L6E_okC#&yN$@K?Ex@+QMC+BGCbWKuV2f)w)!(str z9vr_85XK+w{fUtL;9)wAy<^Q23 zUM`qNetaOju$lUQe<0kz82(n%UspmnIRSwIHkAQ?L->Fx{?ov~n)~bU!<_n;AyJI-oy$Z$}k2ATqzE)WDGqveh3)E#={2&f%*A(0glSc3lLyAU|!jni8)%? zTL5?E5OXzy{T%rrz@@Mo!VcKDfL}I;go(4Pg^iW98wM{hm4Ml0;h+hWgArDf>%rsV z118Y-A|MEu_j?gp5Fe~o53B330`c$w_PZut-;cPsxF8tUEd+EJ3of8tKoelJ`OgOk z%<#h#-(?vvh?|EKxYHjZ4VaG)Xxl%@I6u?mGA#ppaveh9;~0O_^`5lJq1f3@?ld|N zd?yC84{~EXhN0aW&6=r|HYPI(9%JJUfn>UDHZ@K7C9+`{7nM$+hx)8+0=Tet+ z#r0HQCLupZ3#ZOOJ9ve^1Yf2yfSD1rH>6nR##@T<@&3?^>yXg#jB6|Z@u2x~{+5Mp z&fNs2x>v2f(WL`pF^lc}(b!!nQ#fD`qec(!+4w!Y2K>~DW|zJcA!=N2>zZcY(Uc8| zD1s7|NiKn?`^N5m%NBYbSJ-At{S$;%b6*&HGq(mf1kT5x=Bon(TNcx;5Sr}@gUkL9 zwmM^xJ1#ri$Q^wz0vhT z3Or);?JVlT#DsUPWv^fKkpE#R?uJvln`9R5r3>Ty{d4fm zI_b|`j)}>+4|cBO4~R$Q*N24=wdADwD&A^LMdEtlN8k}YO;T4<&u=H- zc%my;#FYG|E5b!w$Dmc)9b5MnLw|BYxw_4$e#d1}k)~_D^<%bFd6~r?`U_+9K^A(i zlZ0jXy!>~b#w$uh$;CX?#t(-|9fzb!8XE4YYb5)AoeR`y;U(5kpx5>$i>Eb>Z>6jv z20xJIoHu`<6NFRGq;dNVBc1Sb%{`6VqZnK-dD3;wR+!?7D&tYUKvMKLyVRA`^2BqB z(j)lfUH5ksh=|_`(yB_)eTx;7cRdF=I3i(869s*+7yEd|EcWq;Ck`bGOe-)$jnf3mh z1R@D4C$ZVs%JLO0>18ngY$-BZ%MBfxFJ_n6o81=26Co!e-0xZm%l$6eOV8nIdPcpA z`j&j(`(>?dAhd|#O!7@HkzW|t-BPkK-R8Z3*L{`RpR1f#^XXIGvjUI9bhfyM#5+1Z zXb$HFhvx04wdilTopRSVRC5lk6k^`6-E?$855~mYk7!5W*)}yVHYwY!<_pC z!Q>e4BitrDw~UEY%i0tCL6>%!gH2g{RGM_y~bDCZBDTmAj*vhQqm+aWKC!O`N{3>;*t 
zc2GaP>KWoVDG}g3EdiR}Wn;XDms%gi7xKxTvZCQh4d1w7Atx3RnF$|yz+{RSt#u=b zo*U@qGxz}z+%nn`VK?4BmLt-zu^@=ZKC&<_`CXClTz(C>%s)!Gn7~Yb%V~L0_!HwK zj>lE6KqyynLbzw|!cQ^B#)#Jm3KEr(PLY_EUYDyPJi~d6k9I4lUSf&Hepniht<_sp;u-n~`pCLi zswl50uh=|F3vwf(5Ak>-&sM?wV{hut)CE$*^V)-`I`Dy&@rfs{QQ!mWCsZSx))?w= z$)a}8v_$8TW)UvY+=vfZRJIa=ZIlt!ur1h}$5jH_J9OChF+)GXO_Riio`I~q*DA4> z?S%SlG%}Tr1gVCZ8eF-j@}K6eq`cx@cfnZpp&<-^j?rkyNsE{FOxb`53x}9p@ow2o zX=w}=*$7qpJ-lO#H@p0Zvh6IAKY*GB}$*mRnG#~-dsH$cN(PD}b9~nq6ghMvd$0Z4; ztUG(2MTzp zh)7X>ge7=F=n0lol-F^_a!;rLf`pDAXMRZY+(xy zUw@>uJ3eVzdW3Bd_x1ChYAT<`Qm?Quu71#KdLOJ9Mrj&Pnq+!j{wSvL zd5?l1^nD42xf5dg}lcY?6r&2XhXl(VBd34&L{Ow{lHspoe5XY15 z*!5N?$9)!sl8dv3y4L#s5A_e~f)cSmp7J)e=n-Wq=s%dxPkQUOvKz`u!;Uz=_T>;k zX@$77!V%8_RaT$sfM1xUd^l<-wPJ?JskDq$#Hic<-XXPqNu$;8Y~r7>3jhZGBU+UF z4``7KhLr#uukNJbXmcH@;ravF|AY|%Fz_cvx@W`Tqfn{yvP?SQLPa|9^s_*MR?LD0*%9;fNnl6wJ%_ z7Zl~=0l@HoM4}pZ6~|O4u3p!k!xeC~N=AIN6b~bxx+#(iw(N{~Cdu6;u_dPc>oETC}JG4SSrEyk<9wqOJZg`2MePlu~gCR8JzwV4rB$gVT48Ump)_ z`L6~`C7PyR?nZplOE!2Sr~)tBxI^}f0i$$RtTgR|sWl#Y*mc^rrs zgfbqAWjzV$sG0)zN{r=yrX0TE_c&^3@b1ZhGnPladj0b2Y~c<~OHBH)0UZCp?TqXu zjia6QxU^H`v0@|3@3$r6*zGyBbb8%40`4T(BOhkvCni0A zJRZ@xOIHSR$(gHB#HZrlz*TcOj>t@xh~}(}d=Y&m{fTVcYljDdP7{H?#}wmi+nE#6 z{4L>SXh?@G4n_142YeZyM+rtAN{Rd}k+%kuQBy zZ~N_5s(VvzpNA&5xrT;|2c%+JihBl}V@5yuAWU8)fNlhWLu@S2z_Yc*Mh;U`Cm39Z%ftskX5hr7xUcKmuaI_&qdzLkbLY}v(%7Sm*H&n+1&0_6uS!Qo#6xf9VF$X z+CScXIv5p0en+T_Bs-{mL`R^zZDnv9S)y*P`_oflZp|lk82;0PpMu|ftHv_$^K@e{ zJ4xN`BH@3q)-n~iJNnf%El@;3>}vXn-mzsI+f9b@>8QHYXX)hbkxx(`I?uIw=Qf`! 
zYufqHZhIbGxQj-KT)krzsVP5W{qS~uv91MI#>niiAyRNPu+d z7@;1aU(^e3_;RkwugGC^^gF|4^!A&h9fNpi)!AH_^CR|!N-^RB-v=pL&0tM`3S7rdpKx zox9?I-FZ=zUhhz(jSp(D69Zm6n80U%DE?adpVdT+f=@iZu($gfOJt9T)W zFkmMz=Q6k!gsdzkiIiDRY_}eirmHi`^ti5}_;pG$WW*HllgZs(^rOpPJ$tvC`$Vo# z0I!F2$ryd6%0`mUAncJUZ%+&rMj0eqMjnT}$*6&8TIVXxU4)csg@R}8u}v_#j~Rk2D+h{ad54~Af^ z=*&CID}FtN)N!6bUWW+2L+_lUA|;c<1`*sfrxlW6rhl)htE+gk75ZGOSWWjRfvLtR z_}ynV)lDYL#(;aI`~pb%gc)j^{_r_cA?HvNuQ_9{hXf%kyqw#<;0}$h=6G|+tFm{b z<=%2wO|@^FcTz~oS%51|cwNc~upUHU9XGhDv+8mRUR3~^L+Rp`Lc^;Vmm}0Z->9WW z{e14GSh=a37vVk9ng$}_S@f*Z^B)QiQ|0_KzIsAdr^Gzc74{SQI;mVoDHU)P6Kmvk z5~q?U7zf?$A>GQQ_#(BhV*Tl9yFWZ!ugI~|$fbUmStx6dM_Co2_$ep>@y;{$yll3; z+7ac+4c z9hJ~GA4#}Z7<%u5-^Aj~uznh&rHM4ddyTW&I0H9@+=M(p952Za5w*3kP!c%)Aah!N zWH?tlHjp+`Tk>|`qsP&kn_WpYog7tq9!^!8kv=;fPUD5p2RkLH<@@Qp0^+DXf$dp{ zIVo)~hMsSqrB?&FDwpAarAs7ZR# z6l1Ce>!UU(aWbOSF7%v{y2=OeBM(l9X+l>^3X1O@4SZwz@?qLLc?=VOxuSB|A?t}z z3r|3IV@axj0ZcCj?FGm)V+L$-4c<)EaEii#v_tqW_v}r7$IHP)@-Q=Tpz8xW; zEPy)^S=_@GUYx4(HPF6glU}_gC@@jwD}J$|_=Lg>A9M|loNFVj0ASQ773S!FsW>Ca zKY6@t7_<#Qr{Ftfx5pi13omqDemr%^c!!TxeH)tbjts4usW#a7v$^uHHW!Iv%-6%G ziYTPV?lOe*A9udEcL`=`r&f6Sa*|L`?Cre83K4(2G*C6d|Gs=w$$D6X=n0B-+}yK{ zI1=H!s2k>lS4&M@RgBHN+(M~onPlWbPx!NKc15^_l&;|XRETl@dyC(lT{`ejgOMXTq|Hbz|f*f2BSpWZ*59C_zzYdgq zzYeGg#=v29{Mi3hA1qk%3nuzK4EP_2>A!mU8Wr(Fzegfqu@VpvE8*w<0~A33F!m3z z5;jVJPtg({j6XpU5EK0yDEb~8{RtHP9I5*g6hVKEmO!8| zT=fr0ufIe~G@Y#VG;WFs~?jh*Eh@$kxe<1kHfLtR|=JiNhRg>jh3Udc;ck7)PuDr}rK@OlO+ti$Cs!7<}86UI{Z8 z3v)N#{Cex)zTO8mP4-KMLt^SHJL%pnkv$Jbbq9?Nr%xX%`_%XIcH&!mAtOqccizWt zIvLMHUgmivn-^6FXjVRs&krbjWGIKFDJRo1F8y_dk+C!ryX1MHl5MW;=Xis$k=wOh zMUTaw$CVh5#!@aoba`tp;#>P$L`ta4#h7#9sXoZ(Sa9zl!l7|EJeASy>K zPe3Hf31CeNA;<RlUR#2t>ybQ>`$i3r=c{QlCDy zr;Ss8C*SCQyX=sen?Q}!pp4;VoIE3WGS-g5;8Lqdj%BW#(fud#3<8TXj2Fh3gJW`D ze1`jQMMY%1CeLxOnMj%Wo_aCF>%^RLjcUIuES48^_2!Fjp!ibMM^}o~C5!j6SD3mA z4@|!K(lFup6^`%%ZJH8h_LGBW=?zbn&foP)jv|8VUQO=vn)mzgR~=1)U#EP z#veuSC4f($!6--cOJS0P@k!rA^4CnCRn~w6tE6qFByz{kKuI5|!tOl)ziVQ02U#{f 
z8`pWxvpJv=|L#$`PHU*jnQ};ElFXjOsY}-9)pM2IkJ9`f_ZVt8tKk!Sd!rkso1Y50 z3DwcG9VJ8@XN9+RTZpAvTVbEkRE&<}Y@~>&Qq80xsm(%NTcA+;v9O$%oX5EnTWP5+ zeO%it>_aFqh>s>cFYG87Dou+e-1*f}&)v2pJnC)jB8(<|!~LsInXux@_NxL~Q+pMy zSEZ}{&D5$CFq-BBcpt_X&D!_S9?a~_upJ3`ju?^|-%CL;Ik`FVDkjyvMN3)4^lA@| zdh6@r3r;ouO3Y7iX90Aam`{-nk%the5mO&IJ$eQ9sRc&^pxtB=XBEAW{AY&x2s<%H_bYP z%S+Iol*kWYK_5@qrl_|x6hnw+Ad@u%LrE!FyJ3kL_E{>$1&`mRpn4>c*%M}Ayq;y+ zV70BzW5=&Vw)1iyd9k(VQ!=Jw<^Pu8NErV}Z|19zo_}$TV`96$CP7Km#Iw&)f-kn% z#%m(xh7u6wq!7_CND?&Q2=Jj|P6TUYlrF6pT*aWgf8Tr-(TGz;Ce?sDWt+USy7dy(7GCr$Icw*#**nuoyvzFviKhE2UFe} z%8|WW^}3W+aD-yloB&nkV?tqLEC-c5k9)uS&bPDCRuyvmMLyK9hZA!TZsoQSua=Li z8sYYhAluC$8h!}BbiMdAI5@dR=}SLV+j4v~Nba=5afEdX;u+d?a=uX8Fro*ojv`Qg zxA=hl^LrNq0>Md)mCbnf)$}7@vo?~H($am~%X1|u{l#A8{xB|E>gfA@8X6qa7tjnT zc2}+YM47d&m#1G4^-7598uagz(hENyo1S543Ws2WY!3Vrn$VkBhQiPF=||*F%EEFA z9v|{V+d{GYX-}(0ANk{M5^6Os_KnZnv%T-(V#?%xr>4pOzNw=J{+d5xfcb^-?||nz z%l!ul|KAXg^xqHpA#sOLM*_}{m`C35`0 zC0u~W@Z;a?6D9Q%Hy|P&?jxLx^vnX6rl8}cX!HN#DeJTL6XeRFly!7+L?#4)6vr6=5`ZSbV^p0|jD zF-0q93#Z+Xmaz1+=lugah2(>~)g8nfZ#CDdD7sVMS~cxrUEb-(<+WdHfzRsj?4Zp; z49{tlVB8AL=6roF=4DLo4NM-=#*>Wpw;SROQNoHP`$Tz1+0KpFPsTf8dj>&}ff3KL zlQB|VDES=O%`khyCwEE42b+$pRbxKZN0 zz3*XxS{+hkMUcpP(B~{>o6%3%PpH|A5WKA|{LE(Y;7v!7{^Fvx=)gL7pmCX5kZ3eR zczU6#hG5vjHTBjcqTY&@R#z6u0kmx2*&R|DA&V7RPDrYO6{bEIPrEFYlGz_Fqho-H z_PBh5?4mc;WN-6iC4Poaa5RV!awWkWy^~w*(W5$8_V_^!N`61FWYMJsQ<*=rJf%R^ zH`IKklWp9jMh~X9C>YfeMdgH=-y&7cUSsX4Wck6bvM{W&a6Fr7iFBcO?SC)9dqjF5 zLs0Euzg54o*HhaQtO?Gd3(2-h$~PzYN-2CVmr5}ql?fwRoGm&pN!B<%S>7HTaAH_Q z(pwlkvf#eMwL1|)^u!~FUy}trEnM~Ax7{t^Yw2f|>Nj3Od*-yjXTM!DvbS(x6eLbHPcEv49Z*tm!x(oGXC;7t z;kGm43HrY4g2cjyM=lgicOq8wQGMjK_vmIf$XLaY-HV|<284R?@5sHPli#hte}p^W zg{MFqf=dn9YGYfVa;Ek|V(MAkF8p+XzR;61!n{o*CZ!gGFBOk78v6#Xmgecc3PuHy za^&+j_%F2ERBw0O>$9AjG+u`a*x6gJ6`A`y15do4Po*;1_~yp@LGM{BwCQC`5l=%O)=0io#(~7y6@wKp7r3wdQ^qYZeqCG4K^60BM-Q2wTa@6jn&8wGccJ{WPgY_q#lIM30 zD96yN4&NAg(XxwVM6B^G=$gfuCT0pB0?5_|0X%qcainr4W867j{GmA3j0qWeJ2KK)RizQAJ;M6BlwOa(2Oi=rO=$kz~{Iu~hrMo=b 
z+*W`2(g1OGsMlDAb|(re{G(t}%&xbccfFmh_6OZ;CqHQV$c^nC-3{zJY`U)gshv!EK)i;&b1lbI*&sy@5hWniL=E=N~O4 z9do2Os9bAgz~{==Z0XJ`+1RXZ!`G>s0-b?h6n_QZ&+6RB=>I8v7m z54$Fg`l~OycWT6T1>e=|?S!XPG5W-wOrlm4NQUjI6xOYZ*1B#frHIU(X|bO57de?L zvr67MxMMv*V9xbuGMK0ln{6i3gHz<$2W}<`v&{#^W+pob{;uBMwqHmeGA){R`%Gwa zB)Qcw%r#%`W;)#ESv!D^YvP7>Kh?rf@->!nUgGA9@3Xp@Y)ltcifQpyw%6J0@r32! z`S~G{`c=`Fy&F0$K%gLY>ZDgUMTb?Rjw8kuwa#AHY**B1e15wu`n`kS8y|ba3686U z$yWI+y+y>%gon3oH_PIQs@K|4ie?Y`FswX{bG=|kWq4%PMha-1l=a9nh`Wffk0Lbj zU2f>+4C8GViyv619ay{wZ7!qY8ka!912!Q(y(&1Qn=}$a<}X6AzQ*Oi&H%(kuIwjuleLnkg1p9T%z+s_&8u<@ zv8EvPjlMCGx`9hsP(k8K`mGT6v-C(h?x^AYyPZdGhm5ie(lgS8`I-c$A&q?gK3DG! zFJ`a`F3A@jmf`=7&$)h4ByHS)3|}S-2WM*&R~uJ0H(&uQGmuX1;$-e_1}uSQaxgJd zRs(5TxVYLlIRd%->^vZSCUJKgAP~yMB@Kn@GlNv!O$SK*1%S{6td0h9 zxj{_VS?@rN*9M>pN!Y?xCdoTooLtb)e-E}hmzl;2|lGn?{ z!VIZ2JxGX^S8cnLwJ7g8x*!y<8SrxLILRftBr>X0N;PNL4Z7$zuBPtzn6nT`GBbH zul4bA@%(1vOdY zTuguzbeHS@6hO_!#{!sZz!G%W!WMwaT=UQCbr#n@DS(X1Ndl`51o_ROCY(Hcd}cgm yz>brj#}w!wC!eJ$)Evyk#cd&s@t=!)r**Dwz%T21mIDL{%!@%!FQqJv@&5oEIg}m% literal 0 HcmV?d00001 diff --git a/requirements/base.in b/requirements/base.in index 736dd1324e..2a0558154a 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -14,3 +14,4 @@ langdetect # (Trevor): This is a simple hello world package that is used to track # download count for this package using scarf. 
https://packages.unstructured.io/scarf.tgz +numpy \ No newline at end of file diff --git a/requirements/base.txt b/requirements/base.txt index 62be4fd36e..0cbe2afbf1 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -36,6 +36,10 @@ mypy-extensions==1.0.0 # via typing-inspect nltk==3.8.1 # via -r requirements/base.in +numpy==1.24.4 + # via + # -c requirements/constraints.in + # -r requirements/base.in packaging==23.1 # via marshmallow python-iso639==2023.6.15 diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index b121fa5298..b015ffa6f9 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -6,6 +6,7 @@ # numpy==1.24.4 # via + # -c requirements/base.txt # -c requirements/constraints.in # pandas pandas==2.0.3 diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 3fff2566e5..ada01fb2a6 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -95,6 +95,7 @@ networkx==3.1 # via scikit-image numpy==1.24.4 # via + # -c requirements/base.txt # -c requirements/constraints.in # contourpy # imageio diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 6c60f99dee..aebc5d8dee 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -74,6 +74,7 @@ networkx==3.1 # via torch numpy==1.24.4 # via + # -c requirements/base.txt # -c requirements/constraints.in # contourpy # layoutparser diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index d07490c0dd..5f1a3d6517 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -8,6 +8,7 @@ et-xmlfile==1.1.0 # via openpyxl numpy==1.24.4 # via + # -c requirements/base.txt # -c requirements/constraints.in # pandas openpyxl==3.1.2 diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 3bbc556b4e..03fdd36ce2 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -50,6 +50,7 @@ 
networkx==3.1 # via torch numpy==1.24.4 # via + # -c requirements/base.txt # -c requirements/constraints.in # transformers packaging==23.1 diff --git a/requirements/ingest-delta-table.txt b/requirements/ingest-delta-table.txt index 2c83b64e9a..d3c45a04d5 100644 --- a/requirements/ingest-delta-table.txt +++ b/requirements/ingest-delta-table.txt @@ -12,6 +12,7 @@ fsspec==2023.9.1 # -r requirements/ingest-delta-table.in numpy==1.24.4 # via + # -c requirements/base.txt # -c requirements/constraints.in # pyarrow pyarrow==12.0.0 diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index c6620e6579..a2854493a0 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -41,7 +41,7 @@ idna==3.4 # yarl langchain==0.0.298 # via -r requirements/ingest-openai.in -langsmith==0.0.40 +langsmith==0.0.41 # via langchain marshmallow==3.20.1 # via @@ -59,6 +59,7 @@ numexpr==2.8.6 # via langchain numpy==1.24.4 # via + # -c requirements/base.txt # -c requirements/constraints.in # langchain # numexpr diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index 7178248231..d9540344ef 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -875,7 +875,7 @@ def test_partition_pdf_or_image_warns_with_ocr_languages(caplog): def test_partition_categorization_backup(): - text = "This is Clearly a Title." 
+ text = "This is Clearly a Title" with mock.patch.object(pdf, "_partition_pdf_or_image_local", return_value=[Text(text)]): elements = pdf.partition_pdf_or_image( "example-docs/layout-parser-paper-fast.pdf", @@ -898,7 +898,45 @@ def test_combine_numbered_list(filename): first_list_element = element break assert len(elements) < 28 - assert first_list_element.text.endswith("(Section 3)") + assert first_list_element.text.endswith( + "character recognition, and other DIA tasks (Section 3)", + ) + + +@pytest.mark.parametrize( + "filename", + ["example-docs/layout-parser-paper-fast.pdf"], +) +def test_hyperlinks(filename): + elements = pdf.partition_pdf(filename=filename, strategy="auto") + links = [ + { + "text": "8", + "url": "cite.gardner2018allennlp", + "start_index": 138, + }, + { + "text": "34", + "url": "cite.wolf2019huggingface", + "start_index": 141, + }, + { + "text": "35", + "url": "cite.wu2019detectron2", + "start_index": 168, + }, + ] + assert elements[-1].metadata.links == links + + +@pytest.mark.parametrize( + "filename", + ["example-docs/embedded-link.pdf"], +) +def test_hyperlinks_multiple_lines(filename): + elements = pdf.partition_pdf(filename=filename, strategy="auto") + assert elements[-1].metadata.links[-1]["text"] == "capturing" + assert len(elements[-1].metadata.links) == 2 def test_partition_pdf_uses_model_name(): diff --git a/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json b/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json deleted file mode 100644 index cffc024be5..0000000000 --- a/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json +++ /dev/null @@ -1,28 +0,0 @@ -[ - { - "type": "Title", - "element_id": "94efbf7307081f8f45b11a183ad99254", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z" - }, - "filetype": 
"text/html", - "page_number": 1 - }, - "text": "Mission, Vision, Values" - }, - { - "type": "NarrativeText", - "element_id": "f116dc480f737022b3eef55d2095d808", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "💡\n \n Notion Tip: A company mission provides direction and purpose, aligning actions and decisions towards a common goal. It also helps attract like-minded individuals who share the same values and vision for the company." - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 1032755b81..8b6febb9d7 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -5,7 +5,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Core Skills for Biomedical Data Scientists" }, @@ -15,7 +16,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow" }, @@ -25,7 +27,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Lisa Federer, MLIS, Data Science Training Coordinator" }, @@ -35,7 +38,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Michael F. 
Huerta, PhD, Associate Director of NLM for Program Development and NLM Coordinator of Data Science and Open Science Initiatives" }, @@ -45,7 +49,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Executive Summary" }, @@ -55,7 +60,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:" }, @@ -65,7 +71,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;" }, @@ -75,7 +82,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "2. Programming language expertise: biomedical data scientists should be fluent in at" }, @@ -85,7 +93,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "least one programming language (typically R and/or Python);" }, @@ -95,7 +104,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "3. 
Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;" }, @@ -105,7 +115,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science." }, @@ -115,7 +126,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy." }, @@ -125,7 +137,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "The report further details specific skills and expertise relevant to biomedical data scientists." }, @@ -135,7 +148,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Motivation" }, @@ -145,7 +159,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. 
In line with" }, @@ -155,7 +170,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________" }, @@ -165,7 +181,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce." }, @@ -175,7 +192,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Methodology" }, @@ -185,7 +203,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:" }, @@ -195,7 +214,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use." 
}, @@ -205,7 +225,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A." }, @@ -215,7 +236,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad." }, @@ -225,7 +247,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist." 
}, @@ -235,7 +258,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017" }, @@ -245,7 +269,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "2" } diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json index 8fe11dc463..0f1f15711f 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json @@ -5,7 +5,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "DatainBrief22 ( 2019 ) 451 –", + "url": "https://doi.org/10.1016/j.dib.2018.11.134", + "start_index": 0 + } + ] }, "text": "Data in Brief 22 (2019) 451–457" }, @@ -15,7 +22,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "", + "url": "www.sciencedirect.com/science/journal/23523409", + "start_index": -1 + } + ] }, "text": "Contents lists available at ScienceDirect" }, @@ -25,7 +39,8 @@ 
"metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Data in Brief" }, @@ -35,7 +50,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "www . elsevier . com / locate /", + "url": "www.elsevier.com/locate/dib", + "start_index": 18 + } + ] }, "text": "journal homepage: www.elsevier.com/locate/dib" }, @@ -45,7 +67,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Data Article" }, @@ -55,7 +78,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment" }, @@ -65,7 +89,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Omotayo Sanni n, Abimbola Patricia I. 
Popoola" }, @@ -75,7 +100,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa" }, @@ -85,7 +111,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "a r t i c l e i n f o" }, @@ -95,7 +122,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "a b s t r a c t" }, @@ -105,7 +133,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Article history: Received 31 August 2018 Received in revised form 17 November 2018 Accepted 27 November 2018 Available online 30 November 2018" }, @@ -115,7 +144,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Keywords: Corrosion Stainless steel Inhibitor Sulphuric acid" }, @@ -125,7 +155,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "This data article contains data related to the research article entitled “enhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product” (Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) influenced corrosion resistance of stainless steel. 
Inhibition efficiency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder influencing the redox mechan- ism reactions responsible for corrosion and surface deterioration." }, @@ -135,7 +166,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." }, @@ -145,7 +177,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Specification table" }, @@ -155,7 +188,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Subject area More specific subject area Surface science and engineering Type of data" }, @@ -165,7 +199,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Materials engineering" }, @@ -175,7 +210,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Table and figure" }, @@ -185,7 +221,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za" }, @@ -195,7 +232,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "tayo . sanni @ yahoo . com", + "url": "mailto:tayo.sanni@yahoo.com", + "start_index": 16 + } + ] }, "text": "E-mail address: tayo.sanni@yahoo.com (O. Sanni)." 
}, @@ -205,7 +249,24 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.11.134", + "start_index": 0 + }, + { + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.11.134", + "start_index": 0 + }, + { + "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 11 .", + "url": "https://doi.org/10.1016/j.dib.2018.11.134", + "start_index": 0 + } + ] }, "text": "https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." }, @@ -215,7 +276,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "452" }, @@ -225,7 +287,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451–457" }, @@ -235,7 +298,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "How data were acquired" }, @@ -245,7 +309,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Data format Experimental factors" }, @@ -255,7 +320,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Experimental features Data source location" }, @@ -265,7 +331,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Accessibility Related research article" }, @@ -275,7 +342,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230." 
}, @@ -285,7 +353,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Value of the data" }, @@ -295,7 +364,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment." }, @@ -305,7 +375,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments." }, @@ -315,7 +386,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:1) The data can be used to examine the relationship between the process variable as it affect the" }, @@ -325,7 +397,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "nature of inhibition of metals." }, @@ -335,7 +408,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "1. Data" }, @@ -345,7 +419,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "The results of the experiment are presented in this session. 
The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1–3 respectively. It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule" }, @@ -355,7 +430,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": ") g m" }, @@ -365,7 +441,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(" }, @@ -375,7 +452,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "s s o" }, @@ -385,7 +463,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "l" }, @@ -395,7 +474,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "t h g e W" }, @@ -405,7 +485,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "i" }, @@ -415,7 +496,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "30" }, @@ -425,7 +507,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "20" }, @@ -435,7 +518,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "10g 8g 6g 4g 2g Control" }, @@ -445,7 +529,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + 
"page_number": 2, + "links": [] }, "text": "10" }, @@ -455,7 +540,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "48" }, @@ -465,7 +551,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "96" }, @@ -475,7 +562,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "144" }, @@ -485,7 +573,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "192" }, @@ -495,7 +584,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Exposure Time (Hours)" }, @@ -505,7 +595,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Fig. 1. Weight loss versus exposure time for stainless steel presence of ES." }, @@ -515,7 +606,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "immersed in 0.5 M H2SO4 solution in the absence and" }, @@ -525,7 +617,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451–457" }, @@ -535,7 +628,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "453" }, @@ -545,7 +639,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "2.7" }, @@ -555,7 +650,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": ") r a e y / m m" }, @@ -565,7 +661,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "( e t a r n o s o r r o C" }, @@ -575,7 +672,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "i" }, @@ -585,7 +683,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "1.8" }, @@ -595,7 +694,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "0.9" }, @@ -605,7 +705,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "10g 8g 6g 4g 2g Control" }, @@ -615,7 +716,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "24" }, @@ -625,7 +727,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "48" }, @@ -635,7 +738,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "72" }, @@ -645,7 +749,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "96" }, @@ -655,7 +760,8 @@ 
"metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "120" }, @@ -665,7 +771,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "144" }, @@ -675,7 +782,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "168" }, @@ -685,7 +793,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "192" }, @@ -695,7 +804,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Exposure time" }, @@ -705,7 +815,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES." 
}, @@ -715,7 +826,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "100" }, @@ -725,7 +837,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "90" }, @@ -735,7 +848,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": ")" }, @@ -745,7 +859,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "%" }, @@ -755,7 +870,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "(" }, @@ -765,7 +881,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "y c n e c i f f" }, @@ -775,7 +892,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "i" }, @@ -785,7 +903,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "E n o i t i b h n I" }, @@ -795,7 +914,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "i" }, @@ -805,7 +925,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "80" }, @@ -815,7 +936,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "70" }, @@ -825,7 +947,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "60" }, @@ -835,7 +958,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - 
"page_number": 3 + "page_number": 3, + "links": [] }, "text": "50" }, @@ -845,7 +969,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "40" }, @@ -855,7 +980,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "30" }, @@ -865,7 +991,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "2g 4g 6g 8g 10g" }, @@ -875,7 +1002,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "20" }, @@ -885,7 +1013,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "10" }, @@ -895,7 +1024,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "0" }, @@ -905,7 +1035,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "20" }, @@ -915,7 +1046,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "40" }, @@ -925,7 +1057,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "60" }, @@ -935,7 +1068,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "80" }, @@ -945,7 +1079,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "100" }, @@ -955,7 +1090,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "120" }, @@ -965,7 +1101,8 @@ "metadata": { 
"data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "140" }, @@ -975,7 +1112,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "160" }, @@ -985,7 +1123,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "180" }, @@ -995,7 +1134,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Exposure Time (Hours)" }, @@ -1005,7 +1145,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Fig. 3. Inhibition efficiency versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the presence of ES." }, @@ -1015,7 +1156,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "number of inhibitor adsorbed on the surface of stainless steel at higher concentration, in order for the active sites of the stainless steel to be protected with the inhibitor molecules. Cathodic and anodic polarized potential are measured in the presence and absence of ES. Fig. 4 shows the cathodic and anodic polarization curves for stainless steel in 0.5 M H2SO4 solution at different ES concentrations. The electrochemical variables such as polarization resistance (PR), corrosion potential (Ecorr), cor- rosion current (icorr), anodic Tafel constant (ba), cathodic Tafel constant (bc) and corrosion rate (mm/ year) values are presented in Table 1. From the polarization curves and electrochemical parameter, icorr value decreased with the addition of inhibitor in 0.5 M H2SO4. 
Conversely, the icorr further decrease with an increase in inhibitor concentration indicating that the inhibition effects increase with an increase in the egg shell concentration. The process of egg shell inhibition could be attributed to the formation of egg shell powder adsorbed on stainless steel surface protecting corrosion of stainless steel in H2SO4 medium. The likely mechanism is the egg shell adsorption on stainless steel surface through the heteroatoms electron pair and the conjugated systems in egg shell molecular structure as shown in Fig. 1. When the concentration of inhibitor was increased from 2 to 10 g, the corrosion rate values drastically decreased this result show that waste egg shell powder is an effective corrosion inhibitor for stainless steel in H2SO4 solution. The shift in corrosion potential of stainless steel from Tafel curves and electrochemical data indicate that the inhibitor is a mixed-type corrosion inhibitor." }, @@ -1025,7 +1167,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "454" }, @@ -1035,7 +1178,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, @@ -1045,7 +1189,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO4 solution in the presence and absence of ES." }, @@ -1055,7 +1200,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution." 
}, @@ -1065,7 +1211,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Inhibitor concentration (g)" }, @@ -1075,7 +1222,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "bc (V/dec)" }, @@ -1085,7 +1233,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "ba (V/dec)" }, @@ -1095,7 +1244,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Ecorr (V)" }, @@ -1105,7 +1255,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "icorr (A/cm2)" }, @@ -1115,7 +1266,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Polarization resistance (Ω)" }, @@ -1125,7 +1277,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Corrosion rate (mm/year)" }, @@ -1135,7 +1288,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "0 2 4 6 8 10" }, @@ -1145,7 +1299,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "0.0335 1.9460 0.0163 0.3233 0.1240 0.0382" }, @@ -1155,7 +1310,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "0.0409 0.0596 0.2369 0.0540 0.0556 0.0086" }, @@ -1165,7 +1321,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 
0.5896 (cid:3) 0.5356" }, @@ -1175,7 +1332,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05" }, @@ -1185,7 +1343,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "24.0910 121.440 42.121 373.180 305.650 246.080" }, @@ -1195,7 +1354,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2.8163 1.5054 0.9476 0.4318 0.3772 0.0919" }, @@ -1205,7 +1365,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6–8 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8." 
}, @@ -1215,7 +1376,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "12" }, @@ -1225,7 +1387,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "C/0" }, @@ -1235,7 +1398,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "10" }, @@ -1245,7 +1409,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "8" }, @@ -1255,7 +1420,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "0 / C" }, @@ -1265,7 +1431,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "6" }, @@ -1275,7 +1442,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "4" }, @@ -1285,7 +1453,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2" }, @@ -1295,7 +1464,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2" }, @@ -1305,7 +1475,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "4" }, @@ -1315,7 +1486,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "6" }, @@ -1325,7 +1497,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "8" }, @@ -1335,7 +1508,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 
+ "page_number": 4, + "links": [] }, "text": "10" }, @@ -1345,7 +1519,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Concentration (g)" }, @@ -1355,7 +1530,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Fig. 5. Langmuir adsorption isotherm of ES." }, @@ -1365,7 +1541,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, @@ -1375,7 +1552,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "455" }, @@ -1385,7 +1563,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Fig. 6. SEM/EDX image of as-received stainless steel." }, @@ -1395,7 +1574,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Fig. 7. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution without inhibitor." }, @@ -1405,7 +1585,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Fig. 8. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution with the presence of inhibitor." }, @@ -1415,7 +1596,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "456" }, @@ -1425,7 +1607,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451–457" }, @@ -1435,7 +1618,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "2. Experimental design, materials and methods" }, @@ -1445,7 +1629,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "2.1. Material" }, @@ -1455,7 +1640,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9." }, @@ -1465,7 +1651,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Fig. 9. Chemical structure of egg shell powder." }, @@ -1475,7 +1662,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "2.2. Weight loss method" }, @@ -1485,7 +1673,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "This physical measurement was carried out in order to provide direct result on how the corrosive environment affects the test sample. The cleaned and weighed specimen was suspended in beakers with the aid of glass hooks and rods with the test solution of ES at different concentration (2, 4, 6, 8 and 10 g). The pre-weighed specimen was retrieved from the test solution after every 24 h, cleaned, dried and reweighed. The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss which was used to calculate corrosion rate and inhibition efficiency." 
}, @@ -1495,7 +1684,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "The corrosion rate (CR) was calculated using Eq. (1) [1–5]" }, @@ -1505,7 +1695,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Corrosion rate CRð" }, @@ -1515,7 +1706,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "(cid:1) Þ ¼ 87:6W DAT" }, @@ -1525,7 +1717,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "(cid:3)" }, @@ -1535,7 +1728,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "ð1Þ" }, @@ -1545,7 +1739,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "where: W is weight loss in mg, A is specimen surface area, T is immersion period in hours and D is the specimen density. From the corrosion rate, the surface coverage (θ) and inhibition efficiencies (IE %) were determined using Eqs. 
(2) and (3) respectively" }, @@ -1555,7 +1750,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "θ ¼ CRo (cid:3) CR" }, @@ -1565,7 +1761,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "CRo" }, @@ -1575,7 +1772,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "ð2Þ" }, @@ -1585,7 +1783,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "IE ð%Þ ¼ CRo (cid:3) CR" }, @@ -1595,7 +1794,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "CRo" }, @@ -1605,7 +1805,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "x" }, @@ -1615,7 +1816,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "100 1" }, @@ -1625,7 +1827,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "ð3Þ" }, @@ -1635,7 +1838,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "where: CRo and CR are the corrosion rate in absence and presence of inhibitor respectively." }, @@ -1645,7 +1849,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "2.3. 
Potentiodynamic polarization method" }, @@ -1655,7 +1860,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern–Geary equation, and the" }, @@ -1665,7 +1871,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, @@ -1675,7 +1882,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "457" }, @@ -1685,7 +1893,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). LSV staircase parameter start potential (cid:3) 1.5 v, step potential 0.001 m/s and stop potential of þ1.5 v set was used in this study." 
}, @@ -1695,7 +1904,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Acknowledgements" }, @@ -1705,7 +1915,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "This work was supported by the National Research Foundation of South Africa and the Tshwane" }, @@ -1715,7 +1926,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "University of Technology Pretoria South Africa." }, @@ -1725,7 +1937,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Transparency document. Supporting information" }, @@ -1735,7 +1948,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": ":// doi", + "url": "https://doi.org/10.1016/j.dib.2018.11.134", + "start_index": 94 + } + ] }, "text": "Transparency document associated with this article can be found in the online version at https://doi." }, @@ -1745,7 +1965,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "org / 10 . 1016 / j . dib . 2018 . 11 . 134", + "url": "https://doi.org/10.1016/j.dib.2018.11.134", + "start_index": 0 + } + ] }, "text": "org/10.1016/j.dib.2018.11.134." }, @@ -1755,7 +1982,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "References" }, @@ -1765,7 +1993,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "O . Sanni , A . P . I . Popoola , O . S . I . 
Fayomi ,", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", + "start_index": 4 + } + ] }, "text": "[1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution" }, @@ -1775,7 +2010,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "usingeco - friendlywasteproduct , ResultsPhys . 9 ( 2018 ) 225 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", + "start_index": 0 + }, + { + "text": "usingeco - friendlywasteproduct , ResultsPhys . 9 ( 2018 ) 225 – 230", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", + "start_index": 0 + } + ] }, "text": "using eco-friendly waste product, Results Phys. 9 (2018) 225–230." }, @@ -1785,7 +2032,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "O . Sanni , A . P . I . Popoola , A . Kolesnikov ,", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", + "start_index": 4 + } + ] }, "text": "[2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion" }, @@ -1795,7 +2049,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "inhibitionofausteniticstainlesssteel ( Type316 )/ acidicmedium , Mater . Res . Express . 5 ( 10 )( 2018 ) 1 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", + "start_index": 0 + }, + { + "text": "inhibitionofausteniticstainlesssteel ( Type316 )/ acidicmedium , Mater . Res . Express . 5 ( 10 )( 2018 ) 1 – 15", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", + "start_index": 0 + } + ] }, "text": "inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1–15." 
}, @@ -1805,7 +2071,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi ,", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", + "start_index": 4 + } + ] }, "text": "[3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel" }, @@ -1815,7 +2088,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "corrosioninchloridesolution , Def . Technol . 14 ( 2018 ) 463 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", + "start_index": 0 + }, + { + "text": "corrosioninchloridesolution , Def . Technol . 14 ( 2018 ) 463 – 468", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", + "start_index": 0 + } + ] }, "text": "corrosion in chloride solution, Def. Technol. 14 (2018) 463–468." }, @@ -1825,7 +2110,29 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "https", + "url": "https://doi.org/10.1007/s13632-018-0495-5", + "start_index": 233 + }, + { + "text": "https", + "url": "https://doi.org/10.1007/s13632-018-0495-5", + "start_index": 233 + }, + { + "text": "https :// doi . org / 10 . 1007", + "url": "https://doi.org/10.1007/s13632-018-0495-5", + "start_index": 233 + }, + { + "text": "s13632 - 018 - 0495 - 5", + "url": "https://doi.org/10.1007/s13632-018-0495-5", + "start_index": 258 + } + ] }, "text": "[4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1–17. https://doi.org/10.1007/ s13632-018-0495-5." 
}, @@ -1835,7 +2142,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [ + { + "text": "https :// doi . org / 10 . 7449 / 2018 / MST _ 2018 _ 254 _ 261", + "url": "https://doi.org/10.7449/2018/MST_2018_254_261", + "start_index": 202 + } + ] }, "text": "[5] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. 〈https://doi.org/10.7449/2018/MST_2018_254_261〉." } diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json index c6ebb46fe7..bf9e4bf189 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json @@ -5,7 +5,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "DatainBrief22 ( 2019 ) 484 –", + "url": "https://doi.org/10.1016/j.dib.2018.12.055", + "start_index": 0 + } + ] }, "text": "Data in Brief 22 (2019) 484–487" }, @@ -15,7 +22,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "", + "url": "www.sciencedirect.com/science/journal/23523409", + "start_index": -1 + } + ] }, "text": "Contents lists available at ScienceDirect" }, @@ -25,7 +39,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Data in Brief" }, @@ -35,7 +50,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { 
+ "text": "www . elsevier . com / locate /", + "url": "www.elsevier.com/locate/dib", + "start_index": 18 + } + ] }, "text": "journal homepage: www.elsevier.com/locate/dib" }, @@ -45,7 +67,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Data Article" }, @@ -55,7 +78,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "A benchmark dataset for the multiple depot vehicle scheduling problem" }, @@ -65,7 +89,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. Ernst c, Rahul Patil b" }, @@ -75,7 +100,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India" }, @@ -85,7 +111,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "a r t i c l e i n f o" }, @@ -95,7 +122,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "a b s t r a c t" }, @@ -105,7 +133,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Article 
history: Received 21 November 2018 Received in revised form 13 December 2018 Accepted 15 December 2018 Available online 18 December 2018" }, @@ -115,7 +144,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "This data article presents a description of a benchmark dataset for the multiple depot vehicle scheduling problem (MDVSP). The MDVSP is to assign vehicles from different depots to timetabled trips to minimize the total cost of empty travel and waiting. The dataset has been developed to evaluate the heuristics of the MDVSP that are presented in “A new formulation and a column generation-based heuristic for the multiple depot vehicle sche- duling problem” (Kulkarni et al., 2018). The dataset contains 60 problem instances of varying size. Researchers can use the dataset to evaluate the future algorithms for the MDVSP and compare the performance with the existing algorithms. The dataset includes a program that can be used to generate new problem instances of the MDVSP." }, @@ -125,7 +155,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." }, @@ -135,7 +166,24 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . nCorrespondingauthorat", + "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", + "start_index": 25 + }, + { + "text": "https", + "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", + "start_index": 25 + }, + { + "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 
11 .", + "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", + "start_index": 25 + } + ] }, "text": "DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007 n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India." }, @@ -145,7 +193,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "sarangkulkarni @ iitb . ac . in", + "url": "mailto:sarangkulkarni@iitb.ac.in", + "start_index": 16 + } + ] }, "text": "E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni)." }, @@ -155,7 +210,24 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [ + { + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.12.055", + "start_index": 0 + }, + { + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.12.055", + "start_index": 0 + }, + { + "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 12 .", + "url": "https://doi.org/10.1016/j.dib.2018.12.055", + "start_index": 0 + } + ] }, "text": "https://doi.org/10.1016/j.dib.2018.12.055 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." }, @@ -165,7 +237,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "S. Kulkarni et al. 
/ Data in Brief 22 (2019) 484–487" }, @@ -175,7 +248,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "485" }, @@ -185,7 +259,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Specifications table" }, @@ -195,7 +270,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired" }, @@ -205,7 +281,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Data format Experimental factors" }, @@ -215,7 +292,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Experimental features Data source location Data accessibility Related research article" }, @@ -225,7 +303,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [ + { + "text": ".,", + "url": "https://orlib.uqcloud.net/", + "start_index": 444 + } + ] }, "text": "Tables, text files Artificially generated by a C þ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3]." 
}, @@ -235,7 +320,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Value of the data" }, @@ -245,7 +331,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" }, @@ -255,7 +342,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing" }, @@ -265,7 +353,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "performance of the algorithms for the MDVSP." }, @@ -275,7 +364,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" }, @@ -285,7 +375,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "mathematical formulations." }, @@ -295,7 +386,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." }, @@ -305,7 +397,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "be used for the comparison." }, @@ -315,7 +408,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "1. 
Data" }, @@ -325,7 +419,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [ + { + "text": "2500 , and3000 . size , ðm ; nÞ , fiveinstancesareprovided . Thedatasetcanbedownloadedfromhttps :// orlib . uqcloud . net", + "url": "https://orlib.uqcloud.net", + "start_index": 509 + } + ] }, "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number ‘RN-8–1500-01.dat’, for is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." }, @@ -335,7 +436,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "the size," }, @@ -345,7 +447,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "‘ðm; nÞ’," }, @@ -355,7 +458,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "respectively. 
For example," }, @@ -365,7 +469,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "the problem instance," }, @@ -375,7 +480,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "For each problem instance, the following information is provided: The number of depots mð The number of trips ðnÞ, The number of locations ðlÞ, The number of vehicles at each depot, For each trip i A 1; 2; …; n, a start time, ts" }, @@ -385,7 +491,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Þ," }, @@ -395,7 +502,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "i , an end time, te" }, @@ -405,7 +513,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "i , a start location, ls" }, @@ -415,7 +524,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "i , and an end location, le i ," }, @@ -425,7 +535,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "and" }, @@ -435,7 +546,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "(cid:2) The travel time, δij, between any two locations i; j A 1; …; l." }, @@ -445,27 +557,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "All times are in minutes and integers. The planning duration is from 5 a.m. to around midnight. 
Each instance has two classes of trips, short trips and long trips, with 40% short trips and 60% long trips. The duration of a short trip is less than a total of 45 min and the travel time between the start" }, - { - "type": "UncategorizedText", - "element_id": "86b700fab5db37977a73700b53a0654b", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "486" - }, { "type": "NarrativeText", "element_id": "0a1b09ff562f4d063703cbf021ee297f", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "S. Kulkarni et al. / Data in Brief 22 (2019) 484–487" }, @@ -475,7 +579,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. For all instances, m r l and the locations 1; …; m correspond to depots, while the remaining locations only appear as trip start and end locations." }, @@ -485,7 +590,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "j , the vehicle must travel empty from le j (cid:3)te i Þ. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. 
The following requirements must be satisfied:" }, @@ -495,7 +601,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "A trip j can be covered after trip i by the same vehicle, if ts j" }, @@ -505,7 +612,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "i to ls" }, @@ -515,7 +623,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": ". If le i ls le i j , otherwise, the vehicle may require waiting at le i for the duration of ðts" }, @@ -525,7 +634,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Z te" }, @@ -535,7 +645,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "i þδ" }, @@ -545,7 +656,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "j" }, @@ -555,7 +667,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "a ls" }, @@ -565,7 +678,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] + }, + "text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. The number of schedules that start from a depot should not exceed the number of vehicles at the depot." + }, + { + "type": "ListItem", + "element_id": "2d6b506bd58a7dd7bbf1c8599ef630c8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3, + "links": [] }, "text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. 
The number of schedules that start from a depot should not exceed the number of vehicles at the depot." }, @@ -575,7 +700,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule." }, @@ -585,7 +711,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. 
The next l lines present the travel times between any two locations, i; jA 1; …; l" }, @@ -595,7 +722,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "f" }, @@ -605,7 +733,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "(cid:1)" }, @@ -615,7 +744,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "(cid:3)" }, @@ -625,7 +755,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "." }, @@ -635,7 +766,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." }, @@ -645,7 +777,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Table 1 Average number of locations, times, vehicles and empty travels for each instance size." 
}, @@ -655,7 +788,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Instance size (m, n)" }, @@ -665,7 +799,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Average number of" }, @@ -675,7 +810,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Locations" }, @@ -685,7 +821,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Times" }, @@ -695,7 +832,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Vehicles" }, @@ -705,7 +843,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Possible empty travels" }, @@ -715,7 +854,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "(8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000)" }, @@ -725,7 +865,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20" }, @@ -735,7 +876,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "975.20 1048.00 1078.00 1113.20 994.00 1040.60 1081.00 1107.40 985.40 1040.60 1083.20 1101.60" }, @@ -745,7 +887,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 
1076.40 1284.60" }, @@ -755,7 +898,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60" }, @@ -765,7 +909,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "S. Kulkarni et al. / Data in Brief 22 (2019) 484–487" }, @@ -775,7 +920,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "487" }, @@ -785,7 +931,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Table 2 Description of file format for each problem instance." }, @@ -795,7 +942,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Number of lines" }, @@ -805,7 +953,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Number of columns in each line" }, @@ -815,7 +964,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Description" }, @@ -825,7 +975,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "1 1 n" }, @@ -835,7 +986,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "l" }, @@ -845,7 +997,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "3 m 4" }, @@ -855,7 +1008,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + 
"page_number": 4, + "links": [] }, "text": "l" }, @@ -865,7 +1019,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j." }, @@ -875,7 +1030,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "i , the end location le" }, @@ -885,7 +1041,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "i , the start" }, @@ -895,7 +1052,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2. Experimental design, materials, and methods" }, @@ -905,7 +1063,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "The procedure presented by Carpaneto et al. in [1] is used to generate the problem instances. The same procedure has been used by Pepin et al. in [4] to generate the benchmark dataset of the MDVSP. A detailed description of the procedure is presented in [3]." }, @@ -915,7 +1074,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Our dataset provides start/end location and time of trips as well as the travel time between any two locations. The location and time information is required to model the MDVSP on a time-space network. The feasible connections and the cost of connections between the trips can be obtained as discussed in [3]. 
Thus, the dataset has all the information that is required to model the MDVSP on the time-space network (see [2]) as well as the connection-network (see [5]). The benchmark solutions for all the problem instances are presented in [3]." }, @@ -925,7 +1085,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Transparency document. Supporting information" }, @@ -935,7 +1096,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": ":// doi", + "url": "https://doi.org/10.1016/j.dib.2018.12.055", + "start_index": 94 + } + ] }, "text": "Transparency document associated with this article can be found in the online version at https://doi." }, @@ -945,7 +1113,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "org / 10 . 1016 / j . dib . 2018 . 12 . 055", + "url": "https://doi.org/10.1016/j.dib.2018.12.055", + "start_index": 0 + } + ] }, "text": "org/10.1016/j.dib.2018.12.055." }, @@ -955,7 +1130,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "References" }, @@ -965,7 +1141,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "G . Carpaneto , M . Dell ' Amico , M . Fischetti , P . Toth ,", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", + "start_index": 4 + } + ] }, "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling" }, @@ -975,7 +1158,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "N . Kliewer , T . Mellouli , L . 
Suhl , Atime – spacenetworkbasedexactoptimizationmodelformulti - depotbusscheduling , Eur", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", + "start_index": 4 + } + ] }, "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur." }, @@ -985,7 +1175,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "problem , Networks19 ( 5 )( 1989 ) 531 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", + "start_index": 0 + }, + { + "text": "problem , Networks19 ( 5 )( 1989 ) 531 – 548", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", + "start_index": 0 + } + ] }, "text": "problem, Networks 19 (5) (1989) 531–548." }, @@ -995,7 +1197,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "J . Oper . Res . 175 ( 3 )( 2006 ) 1616 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", + "start_index": 0 + }, + { + "text": "J . Oper . Res . 175 ( 3 )( 2006 ) 1616 – 1627", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", + "start_index": 0 + } + ] }, "text": "J. Oper. Res. 175 (3) (2006) 1616–1627." }, @@ -1005,7 +1219,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "S . Kulkarni , M . Krishnamoorthy , A . Ranade , A . T . Ernst , R . Patil , Anewformulationandacolumngeneration -", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", + "start_index": 4 + } + ] }, "text": "[3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. 
Patil, A new formulation and a column generation-based heuristic" }, @@ -1015,7 +1236,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "forthemultipledepotvehicleschedulingproblem , Transp . Res . PartBMethodol . 118 ( 2018 ) 457 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", + "start_index": 0 + }, + { + "text": "forthemultipledepotvehicleschedulingproblem , Transp . Res . PartBMethodol . 118 ( 2018 ) 457 – 487", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", + "start_index": 0 + } + ] }, "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487." }, @@ -1025,7 +1258,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "A . S . Pepin , G . Desaulniers , A . Hertz , D . Huisman ,", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", + "start_index": 4 + } + ] }, "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling" }, @@ -1035,7 +1275,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "problem , J . Sched . 12 ( 1 )( 2009 ) 17", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", + "start_index": 0 + } + ] }, "text": "problem, J. Sched. 12 (1) (2009) 17." }, @@ -1045,7 +1292,14 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "C . C . Ribeiro , F . Soumis , Acolumngenerationapproachtothemultiple - depotvehicleschedulingproblem , Oper . Res . 42 ( 1", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", + "start_index": 4 + } + ] }, "text": "[5] C.C. Ribeiro, F. 
Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1)" }, @@ -1055,7 +1309,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [ + { + "text": "( 1994 ) 41 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", + "start_index": 0 + }, + { + "text": "( 1994 ) 41 – 52", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", + "start_index": 0 + } + ] }, "text": "(1994) 41–52." } diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index 2f4b6f5b34..5844d4e791 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -5,7 +5,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "S32" }, @@ -15,7 +16,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Poster Session I" }, @@ -25,7 +27,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%, p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). 
There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration. The timing of lurasidone administration could be considered in effort to minimize potential adverse events." }, @@ -35,7 +38,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Discussion: Our data confirm previous findings on reduced slow wave density in FEP, and expand them to acute subjects, before any treatment is prescribed. This is in line with available data on diffuse abnormalities of cortico-cortical and cortico-thalamic networks in these patients. Interestingly, our data also offer preliminary evidence that this deficit is specific for SCZ, as it appears to differentiate patients who developed SCZ from those with other diagnoses at follow-up. Given the traveling properties of slow waves, future research should establish their potential as markers of connectivity in SCZ." }, @@ -45,7 +49,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "S7. 
INVESTIGATING THE LINK BETWEEN THE PERIPHERAL ENDOCANNABINOID SYSTEM AND CENTRAL GLUTAMATERGIC NEUROTRANSMISSION IN EARLY PSYCHOSIS: A 7T-MRS STUDY" }, @@ -55,7 +60,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "S6. SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NAÏVE, FIRST EPISODE PSYCHOSIS PATIENTS" }, @@ -65,7 +71,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy" }, @@ -75,7 +82,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. 
Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC." 
}, @@ -85,7 +93,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford" }, @@ -95,7 +104,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. 
Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design." }, @@ -105,7 +115,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "S8. 
GRIN1 PROMOTER METHYLATION CHANGES IN BLOOD OF EARLY-ONSET PSYCHOTIC PATIENTS AND UNAFFECTED SIBLINGS WITH CHILDHOOD TRAUMA" }, @@ -115,7 +126,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2," }, @@ -125,7 +137,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "AQ3" }, @@ -135,7 +148,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "SIRS 2020 Abstracts" } diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index 9b293a695e..8a2764011f 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -5,7 +5,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "INTERNATIONAL MONETARY FUND" }, @@ -15,7 +16,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "WORLD ECONOMIC OUTLOOK UPDATE Inflation Peaking amid Low Growth" }, @@ -25,39 +27,43 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "2023 JAN" }, { - "type": "Title", - "element_id": "85e4ff3addb38328ecc08ec49759def7", + "type": "ListItem", + "element_id": "c4e0168ffab999611a92e8ebd8fe48a9", 
"metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, - "text": "Inflation Peaking amid Low Growth" + "text": "The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress." }, { "type": "ListItem", - "element_id": "f1d5f4ed63a14db581e985bf15416cdd", + "element_id": "c4e0168ffab999611a92e8ebd8fe48a9", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, - "text": "Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000–19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way for a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017–19) levels of about 3.5 percent." + "text": "The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. 
On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress." }, { "type": "ListItem", - "element_id": "c4e0168ffab999611a92e8ebd8fe48a9", + "element_id": "5e9b501fc056965a744f6598d022f31d", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, - "text": "The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress." + "text": "In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. 
Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." }, { "type": "ListItem", @@ -65,7 +71,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." }, @@ -75,7 +82,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "The global fight against inflation, Russia’s war in Ukraine, and a resurgence of COVID-19 in China weighed on global economic activity in 2022, and the first two factors will continue to do so in 2023." }, @@ -85,7 +93,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Despite these headwinds, real GDP was surprisingly strong in the third quarter of 2022 in numerous economies, including the United States, the euro area, and major emerging market and developing economies. 
The sources of these surprises were in many cases domestic: stronger-than-expected private consumption and investment amid tight labor markets and greater-than-anticipated fiscal support. Households spent more to satisfy pent-up demand, particularly on services, partly by drawing down their stock of savings as economies reopened. Business investment rose to meet demand. On the supply side, easing bottlenecks and declining transportation costs reduced pressures on input prices and allowed for a rebound in previously constrained sectors, such as motor vehicles. Energy markets have adjusted faster than expected to the shock from Russia’s invasion of Ukraine." }, @@ -95,7 +104,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "In the fourth quarter of 2022, however, this uptick is estimated to have faded in most—though not all––major economies. US growth remains stronger than expected, with consumers continuing to spend from their stock of savings (the personal saving rate is at its lowest in more than 60 years, except for July 2005), unemployment near historic lows, and plentiful job opportunities. But elsewhere, high-frequency activity indicators (such as business and consumer sentiment, purchasing manager surveys, and mobility indicators) generally point to a slowdown." }, @@ -105,7 +115,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "COVID-19 deepens China’s slowdown. Economic activity in China slowed in the fourth quarter amid multiple large COVID-19 outbreaks in Beijing and other densely populated localities. Renewed lockdowns accompanied the outbreaks until the relaxation of COVID-19 restrictions in November and December, which paved the way for a full reopening. 
Real estate investment continued to contract, and developer restructuring is proceeding slowly, amid the lingering property market crisis. Developers have yet to deliver on a large backlog of presold housing, and downward pressure is building on house prices (so far limited by home price floors). The authorities have responded with additional monetary and fiscal policy easing, new vaccination targets for the elderly, and steps to support the completion of unfinished real estate projects. However, consumer and business sentiment remained subdued in late 2022. China’s slowdown has reduced global trade growth and international commodity prices." }, @@ -115,7 +126,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Monetary policy starts to bite. Signs are apparent that monetary policy tightening is starting to cool demand and inflation, but the full impact is unlikely to be realized before 2024. Global headline inflation appears to have peaked in the third quarter of 2022 (Figure 1). Prices of fuel and nonfuel commodities have declined, lowering headline inflation, notably in the United States, the euro area, and Latin America. But underlying (core) inflation has not yet peaked in most economies and remains well above pre-pandemic levels. It has persisted amid second-round effects from earlier cost shocks and tight labor markets with robust wage growth as consumer demand has remained resilient. Medium-term inflation expectations generally remain anchored, but some gauges are up. These developments have caused central banks to raise rates faster than expected, especially in the United States and the euro area, and to signal that rates will stay elevated for longer. Core inflation is declining in some economies that have completed their tightening cycle—such as Brazil. 
Financial markets are displaying high sensitivity to inflation news, with equity markets rising following recent releases of lower inflation data in anticipation of interest rate cuts (Box 1), despite central banks’ communicating their resolve to tighten policy further. With the peak in US headline inflation and an acceleration in rate hikes by several non-US central banks, the dollar has weakened since September but remains significantly stronger than a year ago." }, @@ -125,7 +137,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Figure 1. Twin Peaks? Headline and Core Inflation (Percent, year over year)" }, @@ -135,7 +148,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "18 16 14 12 10 8 6 4 2 0 –2" }, @@ -145,7 +159,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "–2" }, @@ -155,7 +170,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Sources: Haver Analytics; and IMF staff calculations. Note: The figure shows the developments in headline and core inflation across 18 advanced economies and 17 emerging market and developing economies. Core inflation is the change in prices for goods and services, but excluding those for food and energy (or the closest available measure). For the euro area (and other European countries for which the data are available), energy, food, alcohol, and tobacco are excluded. The gray bands depict the 10th to 90th percentiles of inflation across economies." 
}, @@ -165,7 +181,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "16 14 12 10 8 6 4 2 0" }, @@ -175,7 +192,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 2019" }, @@ -185,7 +203,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 2019" }, @@ -195,7 +214,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "1. Headline Inflation" }, @@ -205,7 +225,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "2. Core Inflation" }, @@ -215,7 +236,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Median country Brazil" }, @@ -225,7 +247,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 19" }, @@ -235,7 +258,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 19" }, @@ -245,7 +269,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 20" }, @@ -255,7 +280,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 20" }, @@ -265,7 +291,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 20" }, @@ -275,7 +302,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 
20" }, @@ -285,7 +313,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "United States" }, @@ -295,7 +324,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 21" }, @@ -305,7 +335,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 21" }, @@ -315,7 +346,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 21" }, @@ -325,7 +357,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 21" }, @@ -335,7 +368,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 22" }, @@ -345,7 +379,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jan. 22" }, @@ -355,7 +390,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Euro area" }, @@ -365,7 +401,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 22" }, @@ -375,7 +412,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Jul. 22" }, @@ -385,7 +423,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Nov. 22" }, @@ -395,7 +434,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Nov. 
22" }, @@ -405,7 +445,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Winter comes to Europe. European economic growth in 2022 was more resilient than expected in the face of the large negative terms-of-trade shock from the war in Ukraine. This resilience––which is" }, @@ -415,7 +456,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "visible in consumption and investment data for the third quarter––partly reflects government support of about 1.2 percent of European Union GDP (net budgetary cost) to households and firms hit by the energy crisis, as well as dynamism from economies reopening. Gas prices have declined by more than expected amid higher non-Russian pipeline and liquefied natural gas flows, compression of demand for gas, and a warmer-than-usual winter. However, the boost from reopening appears to be fading. High-frequency indicators for the fourth quarter suggest that the manufacturing and services sectors are contracting. Consumer confidence and business sentiment have worsened. With inflation at about 10 percent or above in several euro area countries and the United Kingdom, household budgets remain stretched. The accelerated pace of rate increases by the Bank of England and the European Central Bank is tightening financial conditions and cooling demand in the housing sector and beyond." }, @@ -425,7 +467,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Global growth, estimated at 3.4 percent in 2022, is projected to fall to 2.9 percent in 2023 before rising to 3.1 percent in 2024 (Table 1). 
Compared with the October forecast, the estimate for 2022 and the forecast for 2023 are both higher by about 0.2 percentage point, reflecting positive surprises and greater-than-expected resilience in numerous economies. Negative growth in global GDP or global GDP per capita—which often happens when there is a global recession—is not expected. Nevertheless, global growth projected for 2023 and 2024 is below the historical (2000–19) annual average of 3.8 percent." }, @@ -435,7 +478,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "The forecast of low growth in 2023 reflects the rise in central bank rates to fight inflation–– especially in advanced economies––as well as the war in Ukraine. The decline in growth in 2023 from 2022 is driven by advanced economies; in emerging market and developing economies, growth is estimated to have bottomed out in 2022. Growth is expected to pick up in China with the full reopening in 2023. The expected pickup in 2024 in both groups of economies reflects gradual recovery from the effects of the war in Ukraine and subsiding inflation. Following the path of global demand, world trade growth is expected to decline in 2023 to 2.4 percent, despite an easing of supply bottlenecks, before rising to 3.4 percent in 2024." }, @@ -445,7 +489,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "These forecasts are based on a number of assumptions, including on fuel and nonfuel commodity prices, which have generally been revised down since October, and on interest rates, which have been revised up. In 2023, oil prices are projected to fall by about 16 percent, while nonfuel commodity prices are expected to fall by, on average, 6.3 percent. 
Global interest rate assumptions are revised up, reflecting intensified actual and signaled policy tightening by major central banks since October." }, @@ -455,7 +500,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "For advanced economies, growth is projected to decline sharply from 2.7 percent in 2022 to 1.2 percent in 2023 before rising to 1.4 percent in 2024, with a downward revision of 0.2 percentage point for 2024. About 90 percent of advanced economies are projected to see a decline in growth in 2023." }, @@ -465,7 +511,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "" }, @@ -475,39 +522,32 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "In the United States, growth is projected to fall from 2.0 percent in 2022 to 1.4 percent in 2023 and 1.0 percent in 2024. With growth rebounding in the second half of 2024, growth in 2024 will be faster than in 2023 on a fourth-quarter-over-fourth-quarter basis, as in most advanced" }, - { - "type": "NarrativeText", - "element_id": "70f05b9620aa1b7236058898e7e59192", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "economies. There is a 0.4 percentage point upward revision for annual growth in 2023, reflecting carryover effects from domestic demand resilience in 2022, but a 0.2 percentage point downward revision of growth in 2024 due to the steeper path of Federal Reserve rate hikes, to a peak of about 5.1 percent in 2023." 
- }, { "type": "ListItem", - "element_id": "fd6c549473e196512c076844988f465c", + "element_id": "3be6554964c172468cceaee89294f59d", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, - "text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers." + "text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets." }, { "type": "ListItem", - "element_id": "3be6554964c172468cceaee89294f59d", + "element_id": "b24771387a5318eeda21adaa49629186", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, - "text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets." + "text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." 
}, { "type": "ListItem", @@ -515,7 +555,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." }, @@ -525,7 +566,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "For emerging market and developing economies, growth is projected to rise modestly, from 3.9 percent in 2022 to 4.0 percent in 2023 and 4.2 percent in 2024, with an upward revision of 0.3 percentage point for 2023 and a downward revision of 0.1 percentage point for 2024. About half of emerging market and developing economies have lower growth in 2023 than in 2022." }, @@ -535,7 +577,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] + }, + "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2 percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. 
Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024." + }, + { + "type": "ListItem", + "element_id": "2ba41350ae3c684802f0e2b785c2d11b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 5, + "links": [] }, "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2 percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024." }, @@ -545,7 +599,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. 
This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" }, @@ -555,7 +610,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "" }, @@ -565,7 +621,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth." 
}, @@ -575,7 +632,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." }, @@ -585,7 +643,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "" }, @@ -595,7 +654,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. 
Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024––above pre-pandemic (2017–19) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies." }, @@ -605,7 +665,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024––above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017–19) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024––still high, but close to the pre-pandemic average." }, @@ -615,27 +676,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "The balance of risks to the global outlook remains tilted to the downside, with scope for lower growth and higher inflation, but adverse risks have moderated since the October 2022 World Economic Outlook." 
}, - { - "type": "UncategorizedText", - "element_id": "8f81c653cbf1334344d3063cb9f4de04", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Table 1. Overview of the World Economic Outlook Projections (Percent change, unless noted otherwise)" - }, { "type": "Title", "element_id": "d11a1c04bd3a9891350b4bd94104df58", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Year over Year" }, @@ -645,7 +698,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Difference from October 2022" }, @@ -655,7 +709,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Q4 over Q4 2/" }, @@ -665,7 +720,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2021" }, @@ -675,7 +731,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Estimate 2022" }, @@ -685,7 +742,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Projections 2023" }, @@ -695,7 +753,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2024" }, @@ -705,7 +764,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "WEO Projections 1/" }, @@ -715,7 +775,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2023" }, @@ -725,7 +786,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": 
"2024" }, @@ -735,7 +797,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Estimate 2022" }, @@ -745,7 +808,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Projections 2023" }, @@ -755,7 +819,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2024" }, @@ -765,7 +830,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "World Output" }, @@ -775,7 +841,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "6.2" }, @@ -785,7 +852,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.4" }, @@ -795,7 +863,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.9" }, @@ -805,7 +874,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.1" }, @@ -815,7 +885,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "0.2" }, @@ -825,7 +896,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–0.1" }, @@ -835,7 +907,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "1.9" }, @@ -845,7 +918,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.2" }, @@ -855,7 +929,8 @@ "metadata": { "data_source": {}, "filetype": 
"application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.0" }, @@ -865,7 +940,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Advanced Economies United States Euro Area" }, @@ -875,7 +951,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Germany France Italy Spain" }, @@ -885,7 +962,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Japan United Kingdom Canada Other Advanced Economies 3/" }, @@ -895,7 +973,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3" }, @@ -905,7 +984,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8" }, @@ -915,7 +995,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0" }, @@ -925,7 +1006,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4" }, @@ -935,7 +1017,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3" }, @@ -945,7 +1028,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2" }, @@ -955,7 +1039,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, 
+ "links": [] }, "text": "1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4" }, @@ -965,7 +1050,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1" }, @@ -975,7 +1061,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2" }, @@ -985,7 +1072,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Emerging Market and Developing Economies Emerging and Developing Asia" }, @@ -995,7 +1083,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "China India 4/" }, @@ -1005,7 +1094,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Emerging and Developing Europe" }, @@ -1015,7 +1105,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Russia" }, @@ -1025,7 +1116,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Latin America and the Caribbean" }, @@ -1035,7 +1127,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Middle East and Central Asia" }, @@ -1045,7 +1138,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Brazil Mexico" }, @@ -1055,7 +1149,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Saudi Arabia Sub-Saharan Africa" }, @@ -1065,7 +1160,8 @@ "metadata": { "data_source": {}, 
"filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Nigeria South Africa" }, @@ -1075,7 +1171,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9" }, @@ -1085,7 +1182,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6" }, @@ -1095,7 +1193,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2" }, @@ -1105,7 +1204,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3" }, @@ -1115,7 +1215,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1" }, @@ -1125,7 +1226,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0" }, @@ -1135,7 +1237,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0" }, @@ -1145,7 +1248,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 
3.1 0.5" }, @@ -1155,7 +1259,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8" }, @@ -1165,7 +1270,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries" }, @@ -1175,7 +1281,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "6.0 5.5 3.8 4.1 7.0 4.1" }, @@ -1185,7 +1292,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.1 3.7 5.2 5.4 3.8 4.9" }, @@ -1195,7 +1303,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.4 0.7 4.3 3.2 4.0 4.9" }, @@ -1205,7 +1314,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.5 1.8 4.7 3.5 4.1 5.6" }, @@ -1215,7 +1325,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "0.3 0.0 –0.2 –0.4 0.4 0.0" }, @@ -1225,7 +1336,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–0.1 –0.3 –0.2 0.2 0.0 0.1" }, @@ -1235,7 +1347,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "1.7 1.8 3.7 . . . 2.5 . . ." }, @@ -1245,7 +1358,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.5 1.2 5.7 . . . 
5.0 . . ." }, @@ -1255,7 +1369,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.5 2.0 4.0 . . . 4.1 . . ." }, @@ -1265,7 +1380,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "World Trade Volume (goods and services) 6/ Advanced Economies Emerging Market and Developing Economies" }, @@ -1275,7 +1391,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "10.4 9.4 12.1" }, @@ -1285,7 +1402,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "5.4 6.6 3.4" }, @@ -1295,7 +1413,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "2.4 2.3 2.6" }, @@ -1305,7 +1424,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.4 2.7 4.6" }, @@ -1315,7 +1435,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–0.1 0.0 –0.3" }, @@ -1325,7 +1446,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–0.3 –0.4 0.0" }, @@ -1335,7 +1457,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": ". . . . . . . . ." }, @@ -1345,7 +1468,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": ". . . . . . . . ." }, @@ -1355,7 +1479,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": ". . . . . . . . ." 
}, @@ -1365,7 +1490,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Commodity Prices Oil 7/ Nonfuel (average based on world commodity import weights)" }, @@ -1375,7 +1501,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "65.8 26.4" }, @@ -1385,7 +1512,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "39.8 7.0" }, @@ -1395,7 +1523,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–16.2 –6.3" }, @@ -1405,7 +1534,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–7.1 –0.4" }, @@ -1415,7 +1545,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–3.3 –0.1" }, @@ -1425,7 +1556,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–0.9 0.3" }, @@ -1435,7 +1567,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "11.2 –2.0" }, @@ -1445,7 +1578,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–9.8 1.4" }, @@ -1455,7 +1589,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "–5.9 –0.2" }, @@ -1465,7 +1600,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "World Consumer Prices 8/ Advanced Economies 9/ Emerging Market and Developing Economies 8/" }, @@ -1475,7 +1611,8 @@ 
"metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "4.7 3.1 5.9" }, @@ -1485,7 +1622,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "8.8 7.3 9.9" }, @@ -1495,7 +1633,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "6.6 4.6 8.1" }, @@ -1505,7 +1644,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "4.3 2.6 5.5" }, @@ -1515,7 +1655,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "0.1 0.2 0.0" }, @@ -1525,7 +1666,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "0.2 0.2 0.2" }, @@ -1535,7 +1677,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "9.2 7.8 10.4" }, @@ -1545,7 +1688,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "5.0 3.1 6.6" }, @@ -1555,7 +1699,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "3.5 2.3 4.5" }, @@ -1565,7 +1710,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. 
Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024." }, @@ -1575,7 +1721,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Upside risks—Plausible upside risks include more favorable surprises to domestic spending—as in the third quarter of 2022—which, however, would increase inflation further. 
At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:" }, @@ -1585,49 +1732,54 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Pent-up demand boost: Fueled by the stock of excess private savings from the pandemic fiscal support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption—particularly of services, including tourism." }, { - "type": "NarrativeText", - "element_id": "d379a79a55cecddeed62b21eb6a0ff00", + "type": "ListItem", + "element_id": "cf20f95904c591b6ac4ccd5d43fa8a98", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7, + "links": [] }, - "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China." + "text": "Pent-up demand boost: Fueled by the stock of excess private savings from the pandemic fiscal support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption—particularly of services, including tourism." 
}, { "type": "ListItem", - "element_id": "2bbe57e6c291db638d3fcddca9e0199a", + "element_id": "90a90e12a4c6b8b74d3c8d20a76f22dc", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, - "text": "Faster disinflation: An easing in labor market pressures in some advanced economies due to falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening." + "text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." }, { - "type": "NarrativeText", - "element_id": "a2f806b25a06969405637298b4c85139", + "type": "ListItem", + "element_id": "90a90e12a4c6b8b74d3c8d20a76f22dc", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, - "text": "Downside risks—Numerous downside risks continue to weigh on the global outlook, lowering growth while, in a number of cases, adding further to inflation:" + "text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. 
Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." }, { "type": "ListItem", - "element_id": "90a90e12a4c6b8b74d3c8d20a76f22dc", + "element_id": "42ac57e394bf7c98d908745cefce0b80", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, - "text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." + "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase." 
}, { "type": "ListItem", @@ -1635,7 +1787,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase." }, @@ -1645,7 +1798,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. 
Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." }, @@ -1655,7 +1809,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "" }, @@ -1665,7 +1820,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Sudden financial market repricing: A premature easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy." }, @@ -1675,7 +1831,19 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] + }, + "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at  pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute." 
+ }, + { + "type": "ListItem", + "element_id": "75bd22ee0ba778cc3a616ed0a9b42292", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 8, + "links": [] }, "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at  pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute." }, @@ -1685,7 +1853,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Fragmentation could intensify—with more restrictions on cross-border movements of capital, workers, and international payments—and could hamper multilateral cooperation on providing global public goods.1 The costs of such fragmentation are especially high in the short term, as replacing disrupted cross-border flows takes time." }, @@ -1695,7 +1864,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Securing global disinflation: For most economies, the priority remains achieving a sustained reduction in inflation toward target levels. Raising real policy rates and keeping them above their neutral levels until underlying inflation is clearly declining would ward off risks of inflation expectations de- anchoring. Clear central bank communication and appropriate reactions to shifts in the data will help keep inflation expectations anchored and lessen wage and price pressures. Central banks’ balance sheets will need to be unwound carefully, amid market liquidity risks. Gradual and steady fiscal tightening would contribute to cooling demand and limit the burden on monetary policy in the fight against inflation. In countries where output remains below potential and inflation is in check, maintaining monetary and fiscal accommodation may be appropriate." 
}, @@ -1705,7 +1875,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Containing the reemergence of COVID-19: Addressing the ongoing pandemic requires coordinated efforts to boost vaccination and medicine access in countries where coverage remains low as well as the deployment of pandemic preparedness measures—including a global push toward sequencing and sharing data. In China, focusing vaccination efforts on vulnerable groups and maintaining sufficiently high coverage of boosters and antiviral medicines would minimize the risks of severe health outcomes and safeguard the recovery, with favorable cross-border spillovers." }, @@ -1715,7 +1886,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Ensuring financial stability: Depending on country circumstances, macroprudential tools can be used to tackle pockets of elevated financial sector vulnerabilities. Monitoring housing sector developments and conducting stress tests in economies where house prices have increased significantly over the past few years are warranted. In China, central government action to resolve the property crisis and reduce the risk of spillovers to financial stability and growth is a priority, including by strengthening temporary mechanisms to protect presale homebuyers from the risk of non-delivery and by restructuring troubled developers. Globally, financial sector regulations introduced after the global financial crisis have contributed to the resilience of banking sectors throughout the pandemic, but there is a need to address data and supervisory gaps in the less-regulated nonbank financial sector, where risks may have built up inconspicuously. Recent turmoil in the crypto space also highlights the urgent need to introduce common standards and reinforce oversight of crypto assets." 
}, @@ -1725,7 +1897,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Restoring debt sustainability: Lower growth and higher borrowing costs have raised public debt ratios in several economies. Where debt is unsustainable, implementing restructuring or reprofiling early on as part of a package of reforms (including fiscal consolidation and growth-enhancing supply-side reforms) can avert the need for more disruptive adjustment later." }, @@ -1735,7 +1908,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Supporting the vulnerable: The surge in global energy and food prices triggered a cost-of-living crisis. Governments acted swiftly with support to households and firms, which helped cushion effects on growth and at times limited the pass-through from energy prices to headline inflation through price" }, @@ -1745,69 +1919,76 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "1 See “Geo-Economic Fragmentation and the Future of Multilateralism,” IMF Staff Discussion Note 2023/001." }, { - "type": "NarrativeText", - "element_id": "1344e770221822b381fb428d9390a446", + "type": "ListItem", + "element_id": "bd7674df887463bc9f05c8030a151dea", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, - "text": "controls. The temporary and broad-based measures are becoming increasingly costly and should be withdrawn and replaced by targeted approaches. Preserving the energy price signal will encourage a reduction in energy consumption and limit the risks of shortages. 
Targeting can be achieved through social safety nets such as cash transfers to eligible households based on income or demographics or by transfers through electricity companies based on past energy consumption. Subsidies should be temporary and offset by revenue-generating measures, including one-time solidarity taxes on high- income households and companies, where appropriate." + "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." }, { - "type": "NarrativeText", - "element_id": "5f63f2b3388c5c9f2ab22f4136d4196d", + "type": "ListItem", + "element_id": "bd7674df887463bc9f05c8030a151dea", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, - "text": "Reinforcing supply: Supply-side policies could address the key structural factors impeding growth— including market power, rent seeking, rigid regulation and planning, and inefficient education—and could help build resilience, reduce bottlenecks, and alleviate price pressures. A concerted push for investment along the supply chain of green energy technologies would bolster energy security and help advance progress on the green transition." 
+ "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." }, { - "type": "NarrativeText", - "element_id": "c64f29a38dae74989484539db014364f", + "type": "ListItem", + "element_id": "af6eef18ec41f4980c1a4cbb5b7d4fec", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, - "text": "Strengthening multilateral cooperation—Urgent action is needed to limit the risks stemming from geopolitical fragmentation and to ensure cooperation on fundamental areas of common interest:" + "text": "Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." 
}, { "type": "ListItem", - "element_id": "bd7674df887463bc9f05c8030a151dea", + "element_id": "af6eef18ec41f4980c1a4cbb5b7d4fec", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, - "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." + "text": "Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." }, { "type": "ListItem", - "element_id": "af6eef18ec41f4980c1a4cbb5b7d4fec", + "element_id": "d6f6afcf055ed3084a0fac1093458c88", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, - "text": "Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. 
This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." + "text": "Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks." }, { "type": "ListItem", - "element_id": "d6f6afcf055ed3084a0fac1093458c88", + "element_id": "089c5759e7030e34a3b537d9e20bcd13", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, - "text": "Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks." + "text": "Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." }, { "type": "ListItem", @@ -1815,7 +1996,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly implement credible mitigation policies. 
International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." }, @@ -1825,7 +2007,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies." }, @@ -1835,7 +2018,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. 
Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." }, @@ -1845,7 +2029,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)" }, @@ -1855,7 +2040,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "7" }, @@ -1865,7 +2051,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "6" }, @@ -1875,7 +2062,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "5" }, @@ -1885,7 +2073,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "4" }, @@ -1895,7 +2084,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "United States Euro area China Other AEs Other EMs" }, @@ -1905,7 +2095,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "October 2022 GFSR" }, @@ -1915,7 +2106,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "3" }, @@ -1925,7 +2117,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + 
"page_number": 11, + "links": [] }, "text": "2" }, @@ -1935,7 +2128,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "1" }, @@ -1945,7 +2139,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "0" }, @@ -1955,7 +2150,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "–1" }, @@ -1965,7 +2161,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "–2" }, @@ -1975,7 +2172,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "–3" }, @@ -1985,7 +2183,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "2006 08 08" }, @@ -1995,7 +2194,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "06" }, @@ -2005,7 +2205,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "10 10" }, @@ -2015,7 +2216,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "12 12" }, @@ -2025,7 +2227,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "14 16 14" }, @@ -2035,7 +2238,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "16" }, @@ -2045,7 +2249,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "18 18" }, @@ 
-2055,7 +2260,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "20 22 22" }, @@ -2065,7 +2271,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "20" }, @@ -2075,7 +2282,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report." }, @@ -2085,7 +2293,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" }, @@ -2095,7 +2304,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Latest" }, @@ -2105,7 +2315,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "October 2022 GFSR" }, @@ -2115,7 +2326,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "6" }, @@ -2125,7 +2337,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "1. United States" }, @@ -2135,7 +2348,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "2. 
Euro area" }, @@ -2145,7 +2359,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "5" }, @@ -2155,7 +2370,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "4" }, @@ -2165,7 +2381,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "3" }, @@ -2175,7 +2392,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "2" }, @@ -2185,7 +2403,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "1" }, @@ -2195,7 +2414,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Oct. 22" }, @@ -2205,7 +2425,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Apr. 23" }, @@ -2215,7 +2436,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Oct. 23" }, @@ -2225,7 +2447,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Dec. 24" }, @@ -2235,7 +2458,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Dec. 26" }, @@ -2245,7 +2469,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Oct. 22" }, @@ -2255,7 +2480,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Apr. 
23" }, @@ -2265,7 +2491,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Oct. 23" }, @@ -2275,7 +2502,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Dec. 24" }, @@ -2285,7 +2513,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Dec. 26" }, @@ -2295,7 +2524,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "5" }, @@ -2305,7 +2535,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "4" }, @@ -2315,7 +2546,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "3" }, @@ -2325,7 +2557,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "2" }, @@ -2335,7 +2568,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "1" }, @@ -2345,7 +2579,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess their outlook for monetary policy given central bank communications. 
Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." }, @@ -2355,7 +2590,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report." }, @@ -2365,7 +2601,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "Financial market volatility is expected to remain elevated and could be exacerbated by poor market liquidity. For some asset classes (such as US Treasuries), liquidity has deteriorated to the March 2020 lows of the COVID-19 pandemic. With the process of central bank balance sheet reduction (quantitative tightening) underway, market liquidity is expected to remain challenging." 
}, @@ -2375,7 +2612,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 11 + "page_number": 11, + "links": [] }, "text": "WEO Update © 2023 • ISBN: 979-8-40023-224-4" } diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json index 2f6c16233d..18c9c5ac9e 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -5,7 +5,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "The Silent Giant" }, @@ -15,7 +16,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "The need for nuclear in a clean energy system" }, @@ -25,7 +27,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Executive Summary" }, @@ -35,7 +38,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "In a world centred on short-term fixes, many of the traits that make nuclear energy a key player in the transition to a sustainable world are not properly valued and often taken for granted. Reflecting on the popular discourse in the world of energy politics it would seem that renewables, and renewables alone, will be responsible for, and capable of, delivering a zero-carbon energy system – and that it is just a matter of time." 
}, @@ -45,7 +49,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "The reality today is that both global carbon dioxide emissions and fossil fuel use are still on the rise. This does not only make the battle against climate change much harder, but also results in hundreds of thousands of pollution deaths every year." }, @@ -55,7 +60,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Energy is the essential agent for promoting human development, and global demand is projected to increase significantly in the coming decades. Securing access to modern and affordable energy is essential for lifting people out of poverty, and for promoting energy independence and economic growth." }, @@ -65,7 +71,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Nuclear energy is a proven solution with a long and well-established track record. Nuclear reactors – a grand total of 445 in 30 countries – are the low-carbon backbone of electricity systems, operating in the background, day in and day out, often out of sight and out of mind. Capable of generating immense amounts of clean power, they are the silent giants upon which we rely daily." }, @@ -75,7 +82,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Nuclear energy has shown – be it in France or Sweden – that it has the potential to be the catalyst for delivering sustainable energy transitions, long before climate change was on the agenda. The use of nuclear energy is the fast track to a high-powered and clean energy system, which not only delivers a healthier environment and an affordable supply of electricity, but also strengthens energy security and helps mitigate climate change." 
}, @@ -85,7 +93,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "The global nuclear industry, led by World Nuclear Association, is ready to take on the challenge. As part of the Harmony Programme, we have set a target to build an additional 1000GWe of reactors across the world before 2050, bringing the global share of electricity production of nuclear to 25%." }, @@ -95,7 +104,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "In order to realise the full potential of nuclear energy we have identified three key areas where actions are required:" }, @@ -105,7 +115,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "The need to create a level playing field that values reliability and energy security" }, @@ -115,7 +126,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "The need for harmony in the nuclear regulatory environment" }, @@ -125,7 +137,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "The need for a holistic safety paradigm for the whole electricity system." }, @@ -135,7 +148,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "1" }, @@ -145,7 +159,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "The drivers for a clean energy system" }, @@ -155,7 +170,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Electricity is central to modern life – it powers our daily lives, as well as our dreams and ambitions. 
Demand has grown steadily for more than 100 years, and will continue to do so as many parts of the world continue to develop, and electrification takes a central role in efforts to decarbonize (Figure 1). With nearly a billion people around the world still living in the dark, without access to electricity, humanity has a responsibility to learn from the past - everyone has the right to enjoy a modern lifestyle in a way that does not cause harm to people or the planet." }, @@ -165,7 +181,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "45,000" }, @@ -175,7 +192,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Marine" }, @@ -185,7 +203,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "40,000" }, @@ -195,7 +214,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " CSP" }, @@ -205,7 +225,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "35,000" }, @@ -215,7 +236,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Solar PV" }, @@ -225,7 +247,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Geothermal" }, @@ -235,7 +258,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "30,000" }, @@ -245,7 +269,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Wind" }, @@ -255,7 +280,8 @@ "metadata": { "data_source": {}, "filetype": 
"application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "h W T" }, @@ -265,7 +291,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "25,000" }, @@ -275,7 +302,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Bioenergy" }, @@ -285,7 +313,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "20,000" }, @@ -295,7 +324,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Hydro" }, @@ -305,7 +335,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Nuclear" }, @@ -315,7 +346,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "15,000" }, @@ -325,7 +357,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Gas" }, @@ -335,7 +368,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "10,000" }, @@ -345,7 +379,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Oil" }, @@ -355,7 +390,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "5,000" }, @@ -365,7 +401,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " Coal" }, @@ -375,7 +412,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, 
"text": "0" }, @@ -385,7 +423,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2000" }, @@ -395,7 +434,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2010" }, @@ -405,7 +445,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2020" }, @@ -415,7 +456,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2030" }, @@ -425,7 +467,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2040" }, @@ -435,7 +478,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Figure 1. IEA projected electricity production and sources to 2040 i" }, @@ -445,7 +489,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "The challenge before us, however, goes far beyond just electricity – we will need to find ways to decarbonize all parts of the economy, and we need solutions that are sustainable in the long-term. That means changing the way we heat our homes and power our industrial processes, as well as ensuring that the way we travel, export our products and ship our food moves away from fossil fuels." }, @@ -455,7 +500,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Despite the very considerable efforts to decarbonize the economy and the countless billions spent, our world remains heavily addicted to fossil fuels. The trend is clear – instead of reducing our dependence on fossil fuels, we are increasing it (Figure 2). 
As a direct result, greenhouse gas emissions continue to rise when they need to drastically fall." }, @@ -465,7 +511,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "30,000,000" }, @@ -475,7 +522,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": " High-carbon  Low-carbon" }, @@ -485,7 +533,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "25,000,000" }, @@ -495,7 +544,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "20,000,000" }, @@ -505,7 +555,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "h W G" }, @@ -515,7 +566,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "15,000,000" }, @@ -525,7 +577,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "10,000,000" }, @@ -535,7 +588,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "5,000,000" }, @@ -545,7 +599,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "0" }, @@ -555,7 +610,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "1990" }, @@ -565,7 +621,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "1995" }, @@ -575,7 +632,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + 
"page_number": 4, + "links": [] }, "text": "2000" }, @@ -585,7 +643,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2005" }, @@ -595,7 +654,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2010" }, @@ -605,7 +665,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2015" }, @@ -615,7 +676,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Figure 2. Worldwide electricity generation by fuel (1990-2016)ii" }, @@ -625,7 +687,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2" }, @@ -635,7 +698,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "We need to deliver a worldwide transformation that is socially, economically and environmentally sustainable. We need a system that is affordable – no one should have to choose between heating their home, and essentials like eating – as well as helping to alleviate poverty, and ensure the realization of human potential globally. We need a power source that can not only help us mitigate the effects of climate change and environmental degradation, but can also help bring the enormous benefits of reliable electricity supply to the corners of the world that do not have access to it." }, @@ -645,7 +709,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Nuclear energy is already making a major contribution. By using nuclear energy rather than fossil fuels, we currently avoid the emission of more than 2500 million tonnes of carbon dioxide every year. 
To put that into perspective, it is the equivalent of removing about 400 million cars from the world’s roads." }, @@ -655,7 +720,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Modern society is dependent on the steady supply of electricity, every day of the year – regardless of weather, season or time of day – and nuclear energy is particularly well-suited to providing this service. Given that the majority of baseload supply is fossil-based, an increase in the use of nuclear energy would result in a rapid decarbonization of the electricity system. The International Energy Agency’s (IEA) recent report III on nuclear energy highlighted the importance of dependable baseload electricity generators and the need to properly value and compensate them for the electricity security and reliability services they provide." }, @@ -665,7 +731,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "3" }, @@ -675,7 +742,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Despite impressive recent growth, the stark reality is that renewables alone will not be able to resolve our dependence on fossil fuels. Clearly, the sun does not always shine, and the wind does not always blow, and this is compounded by the fact that many times these periods coincide with when electricity demand is at its highest, but renewables can be complementary to nuclear energy. Storage solutions, such as batteries, will not be able to power our societies for days or weeks when the weather is not favourable. Natural gas is currently the most used solution for the intermittency problem, which only serves to reinforce our economy’s dependence of fossil fuels, and severely undermines the apparently ‘green credentials’ of many renewables." 
}, @@ -685,7 +753,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Moving to a sustainable future" }, @@ -695,7 +764,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "The Intergovernmental Panel on Climate Change (IPCC) special report on Global Warming of 1.5°C iv examined a large number of different scenarios for limiting global warming to 1.5°C. Of those scenarios which would achieve the 1.5°C target, the mean increase in nuclear energy’s contribution to electricity production was 2.5 times higher compared to today. However, the ‘middle-of-the-road’ scenario – in which social, economic, and technological trends follow current patterns and would not require major changes to, for example, diet and travel habits – sees the need for nuclear increase by five times globally by 2050." }, @@ -705,7 +775,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "The IEA has concluded that without an expanded contribution from nuclear energy, the already huge challenge of achieving emissions reductions will become drastically harder and more costly. In their latest report on nuclear energy v, published in 2019, they also conclude that not using nuclear would have negative implications for energy security and result in higher costs for the consumers. The IEA recommends policy reforms to ‘… ensure competition on a level playing field’ and that the ‘… focus should be on designing electricity markets in a way that values the clean energy and energy security attributes of low-carbon technologies, including nuclear power.’ Such reforms should also ensure that reliability of electricity production is properly valued and compensated." 
}, @@ -715,7 +786,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "As part of the Harmony Programme, the world’s nuclear industry has identified three key policy areas for action to unlock the true potential of nuclear energy - the need for a level playing field, the harmonization of regulations and the establishment of an effective safety paradigm." }, @@ -725,7 +797,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "In regard to the need for a level playing field, we see that many of the world’s electricity markets operate in an unsustainable fashion, dominated by short-term thinking. Electricity supply which is affordable, reliable and available 24/7 generates broad societal benefits, and as seen in Figure 3, nuclear is one of the most affordable electricity sources." }, @@ -735,7 +808,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "4" }, @@ -745,7 +819,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "300" }, @@ -755,7 +830,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "250" }, @@ -765,7 +841,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "200" }, @@ -775,7 +852,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "h W M / $" }, @@ -785,7 +863,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "150" }, @@ -795,7 +874,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 
7 + "page_number": 7, + "links": [] }, "text": "100" }, @@ -805,7 +885,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "50" }, @@ -815,7 +896,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "0" }, @@ -825,7 +907,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "C o m" }, @@ -835,7 +918,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "m ercial Photovoltaic" }, @@ -845,7 +929,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "O nshore Wind" }, @@ -855,7 +940,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Offshore Wind" }, @@ -865,7 +951,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "N uclear" }, @@ -875,7 +962,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "C C G T" }, @@ -885,7 +973,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "C oal" }, @@ -895,7 +984,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Figure 3. 
Comparative cost projections for main electricity generators vi" }, @@ -905,7 +995,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "However, markets fail to give due credit to electricity generators, such as nuclear energy, that are able to meet these societal demands. This has resulted in situations where nuclear energy has struggled to compete with energy sources that have been subsidized, do not pay the hidden costs brought on by their intermittency (e.g. costly backup provisions and investments in the grid), or do not have to take responsibility for using our common atmosphere as a dumping ground." }, @@ -915,7 +1006,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Additionally, electricity markets fail to recognize the relative costs of different forms of electricity generation. Whilst the nuclear industry takes responsibility for its lifecycle costs (including decommissioning and waste management), other electricity generators do not. Fossil fuel generators are rarely required to pay the price in line with the environmental and health damage that their emissions cause, whilst the cost of wind and solar does not include the disposal of the sometimes toxic materials at the end of their life." }, @@ -925,7 +1017,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "In regard to the need to harmonize regulations, multiple regulatory barriers stemming from diverse national licensing processes and safety requirements currently limit global nuclear trade and investment. A lack of international standardization places unnecessary regulatory burdens on nuclear activities and causes delays in the licensing of new designs, hindering innovation." 
}, @@ -935,7 +1028,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "The International Atomic Energy Agency (IAEA) has highlighted the importance of addressing this issue, concluding that the lack of regulatory harmony ‘…causes many drawbacks for the entire nuclear industry, including developers, vendors, operators and even regulators themselves…This results in increased costs and reduced predictability in project execution’. vii It is therefore crucial that we harmonize the regulatory process to address these weaknesses, and avoid unnecessary duplication and inconsistencies." }, @@ -945,7 +1039,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "5" }, @@ -955,7 +1050,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "In regard to the need for a holistic safety paradigm for the whole electricity system, we need to consider safety from a societal perspective, something the current energy system fails to do. The health, environmental and safety benefits of nuclear energy are not sufficiently understood and valued when compared with other electricity sources. Nuclear energy remains the safest form of electricity generation (Figure 4). Additionally, the use of nuclear consistently prevents many tens of thousands of deaths (mainly resulting from air pollution) every year by avoiding the use of coal - lifesaving measures which must be better recognised and valued." 
}, @@ -965,7 +1061,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "140" }, @@ -975,7 +1072,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "r a e y" }, @@ -985,7 +1083,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "W T" }, @@ -995,7 +1094,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "e" }, @@ -1005,7 +1105,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "r e p s e i t i l" }, @@ -1015,7 +1116,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "a t a F" }, @@ -1025,7 +1127,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "120" }, @@ -1035,7 +1138,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "100" }, @@ -1045,7 +1149,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "80" }, @@ -1055,7 +1160,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "60" }, @@ -1065,7 +1171,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "40" }, @@ -1075,7 +1182,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "20" }, @@ -1085,7 +1193,8 @@ "metadata": { "data_source": {}, "filetype": 
"application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "0" }, @@ -1095,7 +1204,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "120" }, @@ -1105,7 +1215,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "C oal" }, @@ -1115,7 +1226,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "99.5" }, @@ -1125,7 +1237,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Oil" }, @@ -1135,7 +1248,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "71.9" }, @@ -1145,7 +1259,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "N atural gas" }, @@ -1155,7 +1270,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "8.5" }, @@ -1165,7 +1281,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "1.78" }, @@ -1175,7 +1292,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Offshore wind" }, @@ -1185,7 +1303,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "(U K)" }, @@ -1195,7 +1314,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "O nshore wind (G erm any)" }, @@ -1205,7 +1325,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + 
"page_number": 8, + "links": [] }, "text": "0.245" }, @@ -1215,7 +1336,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "S olar P V" }, @@ -1225,7 +1347,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "<0.01" }, @@ -1235,7 +1358,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "N uclear*" }, @@ -1245,7 +1369,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Figure 4. Comparison of number of fatalities due to electricity generation viii" }, @@ -1255,7 +1380,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Nuclear for a sustainable tomorrow" }, @@ -1265,7 +1391,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Nuclear energy is already making a significant contribution to providing the world with clean and abundant electricity, and has a proven track record of being a reliable workhorse around the world. Countries like France, Sweden and Switzerland have proven that it is possible to divorce economic growth from an increase in damaging emissions and over the timescales required to effectively challenge climate change and environmental degradation (Figures 5 and 6). Nuclear can ensure that fast-growing populations achieve rising standards of living – without having to sacrifice the planet or their own well-being." 
}, @@ -1275,7 +1402,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "100" }, @@ -1285,7 +1413,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "90" }, @@ -1295,7 +1424,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": " Coal" }, @@ -1305,7 +1435,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": " Gas/Oil" }, @@ -1315,7 +1446,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "80" }, @@ -1325,7 +1457,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": " Biofuels/Waste" }, @@ -1335,7 +1468,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "70" }, @@ -1345,7 +1479,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": " Wind/Solar" }, @@ -1355,7 +1490,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "60" }, @@ -1365,7 +1501,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": " Hydro" }, @@ -1375,7 +1512,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": " Nuclear" }, @@ -1385,7 +1523,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "%" }, @@ -1395,7 +1534,8 @@ "metadata": { "data_source": 
{}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "50" }, @@ -1405,7 +1545,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "40" }, @@ -1415,7 +1556,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "30" }, @@ -1425,7 +1567,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "20" }, @@ -1435,7 +1578,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "10" }, @@ -1445,7 +1589,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "0" }, @@ -1455,7 +1600,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "France" }, @@ -1465,7 +1611,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Sweden" }, @@ -1475,7 +1622,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Switzerland" }, @@ -1485,7 +1633,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Figure 5. 
The importance of nuclear in ensuring clean energy systems in France, Sweden and Switzerland ix" }, @@ -1495,7 +1644,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "6" }, @@ -1505,7 +1655,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "600" }, @@ -1515,7 +1666,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "500" }, @@ -1525,7 +1677,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": " Non-hydro" }, @@ -1535,7 +1688,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "ren. & waste" }, @@ -1545,7 +1699,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "h W T" }, @@ -1555,7 +1710,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "400" }, @@ -1565,7 +1721,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "300" }, @@ -1575,7 +1732,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": " Nuclear" }, @@ -1585,7 +1743,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": " Natural gas" }, @@ -1595,7 +1754,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": " Hydro" }, @@ -1605,7 +1765,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + 
"page_number": 9, + "links": [] }, "text": "200" }, @@ -1615,7 +1776,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": " Oil" }, @@ -1625,7 +1787,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": " Coal" }, @@ -1635,7 +1798,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "100" }, @@ -1645,7 +1809,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "0" }, @@ -1655,7 +1820,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "1974" }, @@ -1665,7 +1831,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "1980 1985 1990 1995 2000 2005 2010" }, @@ -1675,7 +1842,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "2017" }, @@ -1685,7 +1853,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand x" }, @@ -1695,7 +1864,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. 
By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times." }, @@ -1705,7 +1875,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "One fuel pellet contains as much energy as a tonne of coal" }, @@ -1715,7 +1886,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Unlike other power sources, nuclear energy helps us reduce our total footprint, going beyond just the environment. When accounting for factors such as cost (e.g. fuel and construction costs), carbon (lifecycle greenhouse gas emissions), water and land footprints, nuclear is far ahead of all other energy generators." }, @@ -1725,7 +1897,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Nuclear energy offers a multitude of services beyond just electricity. With nuclear, we can decarbonize the way we heat our homes, provide process heat for industry, and ensure access to clean water. As freshwater supplies come under increasing pressure worldwide, nuclear reactors can provide desalination, ensuring a reliable flow of fresh drinking water in areas where it is scarce." }, @@ -1735,7 +1908,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "7" }, @@ -1745,7 +1919,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Nuclear energy can be relied upon to power the new mobility revolution taking place. Every day, we use almost 20 million barrels of oil to power our vehicles. 
By swapping to an electric or hydrogen-powered transport fleet – all powered by the atom – we are able to address one of the key challenges to a sustainable economy." }, @@ -1755,7 +1930,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "We cannot afford to wait – we need every part of the puzzle to contribute towards solving some of the greatest challenges faced by humankind in a very long time. The impacts of climate change will hit the poorest and most vulnerable first, and failing to act will have significant humanitarian consequences." }, @@ -1765,7 +1941,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Nuclear power is the silent giant of today’s energy system – it runs quietly in the background, capable of delivering immense amounts of power, regardless of weather or season, allowing us to focus on everything else in life. It is a technology that is available now, and can be expanded quickly across the world to help us solve some of the most defining challenges we face. Nuclear energy holds the potential to herald a new, cleaner and truly sustainable world – enabling us to pass on a cleaner planet to our children." 
}, @@ -1775,7 +1952,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "References" }, @@ -1785,7 +1963,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "i" }, @@ -1795,7 +1974,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "ii" }, @@ -1805,7 +1985,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "iii" }, @@ -1815,7 +1996,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "iv" }, @@ -1825,7 +2007,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "v" }, @@ -1835,7 +2018,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "vi" }, @@ -1845,7 +2029,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "vii" }, @@ -1855,7 +2040,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "International Energy Agency (2018), World Energy Outlook 2018. Data accessed from https://www.iea.org/weo/ – Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the results likely to stem from the implementation of announced policy intentions – with visual modification by World Nuclear Association. International Energy Agency (n.d.), Statistics. 
Accessed from: https://www.iea.org/statistics/?country=WORLD&year=2016&category=Electricity&indicator=ElecGenByFuel&mode =chart&dataTable=ELECTRICITYANDHEAT – with visual modifications by World Nuclear Association. International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 °C. Accessed from: https://www.ipcc.ch/sr15/ International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ International Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs of generating Electricity – 2015 Edition. Accessed from: https://www.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf International Atomic Energy Agency (2015), Technical challenges in the application and licensing of digital instrumentation and control systems in nuclear power plants. Accessed from: https://www-pub.iaea.org/MTCD/ Publications/PDF/P1695_web.pdf" }, @@ -1865,7 +2051,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "viii Paul-Scherrer Institute. Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)" }, @@ -1875,7 +2062,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "ix" }, @@ -1885,7 +2073,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "x" }, @@ -1895,7 +2084,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "and NRC SOARCA study 2015 International Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview Ibid." 
}, @@ -1905,7 +2095,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Photo credits: Front cover: Mike Baird; page 2: Vattenfall; page 4: Getty Images; page 5: Adobe Stock; page 6: Rosatom; page 8: Dean Calma, IAEA; page 10: Kazatomprom; page 11: EDF." }, @@ -1915,7 +2106,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "8" }, @@ -1925,7 +2117,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "World Nuclear Association Tower House 10 Southampton Street London WC2E 7HA United Kingdom" }, @@ -1935,7 +2128,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org" }, @@ -1945,7 +2139,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate." }, @@ -1955,7 +2150,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "The Silent Giant © 2019 World Nuclear Association. 
Registered in England and Wales, company number 01215741" } diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json index 6879697f80..2eb819dbb6 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -5,7 +5,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Recalibrating risk" }, @@ -15,7 +16,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 1, + "links": [] }, "text": "Putting nuclear risk in context and perspective" }, @@ -25,7 +27,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "© 2021 World Nuclear Association" }, @@ -35,7 +38,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 2, + "links": [] }, "text": "Registered in England and Wales, company number 01215741. This report represents the views of individual experts, but does not necessarily represent those of any of the World Nuclear Association’s individual member organizations." 
}, @@ -45,7 +49,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Executive Summary" }, @@ -55,7 +60,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Nuclear energy is crucial to meeting the world’s ever-increasing demand for energy, thanks to its ability to supply affordable, reliable, and sustainable electricity and heat. Despite the many benefits of nuclear energy, its deployment is hindered in some parts of the world due to long-standing misconceptions about its risks. Even with its safety record – unmatched by any other energy source – the perception of nuclear power as uniquely dangerous endures." }, @@ -65,7 +71,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "This is reflected in the regulatory burden placed on the nuclear industry, which is geared towards an “as low as possible” approach, demanding radiation levels to be far below the levels where health effects have been observed (and in many cases below natural background radiation). This has resulted in higher costs, without delivering any additional health benefits, and has resulted in policymakers choosing other, more risky energy sources. More often than not, those alternative energy sources have been fossil fuels, greatly exacerbating the well-known risks posed by air pollution and climate change." }, @@ -75,7 +82,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Expanding the use of nuclear energy is essential for solving some of the biggest challenges facing humanity. 
Nuclear power has already played a major role in avoiding the emission of air pollutants and greenhouse gases, a role that will have to be greatly expanded in the future to ensure global energy supplies are decarbonized by 2050. Nuclear energy will also play a major part in ensuring that the transition to a low-carbon future is done in an equitable fashion, providing people across the world with a high-powered and sustainable future." }, @@ -85,7 +93,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "In order to fully unlock the potential of the atom, it is crucial that the gap between perceived and actual risks is addressed. The window of opportunity to act on climate change and other global challenges is closing fast – we must not delay increasing the contribution of nuclear energy on the grounds of myths and misconceptions." }, @@ -95,7 +104,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." 
}, @@ -105,7 +115,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 3, + "links": [] }, "text": "1" }, @@ -115,7 +126,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Perceived versus actual risk" }, @@ -125,7 +137,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "It is widely accepted that humans have skewed perceptions of risks, and the way we respond to them is shaped by these perceptions, rather than the actual threats posed. Approximately 1.35 millioni people die every year because of traffic accidents, in comparison with 257 aviation fatalities in 2019ii, yet more people are nervous about flying, fearing a rare deadly crash, than being in a fatal traffic accident. These numbers tell a powerful and well-established story: evaluations of risk are largely the result of emotions, rather than logic or facts. Although it is hard to recognize and accept that our perceptions may mislead us and curtail effective decision making, this is a well-established characteristic of humanity." }, @@ -135,7 +148,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Nuclear energy and the risk of radiation is one of the most extreme cases in which perceived and actual risks have diverged. The fear of radiation, whilst pre- dating the Second World War, was firmly established by the debate on the potential impacts of low-dose radiation from the fallout from nuclear weapons testing in the early years of the Cold War. Radiation in many ways became linked with the mental imagery of nuclear war, playing an important role in increasing public concern about radiation and its health effects. 
There is a well-established discrepancy between fact-based risk assessments and public perception of different risks. This is very much the case with nuclear power, and this is clearly highlighted in Figure 1, with laypersons ranking nuclear power as the highest risk out of 30 activities and technologies, with experts ranking nuclear as 20th. In many ways, popular culture’s depiction of radiation has played a role in ensuring that this discrepancy has remained, be it Godzilla, The Incredible Hulk, or The Simpsons, which regularly plays on the notion of radiation from nuclear power plants causing three-eyed fish, something that has been firmly rejected as unscientific." }, @@ -145,7 +159,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Figure 1. Ordering of perceived risks for 30 activities and technologies1,iii" }, @@ -155,7 +170,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Rank Order Laypersons" }, @@ -165,7 +181,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "17" }, @@ -175,7 +192,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "22" }, @@ -185,7 +203,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "30" }, @@ -195,7 +214,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "" }, @@ -205,7 +225,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "4" }, @@ -215,7 +236,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": 
[] }, "text": "" }, @@ -225,7 +247,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "" }, @@ -235,7 +258,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2" }, @@ -245,7 +269,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "1" }, @@ -255,7 +280,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "3" }, @@ -265,7 +291,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Electric power (non-nuclear)" }, @@ -275,7 +302,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Motor vehicles" }, @@ -285,7 +313,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Smoking" }, @@ -295,7 +324,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "X-rays" }, @@ -305,7 +335,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Vaccinations" }, @@ -315,7 +346,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Nuclear power" }, @@ -325,7 +357,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Handguns" }, @@ -335,7 +368,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "Experts" }, @@ -345,7 +379,8 @@ "metadata": 
{ "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "25" }, @@ -355,7 +390,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "20" }, @@ -365,7 +401,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "" }, @@ -375,7 +412,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "7" }, @@ -385,7 +423,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2" }, @@ -395,7 +434,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "" }, @@ -405,7 +445,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "4" }, @@ -415,7 +456,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "1" }, @@ -425,7 +467,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "9" }, @@ -435,7 +478,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "" }, @@ -445,7 +489,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "In reality, radiation is a natural part of life; indeed, we are all exposed to radiation every day, on average receiving 2-3 millisieverts (mSv) per year. Most of this radiation is naturally occurring, with radon gas from the ground being the main source of exposure. 
The nuclear industry is responsible for a very small part of radiation exposure to the public, as seen in Figure 2. To put this into perspective, eating 10 bananas or two Brazil nuts results in the same radiation dose as living nearby a nuclear power plant for a year. Humans are also naturally radioactive, and the radiation dose from sleeping next to someone else each night for a year is ten times higher than the exposure from living nearby a nuclear power plant for the same time span." }, @@ -455,7 +500,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "In fact, scientific consensus is that when it comes to preventing exposure to radiation, nuclear power is much better than other electricity generators. A 2016 reportiii from the United Nations Scientific Committee on the Effects of Atomic Radiation (UNSCEAR) found that coal-generated electricity is responsible for more than half of the total global radiation exposure arising from electricity generation, while nuclear power contributed less than a fifth. Coal miners received high occupational exposure and workers in solar and wind farms received the highest occupational exposure associated with plant construction for the same amount of installed capacity." }, @@ -465,7 +511,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "1 The original study was published in 1978, but its findings have been confirmed by numerous studies since." 
}, @@ -475,7 +522,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 4 + "page_number": 4, + "links": [] }, "text": "2" }, @@ -485,7 +533,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Natural" }, @@ -495,7 +544,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Artificial" }, @@ -505,7 +555,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": " 48% Radon  14% Buildings & soil  12% Food & water  10% Cosmic  4% Thoron" }, @@ -515,7 +566,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": " 11% Medicine  0.4%  0.4% Miscellaneous  0.2% Occupational  0.04% Nuclear discharges" }, @@ -525,7 +577,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Fallout" }, @@ -535,7 +588,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Figure 2. Global average exposure from different sources of radiation" }, @@ -545,7 +599,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Fossil fuels – currently accounting for around 81% of total energy supplyiv – cause significant levels of emissions in terms of both greenhouse gases and air pollutants. Despite the serious and ongoing health and environmental harms caused by air pollution, it is often considered to be an inevitable consequence of economic development. Air pollution’s contribution to the burden of disease is profound, with an estimated 8.7 million people dying worldwide prematurely in 2018 alonev,vi. 
Despite this, it fails to induce the same fears and anxieties in people as nuclear energy does." }, @@ -555,7 +610,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi." }, @@ -565,7 +621,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "25" }, @@ -575,7 +632,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "24.6" }, @@ -585,7 +643,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "20" }, @@ -595,7 +654,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "18.4" }, @@ -605,7 +665,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "r a e y" }, @@ -615,7 +676,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "W T" }, @@ -625,7 +687,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "e" }, @@ -635,7 +698,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "15" }, @@ -645,7 +709,8 @@ 
"metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "r e p s e i t i l" }, @@ -655,7 +720,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "a t a F" }, @@ -665,7 +731,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "10" }, @@ -675,7 +742,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "5" }, @@ -685,7 +753,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "4.6" }, @@ -695,7 +764,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "2.8" }, @@ -705,7 +775,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "0" }, @@ -715,7 +786,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "C oal" }, @@ -725,7 +797,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Oil" }, @@ -735,7 +808,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Bio m ass" }, @@ -745,7 +819,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "N atural gas" }, @@ -755,7 +830,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "0.07" }, @@ -765,7 +841,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + 
"page_number": 5, + "links": [] }, "text": "Wind" }, @@ -775,7 +852,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "0.04" }, @@ -785,7 +863,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "H ydropo w er" }, @@ -795,7 +874,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "0.02" }, @@ -805,7 +885,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "S olar" }, @@ -815,7 +896,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "0.01" }, @@ -825,7 +907,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "N uclear" }, @@ -835,7 +918,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3" }, @@ -845,7 +929,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "Contrary to perceptions, nuclear is an incredibly safe source of energy (see Figure 3 for a comparison). What is also clear is that the continued use of alternative energy sources in preference to nuclear energy – in particular fossil fuels – poses a far greater risk to public health by significantly contributing to climate change and air pollution." 
}, @@ -855,7 +940,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. 3 Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). Data for nuclear accidents modified to reflect the" }, @@ -865,7 +951,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "2012 UNSCEAR report and the 2015 US NRC SOARCA study." }, @@ -875,7 +962,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "links": [] }, "text": "3" }, @@ -885,7 +973,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "The low-dose question" }, @@ -895,7 +984,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Since the 1950s, the Linear No-Threshold (LNT) theory has been used to inform regulatory decisions, positing that any dose of radiation, regardless of the amount or the duration over which it is received, poses a risk. Assuming that LNT is correct, we should expect to see that people living in areas of the world where background doses are higher (e.g. India, Iran and northern Europe) have a higher incidence of cancer. However, despite people living in areas of the world where radiation doses are naturally higher than those that would be received in parts of the evacuation zones around Chernobyl and Fukushima Daiichi, there is no evidence that these populations exhibit any negative health effects. 
Living nearby a nuclear power plant on average exposes the local population to 0.00009mSv/year, which according to LNT would increase the risk of developing cancer by 0.00000045%. After Chernobyl, the average dose to those evacuated was 30mSv, which would theoretically increase the risk of cancer at some point in their lifetime by 0.15% (on top of the average baseline lifetime risk of cancer, which is 39.5% in the USviii, 50% in the UKix)." }, @@ -905,7 +995,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Since the 1980s, there has been considerable scientific debate as to whether the LNT theory is valid, following scientific breakthroughs within, for example, radiobiology and medicine. Indeed, the Chernobyl accident helped illuminate some of the issues associated with LNT. Multiplication of the low doses after the accident (many far too low to be of any health concern) with large populations – using the assumptions made by LNT – led to a large number of predicted cancer deaths, which have not, and likely will not materialize. This practice has been heavily criticized for being inappropriate in making risk assessments by UNSCEAR, the International Commission on Radiation Protection and a large number of independent scientists." }, @@ -915,7 +1006,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "Determining the precise risk (or lack thereof) of the extremely small radiation doses associated with the routine operations of nuclear power plants, the disposal of nuclear waste or even extremely rare nuclear accidents is a purely academic exercise, that tries to determine whether the risk is extremely low, too small to detect, or non- existent. The risks of low-level radiation pale in comparison to other societal risks such as obesity, smoking, and air pollution." 
}, @@ -925,7 +1017,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "By looking at radiation risks in isolation, we prolong the over-regulation of radiation in nuclear plants, driving up costs, whilst not delivering any additional health benefits, in turn incentivising the use of more harmful energy sources. A recalibration is required, and this can only done by ensuring a holistic approach to risk is taken." }, @@ -935,7 +1028,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 6 + "page_number": 6, + "links": [] }, "text": "4" }, @@ -945,7 +1039,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Adopting an all-hazards approach" }, @@ -955,7 +1050,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Contemporary debates around nuclear energy often reflect the precautionary principle, a problematic concept applied across a range of regulatory and policy issues. A ‘strong’ interpretation of the precautionary principle, or a ‘as low as possible’ approach to risk, dictates that regulation is required whenever there is a potential adverse health risk, even if the evidence is not certain and regardless of the cost of regulation." }, @@ -965,7 +1061,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "The overall regulatory philosophy, at least theoretically, used in the nuclear industry is the ALARA (As Low As Reasonably Achievable) principle, where any regulatory action on radiation should account for socio- economic benefits and costs, as opposed to making decisions based on radiation risks alone." 
}, @@ -975,7 +1072,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "However, the regulatory process and the policy debate around nuclear more broadly has long departed from the ALARA principle, no longer weighing cost versus benefits, or considering the overall advantages of nuclear energy, but rather looking at radiation in isolation. This has resulted in a subtle shift towards an ‘as low as possible’ mentality. Attempting to reduce radiation far below de facto safe levels has resulted in an escalation of costs and loss of public confidence, and in some cases has deprived communities of the many benefits nuclear energy provides. In practical terms, this has led to the continued use of more harmful energy sources, such as fossil fuels." }, @@ -985,7 +1083,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "If the potential of nuclear energy is to be fully realized, public health and safety approaches must be recalibrated to consider a wider range of factors when considering radiation, adopting an “all-hazards” approach. Such an approach must ensure that risks are placed within a proper perspective and context, rather than looking at them in isolation. We therefore must not look at the costs – be they economic, environmental, or public health – associated with an individual power plant in isolation, but rather the costs associated with it (and its alternatives) at a societal level (Figure 4). This would entail looking at the potential risks arising from the use of nuclear power and comparing these with the risks associated with not adopting nuclear power." 
}, @@ -995,7 +1094,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Plant-level production costs at market prices" }, @@ -1005,7 +1105,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Grid-level costs of the electricity system" }, @@ -1015,7 +1116,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Social and environmental costs of emissions, land-use, climate change, security of supply, etc." }, @@ -1025,7 +1127,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Figure 4. The different levels of cost associated with electricity generationx" }, @@ -1035,7 +1138,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "A more holistic regulatory process would be required, in which regulators move away from being siloed, looking at specific risks in isolation, with little regard for the greater picture. The move towards an all-hazard, holistic approach would require greater coordination between regulators, ensuring that the combined risks of a specific nuclear project are weighed against the risks posed by not advancing said project." }, @@ -1045,7 +1149,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "Equally, the adoption of an all-hazards approach means regulators should consider declaring when a risk is too low to be a public health concern, in line with what the U.S. Nuclear Regulatory Commission attempted to do with its Below Regulatory Concern policy statements in the 1980s and early 1990s. 
In the context of nuclear power, this means departing from the notion that LNT instils of no safe level of radiation, and adopting a regulatory framework which notes the impossibility of eradicating risks. Failing to do so will result in excessive regulation that continues to limit the full potential of nuclear power in tackling climate change and sees a continued reliance on objectively more harmful energy sources." }, @@ -1055,7 +1160,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 7, + "links": [] }, "text": "5" }, @@ -1065,7 +1171,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Recalibrating the risk conversation" }, @@ -1075,7 +1182,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "By looking at radiation risks in isolation, we have created something akin to a “radiation phobia”, that both directly and indirectly harms people around the world. For instance, it is well established that the vast majority of health impacts from Chernobyl and Fukushima Daiichi were not radiological, but rather psychosocial. There has been an observable and dramatic increase in depression, PTSD, substance abuse, and suicides following these events, which can be significantly attributed to the dissonance between the actual and perceived risks of radiation, and the stigmatization they caused." }, @@ -1085,7 +1193,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Similarly, many of the tremendous challenges the global community faces are significantly driven by this “radiation phobia”. Indeed, several of these issues have been considerably exacerbated by the fact that certain risks are given a disproportionate amount of focus, whereas others are de facto ignored. 
The global conversation around climate change is a prime example of this. The historical use of fossil fuels has contributed significantly to climate change through greenhouse gas emissions, causing unprecedented changes in the liveability of the Earth. By 2025, half of the world’s population will be living in water-stressed areas, as extreme heat and droughts are exacerbating water resources. Between 2030 and 2050, climate change is expected to be the cause of an additional 250,000 deaths per year, arising from malnutrition, malaria, diarrhoea and heat stressx. Yet, despite the huge risks associated with climate change, our addiction to coal, oil, and fossil gas remains, with fossil fuels providing 84% of global primary energy in 2019xii. The continued prioritization of fossil fuels at the expense of nuclear energy results in a considerable increase in the risks posed by climate change." }, @@ -1095,7 +1204,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "Equally, it is well established that living without access to electricity results in illness and death around the world, caused by everything from not having access to modern healthcare to household air pollution. As of today, 770 million people around the world do not have access to electricity, with over 75% of that population living in Sub-Saharan Africa. The world's poorest 4 billion people consume a mere 5% of the energy used in developed economies, and we need to find ways of delivering reliable electricity to the entire human population in a fashion that is sustainable. Household and ambient air pollution causes 8.7 million deaths each year, largely because of the continued use of fossil fuels. Widespread electrification is a key tool for delivering a just energy transition. Investment in nuclear, has become an urgent necessity. 
Discarding it, based on risk perceptions divorced from science, would be to abandon the moral obligation to ensure affordable, reliable, and sustainable energy for every community around the world." }, @@ -1105,7 +1215,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "links": [] }, "text": "6" }, @@ -1115,7 +1226,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Clearly, we have reached a point where we must establish a new conversation about the relative risks of using nuclear, especially when risks created by other energy sources are considered. We cannot address many of the global challenges we face without a significant increase in the use of nuclear energy. The detrimental effects of decades of looking at nuclear risks in isolation highlights just how crucial it is that regulators and policymakers change the way they view nuclear energy, and transition towards an all-hazards approach, ensuring that actions taken to mitigate risks do not result in creating more severe risks." }, @@ -1125,7 +1237,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "We must begin to holistically look at the severity of the consequences of maintaining the current energy production system, many of which are irreversible. The ways in which we address climate change and other issues of global importance must be sustainable and not create new hazards down the line. The reality is that nuclear has always been and remains an exceptionally safe source of energy, representing the lowest risk, the most sustainable, and the most affordable ways to generate around-the-clock electricity." 
}, @@ -1135,7 +1248,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." }, @@ -1145,7 +1259,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 9 + "page_number": 9, + "links": [] }, "text": "7" }, @@ -1155,7 +1270,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "References" }, @@ -1165,7 +1281,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "i" }, @@ -1175,7 +1292,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" }, @@ -1185,7 +1303,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "ii" }, @@ -1195,7 +1314,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. 
Available at: https://www.bbc.co.uk/news/ business-50953712" }, @@ -1205,7 +1325,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "iii" }, @@ -1215,7 +1336,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." }, @@ -1225,7 +1347,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific" }, @@ -1235,7 +1358,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" }, @@ -1245,7 +1369,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "v" }, @@ -1255,7 +1380,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" }, @@ -1265,7 +1391,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "vi" }, @@ -1275,7 +1402,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). 
Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" }, @@ -1285,7 +1413,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a" }, @@ -1295,7 +1424,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" }, @@ -1305,7 +1435,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/" }, @@ -1315,7 +1446,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "understanding/statistics" }, @@ -1325,7 +1457,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-" }, @@ -1335,7 +1468,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "professional/cancer-statistics/risk" }, @@ -1345,7 +1479,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. 
Available at: https://www.oecd-nea.org/jcms/pl_14998/" }, @@ -1355,7 +1490,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "the-full-costs-of-electricity-provision?details=true" }, @@ -1365,7 +1501,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-" }, @@ -1375,7 +1512,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "sheets/detail/climate-change-and-health" }, @@ -1385,7 +1523,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP." }, @@ -1395,7 +1534,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper." 
}, @@ -1405,7 +1545,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 10 + "page_number": 10, + "links": [] }, "text": "8" }, @@ -1415,7 +1556,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "World Nuclear Association Tower House 10 Southampton Street London WC2E 7HA United Kingdom" }, @@ -1425,7 +1567,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org" }, @@ -1435,7 +1578,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate." }, @@ -1445,7 +1589,8 @@ "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 12 + "page_number": 12, + "links": [] }, "text": "Recalibrating risk © 2021 World Nuclear Association. 
Registered in England and Wales, company number 01215741" } diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 088cfa170e..ba7ec592db 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -2,6 +2,9 @@ import re import sys import unicodedata +from typing import Tuple + +import numpy as np from unstructured.file_utils.encoding import ( format_encoding_str, @@ -412,3 +415,46 @@ def bytes_string_to_string(text: str, encoding: str = "utf-8"): text_bytes = bytes([ord(char) for char in text]) formatted_encoding = format_encoding_str(encoding) return text_bytes.decode(formatted_encoding) + + +def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]: + """Cleans extra whitespace characters that appear between words. + Calculate distance between characters of original text and cleaned text. + + Returns cleaned text along with array of indices it has moved from original. + + Example + ------- + ITEM 1. BUSINESS -> ITEM 1. 
BUSINESS + array([0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])) + """ + + cleaned_text = re.sub(r"[\xa0\n]", " ", text) + cleaned_text = re.sub(r"([ ]{2,})", " ", cleaned_text) + + cleaned_text = cleaned_text.strip() + + moved_indices = np.zeros(len(text)) + + distance, original_index, cleaned_index = 0, 0, 0 + while cleaned_index < len(cleaned_text): + if text[original_index] == cleaned_text[cleaned_index] or ( + bool(re.match("[\xa0\n]", text[original_index])) + and bool(re.match(" ", cleaned_text[cleaned_index])) + ): + moved_indices[cleaned_index] = distance + original_index += 1 + cleaned_index += 1 + continue + + distance += 1 + moved_indices[cleaned_index] = distance + original_index += 1 + + moved_indices[cleaned_index:] = distance + + return cleaned_text, moved_indices + + +def index_adjustment_after_clean_extra_whitespace(index, moved_indices) -> int: + return int(index - moved_indices[index]) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index de52e7447d..ed7317569a 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -129,6 +129,7 @@ class Link(TypedDict): text: Optional[str] url: str + start_index: int @dc.dataclass @@ -157,6 +158,7 @@ class ElementMetadata: url: Optional[str] = None link_urls: Optional[List[str]] = None link_texts: Optional[List[str]] = None + links: Optional[List[Link]] = None # E-mail specific metadata fields sent_from: Optional[List[str]] = None diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 75299fe898..3bfdb1e680 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -273,12 +273,13 @@ def doc_after_cleaners( def _get_links_from_tag(tag_elem: etree.Element) -> List[Link]: links: List[Link] = [] href = tag_elem.get("href") + # TODO(klaijan) - add html href start_index if href: - links.append({"text": tag_elem.text, "url": href}) + links.append({"text": 
tag_elem.text, "url": href, "start_index": -1}) for tag in tag_elem.iterdescendants(): href = tag.get("href") if href: - links.append({"text": tag.text, "url": href}) + links.append({"text": tag.text, "url": href, "start_index": -1}) return links diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 16fefa29f6..f1b03b63bd 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -1,24 +1,40 @@ +import contextlib import io import os import re import warnings from tempfile import SpooledTemporaryFile -from typing import IO, Any, BinaryIO, Iterator, List, Optional, Tuple, Union, cast +from typing import IO, Any, BinaryIO, Iterator, List, Optional, Sequence, Tuple, Union, cast +import numpy as np import pdf2image import PIL -from pdfminer.high_level import extract_pages -from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox +from pdfminer.converter import PDFPageAggregator, PDFResourceManager +from pdfminer.layout import ( + LAParams, + LTChar, + LTContainer, + LTImage, + LTItem, + LTTextBox, +) +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.pdfpage import PDFPage +from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename from unstructured.chunking.title import add_chunking_strategy -from unstructured.cleaners.core import clean_extra_whitespace +from unstructured.cleaners.core import ( + clean_extra_whitespace_with_index_run, + index_adjustment_after_clean_extra_whitespace, +) from unstructured.documents.coordinates import PixelSpace, PointSpace from unstructured.documents.elements import ( CoordinatesMetadata, Element, ElementMetadata, Image, + Link, ListItem, PageBreak, Text, @@ -71,6 +87,7 @@ def partition_pdf( metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, chunking_strategy: Optional[str] = None, + links: Sequence[Link] = [], **kwargs, ) -> List[Element]: """Parses a pdf document into a list of interpreted elements. 
@@ -454,15 +471,42 @@ def _process_pdfminer_pages( elements: List[Element] = [] sort_mode = kwargs.get("sort_mode", SORT_MODE_XY_CUT) - for i, page in enumerate(extract_pages(fp)): # type: ignore - width, height = page.width, page.height + rsrcmgr = PDFResourceManager() + laparams = LAParams() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + + for i, page in enumerate(PDFPage.get_pages(fp)): # type: ignore + interpreter.process_page(page) + page_layout = device.get_result() + + width, height = page_layout.width, page_layout.height - text_segments = [] page_elements = [] - for obj in page: - x1, y2, x2, y1 = obj.bbox - y1 = height - y1 - y2 = height - y2 + annotation_list = [] + + coordinate_system = PixelSpace( + width=width, + height=height, + ) + if page.annots: + annotation_list = get_uris(page.annots, height, coordinate_system, i + 1) + + for obj in page_layout: + x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) + bbox = (x1, y1, x2, y2) + + urls_metadata = [] + + if len(annotation_list) > 0 and isinstance(obj, LTTextBox): + annotations_within_element = check_annotations_within_element( + annotation_list, + bbox, + i + 1, + ) + _, words = get_word_bounding_box_from_element(obj, height) + for annot in annotations_within_element: + urls_metadata.append(map_bbox_and_index(words, annot)) if hasattr(obj, "get_text"): _text_snippets = [obj.get_text()] @@ -471,13 +515,8 @@ def _process_pdfminer_pages( _text_snippets = re.split(PARAGRAPH_PATTERN, _text) for _text in _text_snippets: - _text = clean_extra_whitespace(_text) + _text, moved_indices = clean_extra_whitespace_with_index_run(_text) if _text.strip(): - text_segments.append(_text) - coordinate_system = PixelSpace( - width=width, - height=height, - ) points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) element = element_from_text( _text, @@ -488,11 +527,27 @@ def _process_pdfminer_pages( points=points, system=coordinate_system, ) + + links: List[Link] = [] 
+ for url in urls_metadata: + with contextlib.suppress(IndexError): + links.append( + { + "text": url["text"], + "url": url["uri"], + "start_index": index_adjustment_after_clean_extra_whitespace( + url["start_index"], + moved_indices, + ), + }, + ) + element.metadata = ElementMetadata( filename=filename, page_number=i + 1, coordinates=coordinates_metadata, last_modified=metadata_last_modified, + links=links, ) page_elements.append(element) list_item = 0 @@ -533,7 +588,7 @@ def _process_pdfminer_pages( system=coordinate_system, ) page_element = list_page_element - updated_page_elements.pop() + updated_page_elements.pop(0) updated_page_elements.append(page_element) @@ -792,12 +847,215 @@ def check_coords_within_boundary( line_height = boundary_y_max - boundary_y_min x_within_boundary = ( - (coordinates.points[0][0] < boundary_x_min + (horizontal_threshold * line_width)) + (coordinates.points[0][0] > boundary_x_min - (horizontal_threshold * line_width)) and (coordinates.points[2][0] < boundary_x_max + (horizontal_threshold * line_width)) and (coordinates.points[0][0] >= boundary_x_min) ) y_within_boundary = ( coordinates.points[0][1] < boundary_y_max + (vertical_threshold * line_height) - ) and (coordinates.points[0][1] > boundary_y_min) + ) and (coordinates.points[0][1] > boundary_y_min - (vertical_threshold * line_height)) return x_within_boundary and y_within_boundary + + +def get_uris( + annots: Union[PDFObjRef, List[PDFObjRef]], + height: float, + coordinate_system: Union[PixelSpace, PointSpace], + page_number: int, +) -> List[dict]: + if isinstance(annots, List): + return get_uris_from_annots(annots, height, coordinate_system, page_number) + return get_uris_from_annots(annots.resolve(), height, coordinate_system, page_number) + + +def get_uris_from_annots( + annots: List[PDFObjRef], + height: Union[int, float], + coordinate_system: Union[PixelSpace, PointSpace], + page_number: int, +) -> List[dict]: + annotation_list = [] + for annotation in annots: + 
annotation_dict = try_resolve(annotation) + if str(annotation_dict["Subtype"]) != "/'Link'" or "A" not in annotation_dict: + continue + x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height) + uri_dict = try_resolve(annotation_dict["A"]) + uri_type = str(uri_dict["S"]) + + try: + if uri_type == "/'URI'": + uri = try_resolve(try_resolve(uri_dict["URI"])).decode("utf-8") + if uri_type == "/'GoTo'": + uri = try_resolve(try_resolve(uri_dict["D"])).decode("utf-8") + except (KeyError, AttributeError, TypeError, UnicodeDecodeError): + uri = None + + points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) + + coordinates_metadata = CoordinatesMetadata( + points=points, + system=coordinate_system, + ) + + annotation_list.append( + { + "coordinates": coordinates_metadata, + "bbox": (x1, y1, x2, y2), + "type": uri_type, + "uri": uri, + "page_number": page_number, + }, + ) + return annotation_list + + +def try_resolve(annot: PDFObjRef): + try: + return annot.resolve() + except Exception: + return annot + + +def rect_to_bbox( + rect: Tuple[float, float, float, float], + height: float, +) -> Tuple[float, float, float, float]: + x1, y2, x2, y1 = rect + y1 = height - y1 + y2 = height - y2 + return (x1, y1, x2, y2) + + +def calculate_intersection_area( + bbox1: Tuple[float, float, float, float], + bbox2: Tuple[float, float, float, float], +) -> float: + x1_1, y1_1, x2_1, y2_1 = bbox1 + x1_2, y1_2, x2_2, y2_2 = bbox2 + + x_intersection = max(x1_1, x1_2) + y_intersection = max(y1_1, y1_2) + x2_intersection = min(x2_1, x2_2) + y2_intersection = min(y2_1, y2_2) + + if x_intersection < x2_intersection and y_intersection < y2_intersection: + intersection_area = calculate_bbox_area( + (x_intersection, y_intersection, x2_intersection, y2_intersection), + ) + return intersection_area + else: + return 0.0 + + +def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float: + x1, y1, x2, y2 = bbox + area = (x2 - x1) * (y2 - y1) + return area + + +def 
check_annotations_within_element( + annotation_list: List[dict], + element_bbox: Tuple[float, float, float, float], + page_number: int, + threshold: float = 0.9, +) -> List[dict]: + annotations_within_element = [] + for annotation in annotation_list: + if annotation["page_number"] == page_number and ( + calculate_intersection_area(element_bbox, annotation["bbox"]) + / calculate_bbox_area(annotation["bbox"]) + > threshold + ): + annotations_within_element.append(annotation) + return annotations_within_element + + +def get_word_bounding_box_from_element( + obj: LTTextBox, + height: float, +) -> Tuple[List[LTChar], List[dict]]: + characters = [] + words = [] + text_len = 0 + + for text_line in obj: + word = "" + x1, y1, x2, y2 = None, None, None, None + start_index = 0 + for index, character in enumerate(text_line): + if isinstance(character, LTChar): + characters.append(character) + char = character.get_text() + + if not char.strip(): + words.append( + {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index}, + ) + word = "" + continue + + # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9 + # will need to switch to some pattern matching once we support more languages + if index == 0: + isalnum = char.isalnum() + + if char.isalnum() != isalnum: + isalnum = char.isalnum() + words.append( + {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index}, + ) + word = "" + + if len(word) == 0: + start_index = text_len + index + x1 = character.x0 + y2 = height - character.y0 + x2 = character.x1 + y1 = height - character.y1 + else: + x2 = character.x1 + y2 = height - character.y0 + + word += char + text_len += len(text_line) + return characters, words + + +def map_bbox_and_index(words: List[dict], annot: dict): + if len(words) == 0: + annot["text"] = "" + annot["start_index"] = -1 + return annot + + distance_from_bbox_start = np.sqrt( + (annot["bbox"][0] - np.array([word["bbox"][0] for word in words])) ** 2 + + (annot["bbox"][1] - 
np.array([word["bbox"][1] for word in words])) ** 2, + ) + distance_from_bbox_end = np.sqrt( + (annot["bbox"][2] - np.array([word["bbox"][2] for word in words])) ** 2 + + (annot["bbox"][3] - np.array([word["bbox"][3] for word in words])) ** 2, + ) + closest_start = try_argmin(distance_from_bbox_start) + closest_end = try_argmin(distance_from_bbox_end) + + # NOTE(klaijan) - get the word from closest start only if the end index comes after start index + text = "" + if closest_end >= closest_start: + for _ in range(closest_start, closest_end + 1): + text += " " + text += words[_]["text"] + else: + text = words[closest_start]["text"] + + annot["text"] = text.strip() + annot["start_index"] = words[closest_start]["start_index"] + return annot + + +def try_argmin(array: np.ndarray) -> int: + try: + return int(np.argmin(array)) + except IndexError: + return -1 From f34c277bcae7a666d34ea1d6a3cf70b668da0afe Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 27 Sep 2023 14:40:56 -0400 Subject: [PATCH 03/31] fix: add backwards compatibility to ElementMetadata (#1526) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes https://github.com/Unstructured-IO/unstructured-api/issues/237 The problem: The `ElementMetadata` class was not able to ignore fields that it didn't know about. This surfaced in `partition_via_api`, when the hosted api schema is newer than the local `unstructured` version. In `ElementMetadata.from_json()` we get errors such as `TypeError: __init__() got an unexpected keyword argument 'parent_id'`. The fix: The `from_json` methods for these dataclasses should drop any unexpected fields before calling `__init__`. 
To verify: This shouldn't throw an error ``` from unstructured.staging.base import elements_from_json import json test_api_result = json.dumps([ { "type": "Title", "element_id": "2f7cc75f6467bba468022c4c2875335e", "metadata": { "filename": "layout-parser-paper.pdf", "filetype": "application/pdf", "page_number": 1, "new_field": "foo", }, "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" } ]) elements = elements_from_json(text=test_api_result) print(elements) ``` --- CHANGELOG.md | 5 ++-- test_unstructured/documents/test_elements.py | 25 ++++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/documents/elements.py | 13 ++++++++-- 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4569f5f12e..afd503a5f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev10 +## 0.10.17-dev12 ### Enhancements @@ -26,7 +26,8 @@ should be generated, however the Formula class inherits from Element instead of Text. After this change the element is correctly created with the correct class allowing the document to be loaded. Fix: Change parent class for Formula to Text. Importance: Crucial to be able to load documents that contain formulas. * **Fixes Sphinx errors.** Fixes errors when running Sphinx `make html` and installs library to suppress warnings. - +* **Fixes a metadata backwards compatibility error** Problem: When calling `partition_via_api`, the hosted api may return an element schema that's newer than the current `unstructured`. In this case, metadata fields were added which did not exist in the local `ElementMetadata` dataclass, and `__init__()` threw an error. Fix: remove nonexistent fields before instantiating in `ElementMetadata.from_json()`. Importance: Crucial to avoid breaking changes when adding fields. 
+ ## 0.10.16 diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 270e16fb68..c85f6c8495 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -14,6 +14,7 @@ UUID, CoordinatesMetadata, Element, + ElementMetadata, NoID, Text, ) @@ -186,3 +187,27 @@ def test_element_to_dict(): "element_id": "awt32t1", } assert element.to_dict() == expected + + +def test_metadata_from_dict_extra_fields(): + """ + Assert that the metadata classes ignore nonexistent fields. + This can be an issue when elements_from_json gets a schema + from the future. + """ + element_metadata = { + "new_field": "hello", + "data_source": { + "new_field": "world", + }, + "coordinates": { + "new_field": "foo", + }, + } + + metadata = ElementMetadata.from_dict(element_metadata) + metadata_dict = metadata.to_dict() + + assert "new_field" not in metadata_dict + assert "new_field" not in metadata_dict["coordinates"] + assert "new_field" not in metadata_dict["data_source"] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2eb7cd1430..c31fc303ad 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev10" # pragma: no cover +__version__ = "0.10.17-dev12" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index ed7317569a..f051e1b4f6 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -50,7 +50,11 @@ def to_dict(self): @classmethod def from_dict(cls, input_dict): - return cls(**input_dict) + # Only use existing fields when constructing + supported_fields = [f.name for f in dc.fields(cls)] + args = {k: v for k, v in input_dict.items() if k in supported_fields} + + return cls(**args) @dc.dataclass @@ -214,7 +218,12 @@ def from_dict(cls, input_dict: Dict[str, Any]) -> Self: constructor_args["data_source"] = 
DataSourceMetadata.from_dict( constructor_args["data_source"], ) - return cls(**constructor_args) + + # Only use existing fields when constructing + supported_fields = [f.name for f in dc.fields(cls)] + args = {k: v for k, v in constructor_args.items() if k in supported_fields} + + return cls(**args) def merge(self, other: ElementMetadata): for k in self.__dict__: From b2839625672c21ee03618369c2b5c9d55f8ecb82 Mon Sep 17 00:00:00 2001 From: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com> Date: Wed, 27 Sep 2023 23:38:15 +0300 Subject: [PATCH 04/31] docs: update ingest readme (#1456) Closes https://github.com/Unstructured-IO/unstructured/issues/1070 This PR aims to update the ingest readme file based on the recent changes that the ingest module had. --- CHANGELOG.md | 8 ++++---- unstructured/__version__.py | 2 +- unstructured/ingest/README.md | 12 ++++++++++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afd503a5f5..330070d7f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev12 +## 0.10.17-dev13 ### Enhancements @@ -9,7 +9,7 @@ * **Add document level language detection functionality.** Introduces the "auto" default for the languages param, which then detects the languages present in the document using the `langdetect` package. Adds the document languages as ISO 639-3 codes to the element metadata. Implemented only for the partition_text function to start. * **PPTX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except that shapes enclosed in a group-shape are now included, as many levels deep as required (a group-shape can itself contain a group-shape). * **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to Azure Cognitive Search index. 
-* **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number). +* **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number). ### Features @@ -23,11 +23,11 @@ * **Fixes SharePoint connector failures if any document has an unsupported filetype** Problem: Currently the entire connector ingest run fails if a single IngestDoc has an unsupported filetype. This is because a ValueError is raised in the IngestDoc's `__post_init__`. Fix: Adds a try/catch when the IngestConnector runs get_ingest_docs such that the error is logged but all processable documents->IngestDocs are still instantiated and returned. Importance: Allows users to ingest SharePoint content even when some files with unsupported filetypes exist there. * **Fixes Sharepoint connector server_path issue** Problem: Server path for the Sharepoint Ingest Doc was incorrectly formatted, causing issues while fetching pages from the remote source. Fix: changes formatting of remote file path before instantiating SharepointIngestDocs and appends a '/' while fetching pages from the remote source. Importance: Allows users to fetch pages from Sharepoint Sites. * **Fixes badly initialized Formula** Problem: YoloX contain new types of elements, when loading a document that contain formulas a new element of that class -should be generated, however the Formula class inherits from Element instead of Text. After this change the element is correctly created with the correct class +should be generated, however the Formula class inherits from Element instead of Text. 
After this change the element is correctly created with the correct class allowing the document to be loaded. Fix: Change parent class for Formula to Text. Importance: Crucial to be able to load documents that contain formulas. * **Fixes Sphinx errors.** Fixes errors when running Sphinx `make html` and installs library to suppress warnings. * **Fixes a metadata backwards compatibility error** Problem: When calling `partition_via_api`, the hosted api may return an element schema that's newer than the current `unstructured`. In this case, metadata fields were added which did not exist in the local `ElementMetadata` dataclass, and `__init__()` threw an error. Fix: remove nonexistent fields before instantiating in `ElementMetadata.from_json()`. Importance: Crucial to avoid breaking changes when adding fields. - + ## 0.10.16 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c31fc303ad..7490d266e7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev12" # pragma: no cover +__version__ = "0.10.17-dev13" # pragma: no cover diff --git a/unstructured/ingest/README.md b/unstructured/ingest/README.md index f024769219..4cb3b1832d 100644 --- a/unstructured/ingest/README.md +++ b/unstructured/ingest/README.md @@ -60,17 +60,29 @@ In checklist form, the above steps are summarized as: - [ ] Create a new module under [unstructured/ingest/connector/](unstructured/ingest/connector/) implementing the 3 abstract base classes, similar to [unstructured/ingest/connector/github.py](unstructured/ingest/connector/github.py). - [ ] The subclass of `BaseIngestDoc` overrides `process_file()` if extra processing logic is needed other than what is provided by [auto.partition()](unstructured/partition/auto.py). + - [ ] If the IngestDoc relies on a connection or session that could be reused, the subclass of `BaseConnectorConfig` implements a session handle to manage connections. 
The ConnectorConfig subclass should also inherit from `ConfigSessionHandleMixin` and the IngestDoc subclass should also inherit from `IngestDocSessionHandleMixin`. Check [here](https://github.com/Unstructured-IO/unstructured/pull/1058/files#diff-dae96d30f58cffe1b348c036d006b48bdc7e2e47fbd7c8ec1c45d63face1542d) for a detailed example. + - [ ] The subclass of `BaseIngestDoc` implements relevant data source properties to include metadata. Check [this PR](https://github.com/Unstructured-IO/unstructured/pull/1283) for detailed examples. + - [ ] The field `record_locator` property should include all of the information required to be able to reach to the document in the source platform. + - [ ] Add the relevant decorators from `unstructured.ingest.error` on top of relevant methods to handle errors such as a source connection error, destination connection error, or a partition error. For examples, check [here](https://github.com/Unstructured-IO/unstructured/commit/92692ad8d7d5001601dd88fef869a29660f492cb). - [ ] Update [unstructured/ingest/cli](unstructured/ingest/cli) with support for the new connector. - [ ] Create a folder under [examples/ingest](examples/ingest) that includes at least one well documented script. - [ ] Add a script test_unstructured_ingest/test-ingest-\.sh. It's json output files should have a total of no more than 100K. - [ ] Git add the expected outputs under test_unstructured_ingest/expected-structured-output/\ so the above test passes in CI. - [ ] Add a line to [test_unstructured_ingest/test-ingest.sh](test_unstructured_ingest/test-ingest.sh) invoking the new test script. +- [ ] Make sure the tests for the connector are running and not skipped by reviewing the logs in CI. - [ ] If additional python dependencies are needed for the new connector: - [ ] Add them as an extra to [setup.py](unstructured/setup.py). - [ ] Update the Makefile, adding a target for `install-ingest-` and adding another `pip-compile` line to the `pip-compile` make target. 
See [this commit](https://github.com/Unstructured-IO/unstructured/commit/ab542ca3c6274f96b431142262d47d727f309e37) for a reference. - [ ] The added dependencies should be imported at runtime when the new connector is invoked, rather than as top-level imports. - [ ] Add the decorator `unstructured.utils.requires_dependencies` on top of each class instance or function that uses those connector-specific dependencies e.g. for `GitHubConnector` should look like `@requires_dependencies(dependencies=["github"], extras="github")` - [ ] Run `make tidy` and `make check` to ensure linting checks pass. +- [ ] Update ingest documentation [here](https://github.com/Unstructured-IO/unstructured/tree/eb8ce8913729826b62fd4e1224f70d67c5289b9d/docs/source) +- [ ] For team members that are developing in the original repository: + - [ ] If there are secret variables created for the connector tests, make sure to: + - [ ] add the secrets into Github (contact someone with access) + - [ ] include the secret variables in [`ci.yml`](https://github.com/Unstructured-IO/unstructured/blob/eb8ce8913729826b62fd4e1224f70d67c5289b9d/.github/workflows/ci.yml) and [`ingest-test-fixtures-update-pr.yml`](https://github.com/Unstructured-IO/unstructured/blob/eb8ce8913729826b62fd4e1224f70d67c5289b9d/.github/workflows/ingest-test-fixtures-update-pr.yml) + - [ ] add a make install line in the workflow configurations to be able to provide the workflow machine with the required dependencies on the connector while testing + - [ ] Whenever necessary, use the [ingest update test fixtures](https://github.com/Unstructured-IO/unstructured/actions/workflows/ingest-test-fixtures-update-pr.yml) workflow to update the test fixtures. 
- [ ] Honors the conventions of `BaseConnectorConfig` defined in [unstructured/ingest/interfaces.py](unstructured/ingest/interfaces.py) which is passed through [the CLI](unstructured/ingest/main.py): - [ ] If running with an `.output_dir` where structured outputs already exists for a given file, the file content is not re-downloaded from the data source nor is it reprocessed. This is made possible by implementing the call to `MyIngestDoc.has_output()` which is invoked in [MainProcess._filter_docs_with_outputs](ingest-prep-for-many/unstructured/ingest/main.py). - [ ] Unless `.reprocess` is `True`, then documents are always reprocessed. From 9836235ead23bb30e95be3e3362839671fa343ce Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Wed, 27 Sep 2023 17:05:55 -0400 Subject: [PATCH 05/31] Chunking support for SharePoint Connector (#1548) ### Description Optionally adds in chunking to the CLI which adds a flag to trigger chunking and exposes the parameters used by the `chunk_by_title` method. Runs chunking before the embedding step. 
Opened to replace original PR https://github.com/Unstructured-IO/unstructured/pull/1531 --- CHANGELOG.md | 3 +- .../files/azure_cognitive_index_schema.json | 4 ++ .../test-ingest-sharepoint-embed-cog-index.sh | 2 + unstructured/__version__.py | 2 +- unstructured/embed/openai.py | 4 +- .../ingest/cli/cmds/azure_cognitive_search.py | 3 + unstructured/ingest/cli/cmds/sharepoint.py | 4 ++ unstructured/ingest/cli/interfaces.py | 68 ++++++++++++++++++- unstructured/ingest/connector/sharepoint.py | 17 +++++ .../ingest/doc_processor/generalized.py | 5 +- unstructured/ingest/interfaces.py | 26 ++++++- unstructured/ingest/runner/base_runner.py | 8 ++- unstructured/ingest/runner/sharepoint.py | 1 + 13 files changed, 137 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 330070d7f7..65a462ca81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev13 +## 0.10.17-dev14 ### Enhancements @@ -10,6 +10,7 @@ * **PPTX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except that shapes enclosed in a group-shape are now included, as many levels deep as required (a group-shape can itself contain a group-shape). * **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to Azure Cognitive Search index. * **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number). +* **Chunking support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally chunk the elements pulled out during partition via the chunking unstructured brick. This can be used as a stage before creating embeddings. 
### Features diff --git a/test_unstructured_ingest/files/azure_cognitive_index_schema.json b/test_unstructured_ingest/files/azure_cognitive_index_schema.json index 2abdc7b1d4..d77fd8da32 100644 --- a/test_unstructured_ingest/files/azure_cognitive_index_schema.json +++ b/test_unstructured_ingest/files/azure_cognitive_index_schema.json @@ -109,6 +109,10 @@ } ] }, + { + "name": "languages", + "type": "Collection(Edm.String)" + }, { "name": "page_number", "type": "Edm.String" diff --git a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh index af9d8f33ae..5ea8b9b416 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh @@ -77,6 +77,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --path "Shared Documents" \ --recursive \ --embedding-api-key "$OPENAI_API_KEY" \ + --chunk-elements \ + --chunk-multipage-sections \ azure-cognitive-search \ --key "$AZURE_SEARCH_API_KEY" \ --endpoint "$AZURE_SEARCH_ENDPOINT" \ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 7490d266e7..f87f0c2764 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev13" # pragma: no cover +__version__ = "0.10.17-dev14" # pragma: no cover diff --git a/unstructured/embed/openai.py b/unstructured/embed/openai.py index dd5a360970..b79763f8ec 100644 --- a/unstructured/embed/openai.py +++ b/unstructured/embed/openai.py @@ -1,5 +1,5 @@ import types -from typing import List, Optional +from typing import List import numpy as np @@ -12,7 +12,7 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder): - def __init__(self, api_key: str, model_name: Optional[str] = "text-embedding-ada-002"): + def __init__(self, api_key: str, model_name: str = "text-embedding-ada-002"): self.api_key = api_key self.model_name = model_name self.initialize() diff --git 
a/unstructured/ingest/cli/cmds/azure_cognitive_search.py b/unstructured/ingest/cli/cmds/azure_cognitive_search.py index 22eded4373..241a66b2ba 100644 --- a/unstructured/ingest/cli/cmds/azure_cognitive_search.py +++ b/unstructured/ingest/cli/cmds/azure_cognitive_search.py @@ -9,6 +9,7 @@ log_options, ) from unstructured.ingest.cli.interfaces import ( + CliChunkingConfig, CliEmbeddingsConfig, CliMixin, CliPartitionConfig, @@ -74,6 +75,7 @@ def azure_cognitive_search_dest(ctx: click.Context, **options): read_config = CliReadConfig.from_dict(parent_options) partition_config = CliPartitionConfig.from_dict(parent_options) embedding_config = CliEmbeddingsConfig.from_dict(parent_options) + chunking_config = CliChunkingConfig.from_dict(parent_options) # Run for schema validation AzureCognitiveSearchCliWriteConfig.from_dict(options) runner = runner_map[source_cmd] @@ -93,6 +95,7 @@ def azure_cognitive_search_dest(ctx: click.Context, **options): writer_type="azure_cognitive_search", writer_kwargs=options, embedding_config=embedding_config, + chunking_config=chunking_config, ) runner_instance.run( **parent_options, diff --git a/unstructured/ingest/cli/cmds/sharepoint.py b/unstructured/ingest/cli/cmds/sharepoint.py index 2457f474c8..5027fe3a80 100644 --- a/unstructured/ingest/cli/cmds/sharepoint.py +++ b/unstructured/ingest/cli/cmds/sharepoint.py @@ -9,6 +9,7 @@ log_options, ) from unstructured.ingest.cli.interfaces import ( + CliChunkingConfig, CliEmbeddingsConfig, CliMixin, CliPartitionConfig, @@ -86,6 +87,7 @@ def sharepoint_source(ctx: click.Context, **options): read_config = CliReadConfig.from_dict(options) partition_config = CliPartitionConfig.from_dict(options) embedding_config = CliEmbeddingsConfig.from_dict(options) + chunking_config = CliChunkingConfig.from_dict(options) # Run for schema validation SharepointCliConfig.from_dict(options) sharepoint_runner = SharePoint( @@ -93,6 +95,7 @@ def sharepoint_source(ctx: click.Context, **options): 
partition_config=partition_config, verbose=verbose, embedding_config=embedding_config, + chunking_config=chunking_config, ) sharepoint_runner.run(**options) except Exception as e: @@ -109,5 +112,6 @@ def get_source_cmd() -> click.Group: CliReadConfig.add_cli_options(cmd) CliPartitionConfig.add_cli_options(cmd) CliEmbeddingsConfig.add_cli_options(cmd) + CliChunkingConfig.add_cli_options(cmd) cmd.params.append(click.Option(["-v", "--verbose"], is_flag=True, default=False)) return cmd diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index 2190744b5b..7ec4660a6f 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -4,7 +4,13 @@ from dataclasses_json.core import Json, _decode_dataclass from unstructured.ingest.cli.cmds.utils import DelimitedString -from unstructured.ingest.interfaces import BaseConfig, EmbeddingConfig, PartitionConfig, ReadConfig +from unstructured.ingest.interfaces import ( + BaseConfig, + ChunkingConfig, + EmbeddingConfig, + PartitionConfig, + ReadConfig, +) class CliMixin: @@ -212,7 +218,7 @@ def from_dict( ): """ Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. 
- This allows CLI arguments to be prepended with embedding_ during CLI invocation but + This allows CLI arguments to be prepended with chunk_ during CLI invocation but doesn't require that as part of the field names in this class """ if isinstance(kvs, dict): @@ -225,3 +231,61 @@ def from_dict( return None return _decode_dataclass(cls, new_kvs, infer_missing) return _decode_dataclass(cls, kvs, infer_missing) + + +class CliChunkingConfig(ChunkingConfig, CliMixin): + @staticmethod + def add_cli_options(cmd: click.Command) -> None: + options = [ + click.Option( + ["--chunk-elements"], + is_flag=True, + default=False, + ), + click.Option( + ["--chunk-multipage-sections"], + is_flag=True, + default=False, + ), + click.Option( + ["--chunk-combine-under-n-chars"], + type=int, + default=500, + show_default=True, + ), + click.Option( + ["--chunk-new-after-n-chars"], + type=int, + default=1500, + show_default=True, + ), + ] + cmd.params.extend(options) + + @classmethod + def from_dict( + cls, + kvs: Json, + *, + infer_missing=False, + ): + """ + Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. 
+ This allows CLI arguments to be prepended with chunking_ during CLI invocation but + doesn't require that as part of the field names in this class + """ + if isinstance(kvs, dict): + new_kvs = {} + if "chunk_elements" in kvs: + new_kvs["chunk_elements"] = kvs.pop("chunk_elements") + new_kvs.update( + { + k[len("chunking_") :]: v # noqa: E203 + for k, v in kvs.items() + if k.startswith("chunking_") + }, + ) + if len(new_kvs.keys()) == 0: + return None + return _decode_dataclass(cls, new_kvs, infer_missing) + return _decode_dataclass(cls, kvs, infer_missing) diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py index 0dacea83d4..9fdcf87c9e 100644 --- a/unstructured/ingest/connector/sharepoint.py +++ b/unstructured/ingest/connector/sharepoint.py @@ -5,6 +5,7 @@ from pathlib import Path from urllib.parse import urlparse +from unstructured.documents.elements import Element from unstructured.embed.interfaces import BaseEmbeddingEncoder from unstructured.file_utils.filetype import EXT_TO_FILETYPE from unstructured.ingest.error import SourceConnectionError @@ -12,6 +13,7 @@ BaseConnectorConfig, BaseIngestDoc, BaseSourceConnector, + ChunkingConfig, EmbeddingConfig, IngestDocCleanupMixin, SourceConnectorCleanupMixin, @@ -69,6 +71,19 @@ class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): file_path: str registry_name: str = "sharepoint" embedding_config: t.Optional[EmbeddingConfig] = None + chunking_config: t.Optional[ChunkingConfig] = None + + def run_chunking(self, elements: t.List[Element]) -> t.List[Element]: + if self.chunking_config: + logger.info( + "Running chunking to split up elements with config: " + f"{self.chunking_config.to_dict()}", + ) + chunked_elements = self.chunking_config.chunk(elements=elements) + logger.info(f"chunked {len(elements)} elements into {len(chunked_elements)}") + return chunked_elements + else: + return elements @property def embedder(self) -> t.Optional[BaseEmbeddingEncoder]: @@ 
-244,6 +259,7 @@ def get_file(self): class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): connector_config: SimpleSharepointConfig embedding_config: t.Optional[EmbeddingConfig] = None + chunking_config: t.Optional[ChunkingConfig] = None @requires_dependencies(["office365"], extras="sharepoint") def _list_files(self, folder, recursive) -> t.List["File"]: @@ -283,6 +299,7 @@ def _prepare_ingest_doc(self, obj: t.Union["File", "SitePage"], base_url, is_pag is_page=is_page, file_path=file_path, embedding_config=self.embedding_config, + chunking_config=self.chunking_config, ) @requires_dependencies(["office365"], extras="sharepoint") diff --git a/unstructured/ingest/doc_processor/generalized.py b/unstructured/ingest/doc_processor/generalized.py index 849b53853c..f44b2fa8f4 100644 --- a/unstructured/ingest/doc_processor/generalized.py +++ b/unstructured/ingest/doc_processor/generalized.py @@ -62,8 +62,9 @@ def process_document(ingest_doc_json: str, **partition_kwargs) -> Optional[List[ doc.write_result() except Exception: # TODO(crag) save the exception instead of print? 
- logger.error(f"Failed to process {doc}", exc_info=True) + logger.error(f"Failed to process {doc}") + raise Exception finally: if doc: doc.cleanup_file() - return isd_elems_no_filename + return isd_elems_no_filename diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index c708938bfd..c76fdfb783 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -13,7 +13,8 @@ import requests from dataclasses_json import DataClassJsonMixin -from unstructured.documents.elements import DataSourceMetadata +from unstructured.chunking.title import chunk_by_title +from unstructured.documents.elements import DataSourceMetadata, Element from unstructured.embed.interfaces import BaseEmbeddingEncoder from unstructured.embed.openai import OpenAIEmbeddingEncoder from unstructured.ingest.error import PartitionError, SourceConnectionError @@ -78,6 +79,25 @@ def get_embedder(self) -> BaseEmbeddingEncoder: return OpenAIEmbeddingEncoder(**kwargs) +@dataclass +class ChunkingConfig(BaseConfig): + chunk_elements: bool = False + multipage_sections: bool = True + combine_under_n_chars: int = 500 + new_after_n_chars: int = 1500 + + def chunk(self, elements: t.List[Element]) -> t.List[Element]: + if self.chunk_elements: + return chunk_by_title( + elements=elements, + multipage_sections=self.multipage_sections, + combine_under_n_chars=self.combine_under_n_chars, + new_after_n_chars=self.new_after_n_chars, + ) + else: + return elements + + @dataclass class WriteConfig(BaseConfig): pass @@ -115,6 +135,9 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._date_processed = None + def run_chunking(self, elements: t.List[Element]) -> t.List[Element]: + return elements + @property def embedder(self) -> t.Optional[BaseEmbeddingEncoder]: return None @@ -263,6 +286,7 @@ def partition_file(self, **partition_kwargs) -> t.List[t.Dict[str, t.Any]]: if response.status_code != 200: raise RuntimeError(f"Caught 
{response.status_code} from API: {response.text}") elements = elements_from_json(text=json.dumps(response.json())) + elements = self.run_chunking(elements=elements) if self.embedder: logger.info("Running embedder to add vector content to elements") elements = self.embedder.embed_documents(elements) diff --git a/unstructured/ingest/runner/base_runner.py b/unstructured/ingest/runner/base_runner.py index 772e282f0d..c12bdce1e0 100644 --- a/unstructured/ingest/runner/base_runner.py +++ b/unstructured/ingest/runner/base_runner.py @@ -2,7 +2,12 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from unstructured.ingest.interfaces import EmbeddingConfig, PartitionConfig, ReadConfig +from unstructured.ingest.interfaces import ( + ChunkingConfig, + EmbeddingConfig, + PartitionConfig, + ReadConfig, +) @dataclass @@ -13,6 +18,7 @@ class Runner(ABC): writer_type: t.Optional[str] = None writer_kwargs: t.Optional[dict] = None embedding_config: t.Optional[EmbeddingConfig] = None + chunking_config: t.Optional[ChunkingConfig] = None @abstractmethod def run(self, *args, **kwargs): diff --git a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py index a20e64bdf8..d5ab2ec940 100644 --- a/unstructured/ingest/runner/sharepoint.py +++ b/unstructured/ingest/runner/sharepoint.py @@ -51,6 +51,7 @@ def run( read_config=self.read_config, partition_config=self.partition_config, embedding_config=self.embedding_config, + chunking_config=self.chunking_config, ) dest_doc_connector = None From fd79c5262ce6ff50748f2cb3f6e3c3a5103d1df7 Mon Sep 17 00:00:00 2001 From: Trevor Bossert <37596773+tabossert@users.noreply.github.com> Date: Wed, 27 Sep 2023 15:30:32 -0700 Subject: [PATCH 06/31] Bump Dockerfile to use latest base image (#1553) New base image includes security fixes. This is an ongoing process to remediate security issues as they are identified. 
--- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e4accbddec..0bc9faebbc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:experimental -FROM quay.io/unstructured-io/base-images:rocky9.2-4@sha256:b1063ffbf08c3037ee211620f011dd05bd2da9287c6e6a3473b15c1597724e4b as base +FROM quay.io/unstructured-io/base-images:rocky9.2-5@sha256:1721c3b0711e4e90587e3b4917f1b616e4603ddf5b4986bfaa68d02d82a13aba as base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html From 2e01c49d901bf13263f48858512b9391ffafa64c Mon Sep 17 00:00:00 2001 From: rvztz Date: Wed, 27 Sep 2023 18:46:01 -0600 Subject: [PATCH 07/31] feat: adds data source properties to `delta table` connector. (#1464) --- CHANGELOG.md | 4 +- ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json | 1 + ...d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json | 1 + unstructured/__version__.py | 2 +- unstructured/ingest/connector/delta_table.py | 45 ++++++++++--------- 13 files changed, 38 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65a462ca81..98311ce131 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ -## 0.10.17-dev14 +## 0.10.17-dev15 ### Enhancements -* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document 
source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Add functionality to save embedded images in PDF's separately as images** This allows users to save embedded images in PDF's separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits." * **Azure Cognite Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index. * **Improves salesforce partitioning** Partitions Salesforce data as xlm instead of text for improved detail and flexibility. Partitions htmlbody instead of textbody for Salesforce emails. Importance: Allows all Salesforce fields to be ingested and gives Salesforce emails more detailed partitioning. 
diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json index 1b2c921b80..3734bfb66a 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.parquet", + "version": 264934223616864047145159629306912568989, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.604000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json index 0f3e9b3812..202391c129 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.parquet", + "version": 139732878514171884135017505553329458078, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.629000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json index fc8e77b1ea..e4004e0515 100644 --- 
a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.parquet", + "version": 94569544647555135566266174719335103474, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.634000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json index c30c964986..644a82922c 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.parquet", + "version": 153924277850028657610430472976884166368, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.609000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json index a89ae23fb0..3c3059e5c5 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.parquet", + "version": 106461216032689499284003671440554259965, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.599000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json index 58aaf20908..37d6988151 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.parquet", + "version": 164150003651878262139646734756859067992, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.614000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json index 313dd687ca..34bb7d179f 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.parquet", + "version": 117019847084687446803154205344125897829, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.619000" }, diff --git 
a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json index f63c3badd0..9046006c97 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.parquet", + "version": 93578343538662480706683120160579695806, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.624000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json index 05fd088f3b..59f11ba85a 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.parquet", + "version": 329407810704817028643559273505069222621, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.644000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json index 3604829e13..82acb28ae2 100644 --- 
a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.parquet", + "version": 127086160869624650753647884727730407942, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.639000" }, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f87f0c2764..01785404e1 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev14" # pragma: no cover +__version__ = "0.10.17-dev15" # pragma: no cover diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py index 92594cad26..f471b2b06a 100644 --- a/unstructured/ingest/connector/delta_table.py +++ b/unstructured/ingest/connector/delta_table.py @@ -15,6 +15,7 @@ BaseSourceConnector, IngestDocCleanupMixin, SourceConnectorCleanupMixin, + SourceMetadata, WriteConfig, ) from unstructured.ingest.logger import logger @@ -50,26 +51,10 @@ def uri_filename(self) -> str: basename = os.path.basename(self.uri) return os.path.splitext(basename)[0] - @property - def source_url(self) -> t.Optional[str]: - """The url of the source document.""" - return self.uri - - @property - def date_created(self) -> t.Optional[str]: - """This is the creation time of the table itself, not the file or specific record""" - # TODO get creation time of file/record - return self.created_at - @property def filename(self): return (Path(self.read_config.download_dir) / f"{self.uri_filename()}.csv").resolve() - @property - def date_modified(self) -> t.Optional[str]: - """The date the document was last modified on the source system.""" - return self.modified_date - @property def 
_output_filename(self): """Create filename document id combined with a hash of the query to uniquely identify @@ -80,11 +65,8 @@ def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) self._output_filename.parent.mkdir(parents=True, exist_ok=True) - @SourceConnectionError.wrap - @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["fsspec"], extras="delta-table") - def get_file(self): - import pyarrow.parquet as pq + def _get_fs_from_uri(self): from fsspec.core import url_to_fs try: @@ -94,6 +76,29 @@ def get_file(self): f"uri {self.uri} may be associated with a filesystem that " f"requires additional dependencies: {error}", ) + return fs + + def update_source_metadata(self, **kwargs): + fs = kwargs.get("fs", self._get_fs_from_uri()) + version = ( + fs.checksum(self.uri) if fs.protocol != "gs" else fs.info(self.uri).get("etag", "") + ) + file_exists = fs.exists(self.uri) + self.source_metadata = SourceMetadata( + date_created=self.created_at, + date_modified=self.modified_date, + version=version, + source_url=self.uri, + exists=file_exists, + ) + + @SourceConnectionError.wrap + @BaseIngestDoc.skip_if_file_exists + def get_file(self): + import pyarrow.parquet as pq + + fs = self._get_fs_from_uri() + self.update_source_metadata(fs=fs) logger.info(f"using a {fs} filesystem to collect table data") self._create_full_tmp_dir_path() logger.debug(f"Fetching {self} - PID: {os.getpid()}") From 62b055779269f9ba1e0616235a3b2d942f33ff3b Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 27 Sep 2023 19:49:21 -0500 Subject: [PATCH 08/31] build: ignore failing delta lake test ingest for now (#1557) --- test_unstructured_ingest/test-ingest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 926821943e..97a5917e5d 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -56,7 +56,7 @@ trap 
print_last_run EXIT for script in "${scripts[@]}"; do CURRENT_SCRIPT=$script - if [[ "$CURRENT_SCRIPT" == "test-ingest-notion.sh" ]]; then + if [[ "$CURRENT_SCRIPT" == "test-ingest-notion.sh" ]] || [[ "$CURRENT_SCRIPT" == "test-ingest-delta-table.sh" ]]; then echo "--------- RUNNING SCRIPT $script --- IGNORING FAILURES" set +e echo "Running ./test_unstructured_ingest/$script" From e5d08662d4b0f38ad5ff8e4548fd24d5f3d53506 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Wed, 27 Sep 2023 21:34:06 -0500 Subject: [PATCH 09/31] enhancement: memory efficient xml partitioning (#1547) Closes #1236. Partitions XML documents iteratively in most cases*, never loading the entire tree into memory. This ends up being much faster. (* The exception is when the argument `xml_path` is passed to filter elements. I was not able to find a way in Python to compare XPaths while streaming the elements, aside from writing a custom XPath parser. So the shortest way forward was to bite the bullet and load the whole tree in memory when filtering by XPath.) Memory usage is about 20% of usage on `main` when processing a 470MB XML file. Time to process is 10s vs 900s. Output is slightly different, but appears to be an improvement, adding lines of text that are skipped in current partitioning. No text is lost. --- CHANGELOG.md | 3 +- .../partition/test_xml_partition.py | 6 +- unstructured/__version__.py | 2 +- unstructured/partition/xml.py | 72 ++++++++++++------- 4 files changed, 51 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98311ce131..8367477168 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.10.17-dev15 +## 0.10.17-dev16 ### Enhancements +* **Improves `partition_xml` to be faster and more memory efficient when partitioning large XML files** The new behavior is to partition iteratively to prevent loading the entire XML tree into memory at once in most use cases. 
* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Add functionality to save embedded images in PDF's separately as images** This allows users to save embedded images in PDF's separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits." * **Azure Cognite Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index. 
diff --git a/test_unstructured/partition/test_xml_partition.py b/test_unstructured/partition/test_xml_partition.py index cebff68b64..a301a46870 100644 --- a/test_unstructured/partition/test_xml_partition.py +++ b/test_unstructured/partition/test_xml_partition.py @@ -38,7 +38,7 @@ def test_partition_xml_from_filename_with_metadata_filename(): ) def test_partition_xml_from_file(filename): file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) - with open(file_path) as f: + with open(file_path, "rb") as f: elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path) assert elements[0].text == "United States" @@ -47,7 +47,7 @@ def test_partition_xml_from_file(filename): def test_partition_xml_from_file_with_metadata_filename(): file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "factbook.xml") - with open(file_path) as f: + with open(file_path, "rb") as f: elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename="test") assert elements[0].text == "United States" @@ -158,7 +158,7 @@ def test_partition_xml_from_filename_exclude_metadata(filename): ) def test_partition_xml_from_file_exclude_metadata(filename): file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) - with open(file_path) as f: + with open(file_path, "rb") as f: elements = partition_xml( file=f, xml_keep_tags=False, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 01785404e1..c555eeb001 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev15" # pragma: no cover +__version__ = "0.10.17-dev16" # pragma: no cover diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index ff733300c5..f8bdc48211 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -1,6 +1,8 @@ -import xml.etree.ElementTree as ET +from io import BytesIO from tempfile import SpooledTemporaryFile -from typing import IO, 
BinaryIO, List, Optional, Union, cast +from typing import IO, BinaryIO, Iterator, List, Optional, Union, cast + +from lxml import etree from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import ( @@ -20,41 +22,57 @@ from unstructured.partition.text import element_from_text -def is_leaf(elem): - return not bool(elem) - - -def is_string(elem): - return isinstance(elem, str) or (hasattr(elem, "text") and isinstance(elem.text, str)) - - def get_leaf_elements( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, text: Optional[str] = None, - xml_path: str = ".", - xml_keep_tags: bool = False, -) -> List[Optional[str]]: + xml_path: Optional[str] = None, +) -> Iterator[Optional[str]]: + """Get leaf elements from the XML tree defined in filename, file, or text.""" exactly_one(filename=filename, file=file, text=text) if filename: - _, raw_text = read_txt_file(filename=filename) + return _get_leaf_elements(filename, xml_path=xml_path) elif file: - f = spooled_to_bytes_io_if_needed( - cast(Union[BinaryIO, SpooledTemporaryFile], file), + f = cast( + IO[bytes], + spooled_to_bytes_io_if_needed( + cast(Union[BinaryIO, SpooledTemporaryFile], file), + ), ) - _, raw_text = read_txt_file(file=f) - elif text: - raw_text = text + return _get_leaf_elements(f, xml_path=xml_path) + else: + b = BytesIO(bytes(cast(str, text), encoding="utf-8")) + return _get_leaf_elements(b, xml_path=xml_path) + + +def _get_leaf_elements( + file: Union[str, IO[bytes]], + xml_path: Optional[str] = None, +) -> Iterator[Optional[str]]: + """Parse the XML tree in a memory efficient manner if possible.""" + element_stack = [] + + element_iterator = etree.iterparse(file, events=("start", "end")) + # NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream + # elements through in a memory efficient way, so we bite the bullet and load it all into + # memory. 
+ if xml_path is not None: + _, element = next(element_iterator) + compiled_path = etree.XPath(xml_path) + element_iterator = (("end", el) for el in compiled_path(element)) + + for event, element in element_iterator: + if event == "start": + element_stack.append(element) - root = ET.fromstring(raw_text) - leaf_elements = [] + if event == "end": + if element.text is not None and element.text.strip(): + yield element.text - for elem in root.findall(xml_path): - for subelem in elem.iter(): - if is_leaf(subelem) and is_string(subelem.text): - leaf_elements.append(subelem.text) + element.clear() - return leaf_elements + while element_stack and element_stack[-1].getparent() is None: + element_stack.pop() @process_metadata() @@ -65,7 +83,7 @@ def partition_xml( file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, text: Optional[str] = None, xml_keep_tags: bool = False, - xml_path: str = ".", + xml_path: Optional[str] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, encoding: Optional[str] = None, From 792232dcc52093360f28ec48e1222dc6c3d9640a Mon Sep 17 00:00:00 2001 From: Trevor Bossert <37596773+tabossert@users.noreply.github.com> Date: Thu, 28 Sep 2023 16:18:14 -0700 Subject: [PATCH 10/31] Chore: move scarf to setup.py (#1569) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This also follows what I have seen as the recommend way to define a file package like this. Also bumps minor versions from pip compile Testing: `pip install -e .` Everything should build as normal `❯ pip install -e . Obtaining file:///Users/trevor/dev/unstructured Installing build dependencies ... done Checking if build backend supports build_editable ... done Getting requirements to build editable ... done Preparing editable metadata (pyproject.toml) ... 
done Collecting scarf@ https://packages.unstructured.io/scarf.tgz (from unstructured==0.10.17.dev16) Using cached https://packages.unstructured.io/scarf.tgz (1.1 kB) Installing build dependencies ... done Getting requirements to build wheel ... done Preparing metadata (pyproject.toml) ... done` When new release goes out, I will test just plain pip install to verify that functionality still works --- requirements/base.in | 3 --- requirements/base.txt | 2 -- setup.py | 7 ++++++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/base.in b/requirements/base.in index 2a0558154a..4a20b179c3 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -11,7 +11,4 @@ emoji dataclasses-json python-iso639 langdetect -# (Trevor): This is a simple hello world package that is used to track -# download count for this package using scarf. -https://packages.unstructured.io/scarf.tgz numpy \ No newline at end of file diff --git a/requirements/base.txt b/requirements/base.txt index 0cbe2afbf1..cfc1b241da 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -50,8 +50,6 @@ regex==2023.8.8 # via nltk requests==2.31.0 # via -r requirements/base.in -scarf @ https://packages.unstructured.io/scarf.tgz - # via -r requirements/base.in six==1.16.0 # via langdetect soupsieve==2.5 diff --git a/setup.py b/setup.py index cbde874f62..7b0b900c46 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,12 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List entry_points={ "console_scripts": ["unstructured-ingest=unstructured.ingest.main:main"], }, - install_requires=load_requirements(), + install_requires=[ + # (Trevor): This is a simple hello world package that is used to track + # download count for this package using scarf. 
+ 'scarf @ https://packages.unstructured.io/scarf.tgz', + load_requirements() + ], extras_require={ # Document specific extra requirements "all-docs": all_doc_reqs, From 4e84e32ed0fd5606ac03e5ee8269d55e8e2aefa6 Mon Sep 17 00:00:00 2001 From: rvztz Date: Thu, 28 Sep 2023 19:32:43 -0600 Subject: [PATCH 11/31] fix: Discord connector when a channel is not found. (#1480) --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- unstructured/ingest/connector/discord.py | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8367477168..300c3e230c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev16 +## 0.10.17-dev17 ### Enhancements @@ -29,6 +29,7 @@ should be generated, however the Formula class inherits from Element instead of allowing the document to be loaded. Fix: Change parent class for Formula to Text. Importance: Crucial to be able to load documents that contain formulas. * **Fixes Sphinx errors.** Fixes errors when running Sphinx `make html` and installs library to suppress warnings. * **Fixes a metadata backwards compatibility error** Problem: When calling `partition_via_api`, the hosted api may return an element schema that's newer than the current `unstructured`. In this case, metadata fields were added which did not exist in the local `ElementMetadata` dataclass, and `__init__()` threw an error. Fix: remove nonexistent fields before instantiating in `ElementMetadata.from_json()`. Importance: Crucial to avoid breaking changes when adding fields. +* **Fixes issue with Discord connector when a channel returns `None`** Problem: Getting the `jump_url` from a nonexistent Discord `channel` fails. Fix: property `jump_url` is now retrieved within the same context as the messages from the channel. Importance: Avoids cascading issues when the connector fails to fetch information about a Discord channel. 
## 0.10.16 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c555eeb001..2f8c77de8b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev16" # pragma: no cover +__version__ = "0.10.17-dev17" # pragma: no cover diff --git a/unstructured/ingest/connector/discord.py b/unstructured/ingest/connector/discord.py index 2f8f689195..d9b40d3bb6 100644 --- a/unstructured/ingest/connector/discord.py +++ b/unstructured/ingest/connector/discord.py @@ -77,6 +77,7 @@ def _get_messages(self): from discord.ext import commands messages: t.List[discord.Message] = [] + jumpurl: t.List[str] = [] intents = discord.Intents.default() intents.message_content = True bot = commands.Bot(command_prefix=">", intents=intents) @@ -88,15 +89,17 @@ async def on_ready(): if self.days: after_date = dt.datetime.utcnow() - dt.timedelta(days=self.days) channel = bot.get_channel(int(self.channel)) + jumpurl.append(channel.jump_url) # type: ignore async for msg in channel.history(after=after_date): # type: ignore messages.append(msg) await bot.close() except Exception: logger.error("Error fetching messages") await bot.close() + raise bot.run(self.token) - jump_url = bot.get_channel(int(self.channel)).jump_url # type: ignore + jump_url = None if len(jumpurl) < 1 else jumpurl[0] return messages, jump_url def update_source_metadata(self, **kwargs): From cd8c6a2e0941af09426de749c0720a6e1af1e2e3 Mon Sep 17 00:00:00 2001 From: Yao You Date: Thu, 28 Sep 2023 21:41:18 -0500 Subject: [PATCH 12/31] fix: occasional SIGABRT with deltalake writer on Linux (#1567) - resolves an issue where occasionally deltalake writer results in SIGABRT even though the writer finished writing the table properly on Linux - this was first observed in the ingest test - Putting the writer into a process mitigates this problem by forcing Python to wait for the deltalake Rust backend to finish its tasks ## test To test this it is best to set up an instance on a Linux
system since the problem has only been observed on Linux so far. Run ```bash PYTHONPATH=. ./unstructured/ingest/main.py delta-table --num-processes 2 --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth --table-uri ../tables/delta/ --preserve-downloads --verbose delta-table --write-column json_data --mode overwrite --table-uri file:///tmp/delta ``` Without this fix we'd occasionally encounter `SIGABRT`. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> --- CHANGELOG.md | 1 + test_unstructured_ingest/test-ingest.sh | 2 +- unstructured/ingest/connector/delta_table.py | 18 ++++++++++++++---- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 300c3e230c..fe3c444ef8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ allowing the document to be loaded. Fix: Change parent class for Formula to Text * **Fixes Sphinx errors.** Fixes errors when running Sphinx `make html` and installs library to suppress warnings. * **Fixes a metadata backwards compatibility error** Problem: When calling `partition_via_api`, the hosted api may return an element schema that's newer than the current `unstructured`. In this case, metadata fields were added which did not exist in the local `ElementMetadata` dataclass, and `__init__()` threw an error. Fix: remove nonexistent fields before instantiating in `ElementMetadata.from_json()`. Importance: Crucial to avoid breaking changes when adding fields. * **Fixes issue with Discord connector when a channel returns `None`** Problem: Getting the `jump_url` from a nonexistent Discord `channel` fails. Fix: property `jump_url` is now retrieved within the same context as the messages from the channel. Importance: Avoids cascading issues when the connector fails to fetch information about a Discord channel.
+* **Fixes occasionally SIGABTR when writing table with `deltalake` on Linux** Problem: occasionally on Linux ingest can throw a `SIGABTR` when writing `deltalake` table even though the table was written correctly. Fix: put the writing function into a `Process` to ensure its execution to the fullest extent before returning to the main process. Importance: Improves stability of connectors using `deltalake` ## 0.10.16 diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 97a5917e5d..926821943e 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -56,7 +56,7 @@ trap print_last_run EXIT for script in "${scripts[@]}"; do CURRENT_SCRIPT=$script - if [[ "$CURRENT_SCRIPT" == "test-ingest-notion.sh" ]] || [[ "$CURRENT_SCRIPT" == "test-ingest-delta-table.sh" ]]; then + if [[ "$CURRENT_SCRIPT" == "test-ingest-notion.sh" ]]; then echo "--------- RUNNING SCRIPT $script --- IGNORING FAILURES" set +e echo "Running ./test_unstructured_ingest/$script" diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py index f471b2b06a..976e5fbddf 100644 --- a/unstructured/ingest/connector/delta_table.py +++ b/unstructured/ingest/connector/delta_table.py @@ -3,6 +3,7 @@ import typing as t from dataclasses import dataclass from datetime import datetime as dt +from multiprocessing import Process from pathlib import Path import pandas as pd @@ -182,8 +183,17 @@ def write(self, docs: t.List[BaseIngestDoc]) -> None: f"writing {len(json_list)} rows to destination " f"table at {self.connector_config.table_uri}", ) - write_deltalake( - table_or_uri=self.connector_config.table_uri, - data=pd.DataFrame(data={self.write_config.write_column: json_list}), - mode=self.write_config.mode, + # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause + # ingest to fail, even though all tasks are completed normally. 
Putting the writer into a + # process mitigates this issue by ensuring python interpreter waits properly for deltalake's + # rust backend to finish + writer = Process( + target=write_deltalake, + kwargs={ + "table_or_uri": self.connector_config.table_uri, + "data": pd.DataFrame(data={self.write_config.write_column: json_list}), + "mode": self.write_config.mode, + }, ) + writer.start() + writer.join() From 94fbbed189515337cf825c1a6aeb52e03929ca12 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 28 Sep 2023 20:48:02 -0700 Subject: [PATCH 13/31] feat: bbox shrinking in xycut algo, better natural reading order (#1560) Closes GH Issue #1233. ### Summary - add functionality to shrink all bounding boxes along x and y axes (still centered around the same center point) before running xy-cut sort ### Evaluation Run the following command for this [PDF](https://utic-dev-tech-fixtures.s3.us-east-2.amazonaws.com/pastebin/patent-11723901-page2.pdf). PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py --- CHANGELOG.md | 1 + .../partition/pdf-image/test_pdf.py | 2 +- .../partition/utils/test_sorting.py | 27 + ...iomedical-Data-Scientists-2-pages.pdf.json | 48 +- .../azure/IRS-form-1987.pdf.json | 168 ++--- .../azure/IRS-form-1987.png.json | 36 +- .../biomed-api/65/11/main.PMC6312790.pdf.json | 162 ++--- .../biomed-api/75/29/main.PMC6312793.pdf.json | 142 ++-- .../07/07/sbaa031.073.PMC7234218.pdf.json | 16 +- .../layout-parser-paper.pdf.json | 406 ++++++------ .../biomed-api/65/11/main.PMC6312790.pdf.json | 30 +- .../biomed-api/75/29/main.PMC6312793.pdf.json | 92 +-- .../07/07/sbaa031.073.PMC7234218.pdf.json | 16 +- .../2023-Jan-economic-outlook.pdf.json | 212 +++--- .../small-pdf-set/Silent-Giant-(1).pdf.json | 108 +-- .../recalibrating-risk-report.pdf.json | 112 ++-- .../2023-Jan-economic-outlook.pdf.json | 622 +++++++++--------- .../small-pdf-set/Silent-Giant-(1).pdf.json | 382 +++++------ .../recalibrating-risk-report.pdf.json | 248 +++---
unstructured/partition/utils/sorting.py | 70 +- 20 files changed, 1484 insertions(+), 1416 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe3c444ef8..c5b030b315 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ### Enhancements +* **Better detection of natural reading order in images and PDF's** The elements returned by partition better reflect natural reading order in some cases, particularly in complicated multi-column layouts, leading to better chunking and retrieval for downstream applications. Achieved by improving the `xy-cut` sorting to preprocess bboxes, shrinking all bounding boxes by 90% along x and y axes (still centered around the same center point), which allows projection lines to be drawn where not possible before if layout bboxes overlapped. * **Improves `partition_xml` to be faster and more memory efficient when partitioning large XML files** The new behavior is to partition iteratively to prevent loading the entire XML tree into memory at once in most use cases. * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Add functionality to save embedded images in PDF's separately as images** This allows users to save embedded images in PDF's separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits." 
diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index d9540344ef..d5dfcb8189 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -479,7 +479,7 @@ def test_partition_pdf_fast_groups_text_in_text_box(): system=expected_coordinate_system_3, ), ) - assert elements[3] == Text("2.5", metadata=expected_elem_metadata_3) + assert elements[2] == Text("2.5", metadata=expected_elem_metadata_3) def test_partition_pdf_with_metadata_filename( diff --git a/test_unstructured/partition/utils/test_sorting.py b/test_unstructured/partition/utils/test_sorting.py index 7bcf7a25d0..2000b4e3a3 100644 --- a/test_unstructured/partition/utils/test_sorting.py +++ b/test_unstructured/partition/utils/test_sorting.py @@ -5,10 +5,19 @@ from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT from unstructured.partition.utils.sorting import ( coord_has_valid_points, + coordinates_to_bbox, + shrink_bbox, sort_page_elements, ) +class MockCoordinatesMetadata(CoordinatesMetadata): + def __init__(self, points): + system = PixelSpace(width=300, height=500) + + super().__init__(points, system) + + def test_coord_valid_coordinates(): coordinates = CoordinatesMetadata([(1, 2), (3, 4), (5, 6), (7, 8)], PixelSpace) assert coord_has_valid_points(coordinates) is True @@ -98,3 +107,21 @@ def test_sort_basic_pos_coordinates(): sorted_elem_text = " ".join([str(elem.text) for elem in sorted_page_elements]) assert sorted_elem_text == "7 8 9" + + +def test_coordinates_to_bbox(): + coordinates_data = MockCoordinatesMetadata([(10, 20), (10, 200), (100, 200), (100, 20)]) + expected_result = (10, 20, 100, 200) + assert coordinates_to_bbox(coordinates_data) == expected_result + + +def test_shrink_bbox(): + bbox = (0, 0, 100, 100) + shrink_factor = 0.5 + expected_result = (25, 25, 75, 75) + assert shrink_bbox(bbox, shrink_factor) == expected_result + + bbox = 
(0, 0, 200, 100) + shrink_factor = 0.9 + expected_result = (10, 5, 190, 95) + assert shrink_bbox(bbox, shrink_factor) == expected_result diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index f3eba38c14..7cbf4decf9 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -266,8 +266,8 @@ "text": "Executive Summary" }, { - "type": "NarrativeText", - "element_id": "2364a6d2f9a3858d51d91b817732e6c9", + "type": "Title", + "element_id": "6712d87f1d156abf6171f700e2875889", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -282,11 +282,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "This report provides recommendations for a scientists based on analysis that draws on opinions of data scientists, curricula for existing science requirements science jobs." + "text": "biomedical" }, { - "type": "Title", - "element_id": "6712d87f1d156abf6171f700e2875889", + "type": "NarrativeText", + "element_id": "2364a6d2f9a3858d51d91b817732e6c9", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -301,7 +301,7 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "biomedical" + "text": "This report provides recommendations for a scientists based on analysis that draws on opinions of data scientists, curricula for existing science requirements science jobs." 
}, { "type": "Title", @@ -836,8 +836,8 @@ "text": "The" }, { - "type": "NarrativeText", - "element_id": "cdc3773cb12cf99d302b9f00c48ae1e8", + "type": "Title", + "element_id": "aa3b88196a6407c3866c85acdcc8c981", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -852,11 +852,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "required of" + "text": "Workforce" }, { - "type": "Title", - "element_id": "aa3b88196a6407c3866c85acdcc8c981", + "type": "NarrativeText", + "element_id": "cdc3773cb12cf99d302b9f00c48ae1e8", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -871,7 +871,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Workforce" + "text": "required of" }, { "type": "NarrativeText", @@ -1083,8 +1083,8 @@ "text": "b)" }, { - "type": "NarrativeText", - "element_id": "1117af46b0a22dd02d3869ab9738a8a8", + "type": "Title", + "element_id": "6b847a0ed0b2c484c73f2749e29b4db5", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1099,11 +1099,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis applied to the descriptions of required offered under the BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A." 
+ "text": "into" }, { - "type": "Title", - "element_id": "6b847a0ed0b2c484c73f2749e29b4db5", + "type": "NarrativeText", + "element_id": "1117af46b0a22dd02d3869ab9738a8a8", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1118,7 +1118,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "into" + "text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis applied to the descriptions of required offered under the BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A." }, { "type": "NarrativeText", @@ -1197,8 +1197,8 @@ "text": "c)" }, { - "type": "NarrativeText", - "element_id": "961a38da2886c3cc25091d912769aa0d", + "type": "Title", + "element_id": "6d0607a7a2ac9823f9fb2a62ea2b7385", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1213,7 +1213,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "job job government (8.5%), (42.4%), industry (83.9%), and nonprofit (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting single ad." 
+ "text": "Desired" }, { "type": "NarrativeText", @@ -1235,8 +1235,8 @@ "text": "important skills that were mentioned multiple times in" }, { - "type": "Title", - "element_id": "6d0607a7a2ac9823f9fb2a62ea2b7385", + "type": "NarrativeText", + "element_id": "961a38da2886c3cc25091d912769aa0d", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1251,7 +1251,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Desired" + "text": "job job government (8.5%), (42.4%), industry (83.9%), and nonprofit (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting single ad." }, { "type": "Title", diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json index 4355d36569..9a30d93103 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json @@ -229,7 +229,7 @@ }, { "type": "NarrativeText", - "element_id": "eb076cfd3d47e546c28611750afedc49", + "element_id": "0b320308ba52d4a9625d29cadfc941a9", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -244,11 +244,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. 
Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and" + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required. Disregard the instructions under Time and" }, { "type": "NarrativeText", - "element_id": "0b320308ba52d4a9625d29cadfc941a9", + "element_id": "eb076cfd3d47e546c28611750afedc49", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -263,7 +263,7 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) 
Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required. Disregard the instructions under Time and" + "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and" }, { "type": "NarrativeText", @@ -305,7 +305,7 @@ }, { "type": "Title", - "element_id": "af8bdf713f162b09567c8d1a3a2d4de7", + "element_id": "5756fb398995bb6518a87637f24f426e", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -320,11 +320,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change." + "text": "Time and Place for Filing" }, { "type": "Title", - "element_id": "5756fb398995bb6518a87637f24f426e", + "element_id": "af8bdf713f162b09567c8d1a3a2d4de7", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -339,7 +339,7 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Time and Place for Filing" + "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change." }, { "type": "NarrativeText", @@ -494,8 +494,8 @@ "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block." }, { - "type": "ListItem", - "element_id": "ede9004eceddf828c2c928f62d0687a0", + "type": "Title", + "element_id": "f1a73e2204a114077f988c9da98d7f8b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -510,11 +510,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Signature Individuals. —An individual desiring the change should sign the application. 
If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. If the individual or firm is also authorized to" + "text": "Signature" }, { - "type": "Title", - "element_id": "f1a73e2204a114077f988c9da98d7f8b", + "type": "ListItem", + "element_id": "ede9004eceddf828c2c928f62d0687a0", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -529,7 +529,7 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Signature" + "text": "Signature Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. 
Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. If the individual or firm is also authorized to" }, { "type": "Title", @@ -704,7 +704,7 @@ }, { "type": "NarrativeText", - "element_id": "751abc8c6a0fa412c3e8c18345f57f95", + "element_id": "678ecc0340dc8848f891bf12a555a3fd", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -719,11 +719,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable." + "text": "If IRS later examines your return for the year of the change or for later years, it has the right to verify your statement at that time." 
}, { "type": "NarrativeText", - "element_id": "678ecc0340dc8848f891bf12a555a3fd", + "element_id": "751abc8c6a0fa412c3e8c18345f57f95", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -738,11 +738,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "If IRS later examines your return for the year of the change or for later years, it has the right to verify your statement at that time." + "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable." }, { - "type": "Title", - "element_id": "136a59b0c53731bc299206fda46e0888", + "type": "NarrativeText", + "element_id": "64758ada28beed36481b14ce8dc67472", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -757,11 +757,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section B-1" + "text": "substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death. (3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period. 
For more information, see section 448 and Temporary Regulations section 1.448-1T." }, { - "type": "NarrativeText", - "element_id": "e4a695ea83818204438fe08add6d1554", + "type": "Title", + "element_id": "53e33d10c9df4a570490182ccef0cd95", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -776,11 +776,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application." + "text": "Section C" }, { "type": "Title", - "element_id": "f63f53aab435b8c9789ab7d6b982db3f", + "element_id": "8d6743276d5bc8e32d0b05ba0b232db8", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -795,11 +795,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Sections B-2 and B-3" + "text": "Section E" }, { - "type": "Title", - "element_id": "4688916bf1d6b205af02a0e954156688", + "type": "ListItem", + "element_id": "86fab9f7b35d56a2d48baf0782b7c53d", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -814,11 +814,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Limitation on the Use of the Cash Method of Accounting. —Except as provided below, C" + "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. 
However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete." }, { - "type": "NarrativeText", - "element_id": "aaf93c2be8f4f2db87bd760783fedfa5", + "type": "ListItem", + "element_id": "84cea2af17bb3760234b42f4ea78e175", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -833,11 +833,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities." + "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts." 
}, { - "type": "NarrativeText", - "element_id": "e5bed7fe04dd22cabe5e5c0362d37743", + "type": "Title", + "element_id": "136a59b0c53731bc299206fda46e0888", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -852,11 +852,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—" + "text": "Section B-1" }, { - "type": "ListItem", - "element_id": "69bd87b2ad5873c030748e62adf61b89", + "type": "NarrativeText", + "element_id": "e4a695ea83818204438fe08add6d1554", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -871,11 +871,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method." + "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application." }, { - "type": "NarrativeText", - "element_id": "0607edfa2419dd0cdc80f457872fe238", + "type": "Title", + "element_id": "f63f53aab435b8c9789ab7d6b982db3f", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -890,11 +890,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(2) Qualified personal service corporations. 
— A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law," + "text": "Sections B-2 and B-3" }, { - "type": "NarrativeText", - "element_id": "50d16fd6b40a428c3befaf6dd19c2dcd", + "type": "Title", + "element_id": "4688916bf1d6b205af02a0e954156688", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -909,11 +909,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)" + "text": "Limitation on the Use of the Cash Method of Accounting. —Except as provided below, C" }, { "type": "NarrativeText", - "element_id": "64758ada28beed36481b14ce8dc67472", + "element_id": "aaf93c2be8f4f2db87bd760783fedfa5", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -928,11 +928,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death. (3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. 
If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period. For more information, see section 448 and Temporary Regulations section 1.448-1T." + "text": "corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities." }, { - "type": "Title", - "element_id": "53e33d10c9df4a570490182ccef0cd95", + "type": "NarrativeText", + "element_id": "e5bed7fe04dd22cabe5e5c0362d37743", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -947,11 +947,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section C" + "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—" }, { - "type": "NarrativeText", - "element_id": "6d2d2cfa00e5a8caec71ba799f60f8c6", + "type": "ListItem", + "element_id": "69bd87b2ad5873c030748e62adf61b89", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -966,11 +966,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change. Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8." 
+ "text": "(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method." }, { "type": "NarrativeText", - "element_id": "357d52f500b965abc29ea60039de4fd8", + "element_id": "6d2d2cfa00e5a8caec71ba799f60f8c6", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -985,11 +985,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:" + "text": "Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change. Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8." 
}, { "type": "NarrativeText", - "element_id": "1ac3e7aa5a6139bd80f05a7ac1f63ddf", + "element_id": "357d52f500b965abc29ea60039de4fd8", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1004,11 +1004,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market). (2) State whether the proposed identification and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726." + "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:" }, { "type": "NarrativeText", - "element_id": "6028c579dc843bb5aa2c704f46085914", + "element_id": "1ac3e7aa5a6139bd80f05a7ac1f63ddf", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1023,11 +1023,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(3) The termination event statement required by section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event." + "text": "(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market). 
(2) State whether the proposed identification and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726." }, { - "type": "Title", - "element_id": "92e21a61e1d872dbbe3e3221a920b409", + "type": "NarrativeText", + "element_id": "6028c579dc843bb5aa2c704f46085914", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1042,11 +1042,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section D" + "text": "(3) The termination event statement required by section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event." }, { - "type": "NarrativeText", - "element_id": "a8e72799229bc2d754f44ea167a6e7d6", + "type": "Title", + "element_id": "92e21a61e1d872dbbe3e3221a920b409", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1061,11 +1061,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Applicants requesting to change their method of valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods." 
+ "text": "Section D" }, { "type": "Title", - "element_id": "8d6743276d5bc8e32d0b05ba0b232db8", + "element_id": "32786e68a6fd82dc356d2d58bf283dc4", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1080,11 +1080,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section E" + "text": "Section G" }, { - "type": "ListItem", - "element_id": "86fab9f7b35d56a2d48baf0782b7c53d", + "type": "NarrativeText", + "element_id": "fa41a857716f30d6bbee384eada72a90", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1099,11 +1099,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete." + "text": "This section Is to be used only to request a change in a method of accounting for depreciation under section 167. Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change. Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. 
Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually)." }, { - "type": "ListItem", - "element_id": "84cea2af17bb3760234b42f4ea78e175", + "type": "Title", + "element_id": "a8155ab3bed92cc259ab58331619e0e1", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1118,11 +1118,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts." + "text": "Section H" }, { - "type": "Title", - "element_id": "32786e68a6fd82dc356d2d58bf283dc4", + "type": "NarrativeText", + "element_id": "cb1f664a186a87f6560cde136d70b558", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1137,11 +1137,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section G" + "text": "Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested." 
}, { "type": "NarrativeText", - "element_id": "fa41a857716f30d6bbee384eada72a90", + "element_id": "86d11953bb813a770ecd242ff97d4e43", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1156,11 +1156,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "This section Is to be used only to request a change in a method of accounting for depreciation under section 167. Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change. Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually)." + "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10." }, { - "type": "Title", - "element_id": "a8155ab3bed92cc259ab58331619e0e1", + "type": "NarrativeText", + "element_id": "0607edfa2419dd0cdc80f457872fe238", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1175,11 +1175,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section H" + "text": "(2) Qualified personal service corporations. 
— A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law," }, { "type": "NarrativeText", - "element_id": "cb1f664a186a87f6560cde136d70b558", + "element_id": "50d16fd6b40a428c3befaf6dd19c2dcd", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1194,11 +1194,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested." + "text": "engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)" }, { "type": "NarrativeText", - "element_id": "86d11953bb813a770ecd242ff97d4e43", + "element_id": "a8e72799229bc2d754f44ea167a6e7d6", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1213,7 +1213,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10." + "text": "Applicants requesting to change their method of valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods." 
}, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index f89aa759ad..5afaa3fefc 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -266,8 +266,8 @@ "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" }, { - "type": "NarrativeText", - "element_id": "b07efea243933525e9ec04a90622508d", + "type": "Title", + "element_id": "11c98a9cbd6a200fbc5b93fed15007ac", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -282,11 +282,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "cash method.—If you are required to change your method of accounting under section, 263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (imiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\"Act\"), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to cchange from the cash method under section 448 have 10 years to take the adjustments into account.) 
Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required" + "text": "Uniform capitalization rules and limitation on" }, { - "type": "Title", - "element_id": "11c98a9cbd6a200fbc5b93fed15007ac", + "type": "NarrativeText", + "element_id": "b07efea243933525e9ec04a90622508d", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -301,7 +301,7 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Uniform capitalization rules and limitation on" + "text": "cash method.—If you are required to change your method of accounting under section, 263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (imiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\"Act\"), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to cchange from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required" }, { "type": "NarrativeText", @@ -475,8 +475,8 @@ "text": "If your application is filed after the 180-day period, itis late. The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev. Proc. 79-63." }, { - "type": "NarrativeText", - "element_id": "ec3c2d03b846d2a186fc9a8f318f688b", + "type": "Title", + "element_id": "025a65465b6fd9635316e92633b24c7e", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -491,11 +491,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Individuals. 
—An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both." + "text": "Identifying Number" }, { - "type": "Title", - "element_id": "025a65465b6fd9635316e92633b24c7e", + "type": "NarrativeText", + "element_id": "ec3c2d03b846d2a186fc9a8f318f688b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -510,7 +510,7 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Identifying Number" + "text": "Individuals. —An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both." }, { "type": "NarrativeText", @@ -532,8 +532,8 @@ "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block," }, { - "type": "ListItem", - "element_id": "f8e8c87d2e958a23153d7f25b159f0ee", + "type": "Title", + "element_id": "55d4f33b09f24dd3b27865a5f34bfeb9", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -548,11 +548,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Individuals.—An individual desiring the change should sign the application. 
Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6." + "text": "Signature tea" }, { - "type": "Title", - "element_id": "55d4f33b09f24dd3b27865a5f34bfeb9", + "type": "ListItem", + "element_id": "f8e8c87d2e958a23153d7f25b159f0ee", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -567,7 +567,7 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Signature tea" + "text": "Individuals.—An individual desiring the change should sign the application. 
Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6." }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index af074dbe60..9c98b4af47 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -580,14 +580,14 @@ "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES." 
}, { - "type": "UncategorizedText", - "element_id": "bbf3f11cb5b43e700273a78d12de55e4", + "type": "Title", + "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "%" + "text": "i" }, { "type": "NarrativeText", @@ -600,24 +600,24 @@ "text": ") r a e y / m m" }, { - "type": "UncategorizedText", - "element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca", + "type": "NarrativeText", + "element_id": "49e7364ce1027887460959b2a757b184", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "(" + "text": "( e t a r n o s o r r o C" }, { "type": "NarrativeText", - "element_id": "49e7364ce1027887460959b2a757b184", + "element_id": "74599fca46202613cccb12e97774b306", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "( e t a r n o s o r r o C" + "text": "E n o i t i b h n I" }, { "type": "Title", @@ -639,55 +639,45 @@ }, "text": "i" }, - { - "type": "UncategorizedText", - "element_id": "ba5ec51d07a4ac0e951608704431d59a", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": ")" - }, { "type": "NarrativeText", - "element_id": "74599fca46202613cccb12e97774b306", + "element_id": "bbe120714b80df07396e808f98b3f354", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "E n o i t i b h n I" + "text": "y c n e c i f f" }, { - "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", + "type": "UncategorizedText", + "element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "i" + "text": "(" }, { - "type": "NarrativeText", - "element_id": "bbe120714b80df07396e808f98b3f354", + "type": "UncategorizedText", + "element_id": "bbf3f11cb5b43e700273a78d12de55e4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, 
- "text": "y c n e c i f f" + "text": "%" }, { "type": "UncategorizedText", - "element_id": "525fbe4b6760bd759bfeeae2ee487f12", + "element_id": "ba5ec51d07a4ac0e951608704431d59a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1" + "text": ")" }, { "type": "UncategorizedText", @@ -731,23 +721,23 @@ }, { "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", + "element_id": "525fbe4b6760bd759bfeeae2ee487f12", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "10" + "text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1" }, { "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "element_id": "4a44dc15364204a80fe80e9039455cc1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "20" + "text": "10" }, { "type": "UncategorizedText", @@ -761,73 +751,73 @@ }, { "type": "UncategorizedText", - "element_id": "d59eced1ded07f84c145592f65bdf854", + "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "40" + "text": "20" }, { "type": "UncategorizedText", - "element_id": "1a6562590ef19d1045d06c4055742d38", + "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "50" + "text": "90" }, { "type": "UncategorizedText", - "element_id": "ff5a1ae012afa5d4c889c50ad427aaf5", + "element_id": "d59eced1ded07f84c145592f65bdf854", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "70" + "text": "40" }, { "type": "UncategorizedText", - "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", + "element_id": 
"48449a14a4ff7d79bb7a1b6f3d488eba", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "60" + "text": "80" }, { "type": "UncategorizedText", - "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", + "element_id": "1a6562590ef19d1045d06c4055742d38", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "80" + "text": "50" }, { "type": "UncategorizedText", - "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63", + "element_id": "ff5a1ae012afa5d4c889c50ad427aaf5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "90" + "text": "70" }, { "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "0" + "text": "60" }, { "type": "UncategorizedText", @@ -939,6 +929,16 @@ }, "text": "2g 4g 6g 8g 10g" }, + { + "type": "UncategorizedText", + "element_id": "5feceb66ffc86f38d952786c6d696c79", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "0" + }, { "type": "UncategorizedText", "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", @@ -1110,14 +1110,14 @@ "text": "Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution." }, { - "type": "Table", - "element_id": "9270ab0a1b3ba26a16991abcd0b45dfe", + "type": "UncategorizedText", + "element_id": "9492908fadeab22ca81f18f2ba4f4f35", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) 
Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919" + "text": "0 2 4 6 8 10" }, { "type": "Title", @@ -1129,15 +1129,25 @@ }, "text": "Inhibitor concentration (g)" }, + { + "type": "Table", + "element_id": "9270ab0a1b3ba26a16991abcd0b45dfe", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919" + }, { "type": "UncategorizedText", - "element_id": "9492908fadeab22ca81f18f2ba4f4f35", + "element_id": "12751f842ba5664e7ad255016dbe371b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "0 2 4 6 8 10" + "text": "0.0335 1.9460 0.0163 0.3233 0.1240 0.0382" }, { "type": "Title", @@ -1151,13 +1161,13 @@ }, { "type": "UncategorizedText", - "element_id": "12751f842ba5664e7ad255016dbe371b", + "element_id": "727d4758bcfadaaf5156b8682cd39810", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "0.0335 1.9460 0.0163 0.3233 0.1240 0.0382" + "text": "0.0409 0.0596 0.2369 0.0540 0.0556 0.0086" }, { "type": "Title", @@ -1170,34 +1180,34 @@ "text": "ba (V/dec)" }, { - "type": "UncategorizedText", - "element_id": "727d4758bcfadaaf5156b8682cd39810", + "type": "Title", + "element_id": "7bc31ed7ab5a625735657499f636c1f2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "0.0409 0.0596 0.2369 0.0540 0.0556 
0.0086" + "text": "Ecorr (V)" }, { - "type": "Title", - "element_id": "7bc31ed7ab5a625735657499f636c1f2", + "type": "UncategorizedText", + "element_id": "2a789110c863b30156d63234c8a51477", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Ecorr (V)" + "text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356" }, { "type": "UncategorizedText", - "element_id": "2a789110c863b30156d63234c8a51477", + "element_id": "d71f426079cb8c2bb3d960ce1e23d290", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356" + "text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05" }, { "type": "Title", @@ -1211,13 +1221,13 @@ }, { "type": "UncategorizedText", - "element_id": "d71f426079cb8c2bb3d960ce1e23d290", + "element_id": "1695e2ad2c62a337b135afbfc79ef69d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05" + "text": "24.0910 121.440 42.121 373.180 305.650 246.080" }, { "type": "Title", @@ -1231,13 +1241,13 @@ }, { "type": "UncategorizedText", - "element_id": "1695e2ad2c62a337b135afbfc79ef69d", + "element_id": "48bbf8e8b874e0e1f32be15f6c07c11c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "24.0910 121.440 42.121 373.180 305.650 246.080" + "text": "2.8163 1.5054 0.9476 0.4318 0.3772 0.0919" }, { "type": "Title", @@ -1249,16 +1259,6 @@ }, "text": "Corrosion rate (mm/year)" }, - { - "type": "UncategorizedText", - "element_id": "48bbf8e8b874e0e1f32be15f6c07c11c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "2.8163 1.5054 0.9476 0.4318 0.3772 0.0919" - }, { "type": "NarrativeText", "element_id": "ef5851c1e7629b7329ac014d7fb9e9e1", @@ -1331,33 +1331,33 @@ }, { "type": 
"UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "a0dfa682f99b0794f40f195f9a7adfcd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "2" + "text": "—=—Cc/0 2+ T T T 1" }, { "type": "UncategorizedText", - "element_id": "a0dfa682f99b0794f40f195f9a7adfcd", + "element_id": "1797d9b8b07f302836186c20a19ebd0b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "—=—Cc/0 2+ T T T 1" + "text": "C/0" }, { "type": "UncategorizedText", - "element_id": "1797d9b8b07f302836186c20a19ebd0b", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "C/0" + "text": "2" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 59ec34c634..d7bdce8ec2 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -200,34 +200,34 @@ "text": "Specifications table" }, { - "type": "NarrativeText", - "element_id": "5c3978ebc42ea4f11240c221ac3be1cf", + "type": "Title", + "element_id": "41e0fa358cefcadbb2633ec45ff2d129", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired" + "text": "Data format Experimental factors" }, { "type": "Title", - "element_id": "41e0fa358cefcadbb2633ec45ff2d129", + "element_id": "27d70c97431a2bec06d0a89368489dfb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Data format Experimental factors" + "text": "Experimental features 
Data source location Data accessibility Related research article" }, { - "type": "Title", - "element_id": "27d70c97431a2bec06d0a89368489dfb", + "type": "NarrativeText", + "element_id": "5c3978ebc42ea4f11240c221ac3be1cf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Experimental features Data source location Data accessibility Related research article" + "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired" }, { "type": "ListItem", @@ -259,16 +259,6 @@ }, "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" }, - { - "type": "NarrativeText", - "element_id": "7c8bc2811f71480b433eb6fee2a3bb33", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing" - }, { "type": "Title", "element_id": "bd7d750cb9f652c80c17a264072b8858", @@ -281,13 +271,13 @@ }, { "type": "NarrativeText", - "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb", + "element_id": "7c8bc2811f71480b433eb6fee2a3bb33", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" + "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing" }, { "type": "Title", @@ -301,13 +291,13 @@ }, { "type": "NarrativeText", - "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f", + "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." 
+ "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" }, { "type": "NarrativeText", @@ -319,6 +309,16 @@ }, "text": "be used for the comparison." }, + { + "type": "NarrativeText", + "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." + }, { "type": "ListItem", "element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa", @@ -340,24 +340,24 @@ "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number" }, { - "type": "NarrativeText", - "element_id": "a18c70d23b71c51ddfe33311232c241c", + "type": "Title", + "element_id": "10c22bcf4c768b515be4e94bcafc71bf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, (m,n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." + "text": "for" }, { - "type": "Title", - "element_id": "10c22bcf4c768b515be4e94bcafc71bf", + "type": "NarrativeText", + "element_id": "a18c70d23b71c51ddfe33311232c241c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "for" + "text": "‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. 
For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, (m,n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." }, { "type": "UncategorizedText", @@ -621,23 +621,23 @@ }, { "type": "NarrativeText", - "element_id": "e731dc92fddc0512e142bfb2bed62bbf", + "element_id": "1c59f2a7ce8a3fa55810df93d58e636e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." + "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule." }, { - "type": "NarrativeText", - "element_id": "1c59f2a7ce8a3fa55810df93d58e636e", + "type": "Title", + "element_id": "252f10c83610ebca1a059c0bae8255eb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. 
For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule." + "text": "f" }, { "type": "NarrativeText", @@ -649,16 +649,6 @@ }, "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l" }, - { - "type": "Title", - "element_id": "252f10c83610ebca1a059c0bae8255eb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "f" - }, { "type": "UncategorizedText", "element_id": "89507815c6b4a6f31e6d3da7fca6b561", @@ -689,6 +679,16 @@ }, "text": "." }, + { + "type": "NarrativeText", + "element_id": "e731dc92fddc0512e142bfb2bed62bbf", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 3 + }, + "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. 
The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." + }, { "type": "UncategorizedText", "element_id": "d8e33a2b60213fb3cebaf5c3a36b0b63", @@ -700,14 +700,14 @@ "text": "Table 1 Average number of locations, times, vehicles and empty travels for each instance size." }, { - "type": "Table", - "element_id": "1d8fd023cd0978f7a6500815d2ad0ef6", + "type": "UncategorizedText", + "element_id": "6d1f07a97479928ee102d525dd11d2d7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Instance size (m, n) Average number of Locations Times Vehicles Possible empty travels (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 ) (16, 3000 1087.20 1101.60 1284.60 2,684,983.60" + "text": "(8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000)" }, { "type": "Title", @@ -720,24 +720,24 @@ "text": "Instance size (m, n)" }, { - "type": "UncategorizedText", - "element_id": "6d1f07a97479928ee102d525dd11d2d7", + "type": "Table", + "element_id": "1d8fd023cd0978f7a6500815d2ad0ef6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "(8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000)" + "text": "Instance size (m, n) Average number of Locations Times Vehicles Possible empty travels (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 
857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 ) (16, 3000 1087.20 1101.60 1284.60 2,684,983.60" }, { - "type": "Title", - "element_id": "47a68d3aa70030f2e7886e3f1cb07c69", + "type": "UncategorizedText", + "element_id": "1cb85e5f94671526c0cf38dc533f87e0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Average number of" + "text": "568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20" }, { "type": "Title", @@ -750,14 +750,14 @@ "text": "Locations" }, { - "type": "UncategorizedText", - "element_id": "1cb85e5f94671526c0cf38dc533f87e0", + "type": "Title", + "element_id": "47a68d3aa70030f2e7886e3f1cb07c69", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20" + "text": "Average number of" }, { "type": "Title", @@ -800,24 +800,24 @@ "text": "652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60" }, { - "type": "Title", - "element_id": "68ec9a56bde1cd8ea67340bf9cb829cb", + "type": "UncategorizedText", + "element_id": "4a30645cb68832ec26e551345d9cff0a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Possible empty travels" + "text": "668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60" }, { - "type": "UncategorizedText", - "element_id": "4a30645cb68832ec26e551345d9cff0a", + "type": "Title", + "element_id": 
"68ec9a56bde1cd8ea67340bf9cb829cb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60" + "text": "Possible empty travels" }, { "type": "NarrativeText", @@ -920,24 +920,24 @@ "text": "l" }, { - "type": "NarrativeText", - "element_id": "78f6ff03dfac8dfb7f319de1e369590d", + "type": "Title", + "element_id": "336074805fc853987abe6f7fe3ad97a6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rg at each depot d. One line for each trip, i= 1,2, ...,n. Each line provides the start location and the end time ¢¢ for the corresponding trip. Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j." + "text": "time" }, { - "type": "Title", - "element_id": "336074805fc853987abe6f7fe3ad97a6", + "type": "NarrativeText", + "element_id": "78f6ff03dfac8dfb7f319de1e369590d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "time" + "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rg at each depot d. One line for each trip, i= 1,2, ...,n. Each line provides the start location and the end time ¢¢ for the corresponding trip. Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j." }, { "type": "Title", @@ -1081,23 +1081,23 @@ }, { "type": "NarrativeText", - "element_id": "16c341408703257ff517dcc76140e2c0", + "element_id": "c4f2c64b5f38feaa921647abceebaec8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. 
Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling" + "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487." }, { "type": "NarrativeText", - "element_id": "c4f2c64b5f38feaa921647abceebaec8", + "element_id": "16c341408703257ff517dcc76140e2c0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487." + "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index c96928b601..24ce361e7b 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -60,34 +60,34 @@ "text": "S6. 
SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NAÏVE, FIRST EPISODE PSYCHOSIS PATIENTS" }, { - "type": "UncategorizedText", - "element_id": "e97f1cf1c49f397732e68cf1efb2355e", + "type": "NarrativeText", + "element_id": "d981d6dfaa8794c0bb733db0965b2831", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy" + "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford" }, { - "type": "NarrativeText", - "element_id": "1252f8d8921acac5f706e4402e504a75", + "type": "UncategorizedText", + "element_id": "e97f1cf1c49f397732e68cf1efb2355e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. 
Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC." 
+ "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy" }, { "type": "NarrativeText", - "element_id": "d981d6dfaa8794c0bb733db0965b2831", + "element_id": "1252f8d8921acac5f706e4402e504a75", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford" + "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. 
EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC." 
}, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 6f3354a254..30302a3ffa 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -49,6 +49,16 @@ }, "text": "1 2" }, + { + "type": "ListItem", + "element_id": "4fcc5b6364213b1efa9272bdce4f9fcd", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca" + }, { "type": "UncategorizedText", "element_id": "cfae0d4248f7142f7b17f826cd7a5192", @@ -79,16 +89,6 @@ }, "text": "2 v 8 4 3 5 1 . 
3 0 1 2 : v i X r a" }, - { - "type": "ListItem", - "element_id": "4fcc5b6364213b1efa9272bdce4f9fcd", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca" - }, { "type": "NarrativeText", "element_id": "be90d2640470e975e3402d19ba2c66cf", @@ -241,23 +241,23 @@ }, { "type": "Title", - "element_id": "c7f4b9a2c7b93fdcc32112de7d9563ba", + "element_id": "50f59772d4134ececeaf37069d480784", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "recognition, and other DIA tasks (Section 3)" + "text": "underlies the off-the-shelf usage" }, { "type": "Title", - "element_id": "50f59772d4134ececeaf37069d480784", + "element_id": "c7f4b9a2c7b93fdcc32112de7d9563ba", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "underlies the off-the-shelf usage" + "text": "recognition, and other DIA tasks (Section 3)" }, { "type": "NarrativeText", @@ -301,23 +301,23 @@ }, { "type": "NarrativeText", - "element_id": "9b8fc4816306f4f1b31874d53134979b", + "element_id": "74a7758f83612467af8eea9d20e4a6f7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "The rest of the paper is organized as follows. Section 2 provides an overview of related work. The core LayoutParser library, DL Model Zoo, and customized model training are described in Section 3, and the DL model hub and commu- nity platform are detailed in Section 4. Section 5 shows two examples of how LayoutParser can be used in practical DIA projects, and Section 6 concludes." 
+ "text": "that require precision, efficiency, and robustness, as well as simple and light- weight document processing tasks focusing on efficacy and flexibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned." }, { "type": "NarrativeText", - "element_id": "74a7758f83612467af8eea9d20e4a6f7", + "element_id": "9b8fc4816306f4f1b31874d53134979b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "that require precision, efficiency, and robustness, as well as simple and light- weight document processing tasks focusing on efficacy and flexibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned." + "text": "The rest of the paper is organized as follows. Section 2 provides an overview of related work. The core LayoutParser library, DL Model Zoo, and customized model training are described in Section 3, and the DL model hub and commu- nity platform are detailed in Section 4. Section 5 shows two examples of how LayoutParser can be used in practical DIA projects, and Section 6 concludes." }, { "type": "Title", @@ -470,15 +470,14 @@ "text": "5" }, { - "type": "Table", - "element_id": "34923b77ca76e1808956ade5e766f7c2", + "type": "NarrativeText", + "element_id": "b51f99cb953082a922ba43c09d4492b3", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5, - "text_as_html": "
Dataset| Base Model'|Large ModelNotes
PubLayNet B8]|F/MMLayouts of modern scientific documents
M-Layouts of scanned modern magazines and scientific reports
F-Layouts of scanned US newspapers from the 20th century
TableBankFFnd business document. Table region on modern scientific
HJDatasetF/M-Layouts of history Japanese documents
" + "page_number": 5 }, - "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents" + "text": "Table 1: Current layout detection models in the LayoutParser model zoo" }, { "type": "NarrativeText", @@ -491,14 +490,15 @@ "text": "PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]" }, { - "type": "NarrativeText", - "element_id": "b51f99cb953082a922ba43c09d4492b3", + "type": "Table", + "element_id": "34923b77ca76e1808956ade5e766f7c2", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 5 + "page_number": 5, + "text_as_html": "
Dataset| Base Model'|Large ModelNotes
PubLayNet B8]|F/MMLayouts of modern scientific documents
M-Layouts of scanned modern magazines and scientific reports
F-Layouts of scanned US newspapers from the 20th century
TableBankFFnd business document. Table region on modern scientific
HJDatasetF/M-Layouts of history Japanese documents
" }, - "text": "Table 1: Current layout detection models in the LayoutParser model zoo" + "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents" }, { "type": "Title", @@ -561,34 +561,34 @@ "text": "1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months." }, { - "type": "NarrativeText", - "element_id": "11dff8778699e76422be6b86c9eaa62a", + "type": "Title", + "element_id": "9f26ca353a2c130a2e32f457d71c1350", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. 
LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:" + "text": "3.1 Layout Detection Models" }, { "type": "NarrativeText", - "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9", + "element_id": "11dff8778699e76422be6b86c9eaa62a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." + "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:" }, { - "type": "Title", - "element_id": "9f26ca353a2c130a2e32f457d71c1350", + "type": "NarrativeText", + "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "3.1 Layout Detection Models" + "text": "layout data structures, which are optimized for efficiency and versatility. 
3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." }, { "type": "NarrativeText", @@ -661,34 +661,34 @@ "text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff" }, { - "type": "NarrativeText", - "element_id": "cafae07120d714f0822e89865adf62da", + "type": "Title", + "element_id": "acd4f4584a990134d927e19b6d7e5f88", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility." + "text": "3.2 Layout Data Structures" }, { "type": "NarrativeText", - "element_id": "7461d30ee7c51c91bca8003792d43bfe", + "element_id": "cafae07120d714f0822e89865adf62da", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 different datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. 
Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5)." + "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility." }, { - "type": "Title", - "element_id": "acd4f4584a990134d927e19b6d7e5f88", + "type": "NarrativeText", + "element_id": "7461d30ee7c51c91bca8003792d43bfe", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "3.2 Layout Data Structures" + "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 different datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5)." }, { "type": "NarrativeText", @@ -721,34 +721,34 @@ "text": "7" }, { - "type": "NarrativeText", - "element_id": "e284bd66511cfa064681253e7ac57a9a", + "type": "Title", + "element_id": "89c6cd1d893f782ea68d75737e3393fd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. 
It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:" + "text": "3.3 OCR" }, { "type": "NarrativeText", - "element_id": "eec800eef6e395c21feacd729868dd18", + "element_id": "e284bd66511cfa064681253e7ac57a9a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort." + "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. 
It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:" }, { - "type": "Title", - "element_id": "89c6cd1d893f782ea68d75737e3393fd", + "type": "NarrativeText", + "element_id": "eec800eef6e395c21feacd729868dd18", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "3.3 OCR" + "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort." }, { "type": "NarrativeText", @@ -831,115 +831,115 @@ "text": "Table 2: All operations supported by the layout elements. The same APIs are supported across different layout element classes including Coordinate types, TextBlock and Layout." }, { - "type": "Table", - "element_id": "f81d4915b54758e0d4d52af3566bb813", + "type": "Title", + "element_id": "abf4059c5c98ff5bbd0dde9f8c2b7c75", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8, - "text_as_html": "
Operation NameDescription
block.pad(top, bottom,right,left)| Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio ; in x and y direction
. block.shift(dx, dy)Move the current block with the shift : : a distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
. block1. intersect (block2)Return the intersection region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
. block1.union(block2)Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
. block1.relative_to(block2)Convert the absolute coordinates of block to ' ' relative coordinates to block2
. block1.condition_on(block2)Calculate the absolute coordinates of blockl given . the canvas block2’s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
" + "page_number": 8 }, - "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. crop_image (image) Obtain the image segments in the block region" + "text": "Operation Name" }, { "type": "Title", - "element_id": "2092f29df87c3cfd32244b325faaba33", + "element_id": "505791f52a5741b58f5dd02836da7b31", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block1.condition on(block2)" + "text": "block1.union(block2)" }, { "type": "Title", - "element_id": "aac9bbf1c375a005651b5d2929778d3b", + "element_id": "acfa5090fbb8986000a92d84d41d8140", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block1.relative to(block2)" + "text": "block1.is in(block2)" }, { "type": "Title", - "element_id": "505791f52a5741b58f5dd02836da7b31", + "element_id": "8dcb74f5ee2eabd0d8e966d46bcdf3be", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block1.union(block2)" + "text": "block.scale(fx, fy)" }, { "type": "Title", - "element_id": "39fca1b21a889218bd84127a4d7f27c5", + "element_id": "1c1464d6a8f85d78202f67293ee7ac42", "metadata": { "data_source": {}, "filetype": 
"application/pdf", "page_number": 8 }, - "text": "block1.intersect(block2)" + "text": "block.shift(dx, dy)" }, { "type": "Title", - "element_id": "1c1464d6a8f85d78202f67293ee7ac42", + "element_id": "39fca1b21a889218bd84127a4d7f27c5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block.shift(dx, dy)" + "text": "block1.intersect(block2)" }, { "type": "Title", - "element_id": "acfa5090fbb8986000a92d84d41d8140", + "element_id": "aac9bbf1c375a005651b5d2929778d3b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block1.is in(block2)" + "text": "block1.relative to(block2)" }, { "type": "Title", - "element_id": "8dcb74f5ee2eabd0d8e966d46bcdf3be", + "element_id": "2092f29df87c3cfd32244b325faaba33", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block.scale(fx, fy)" + "text": "block1.condition on(block2)" }, { - "type": "NarrativeText", - "element_id": "f60c4482bfe6a1b0eb9095bb8cf21e64", + "type": "Table", + "element_id": "f81d4915b54758e0d4d52af3566bb813", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 8 + "page_number": 8, + "text_as_html": "
Operation NameDescription
block.pad(top, bottom,right,left)| Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio ; in x and y direction
. block.shift(dx, dy)Move the current block with the shift : : a distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
. block1. intersect (block2)Return the intersection region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
. block1.union(block2)Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
. block1.relative_to(block2)Convert the absolute coordinates of block to ' ' relative coordinates to block2
. block1.condition_on(block2)Calculate the absolute coordinates of blockl given . the canvas block2’s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
" }, - "text": "block.pad(top, bottom, right, left) Enlarge the current block according to the input" + "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. crop_image (image) Obtain the image segments in the block region" }, { - "type": "Title", - "element_id": "abf4059c5c98ff5bbd0dde9f8c2b7c75", + "type": "NarrativeText", + "element_id": "f60c4482bfe6a1b0eb9095bb8cf21e64", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Operation Name" + "text": "block.pad(top, bottom, right, left) Enlarge the current block according to the input" }, { "type": "Title", - "element_id": "7d52bf6c2abc8aebeda26c2400f00ddd", + "element_id": "526e0087cc3f254d9f86f6c7d8e23d95", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block.crop image(image)" + "text": "Description" }, { "type": "NarrativeText", @@ -952,84 +952,84 @@ "text": "Whether block1 is inside of block2" }, { - "type": "Title", - "element_id": "fdf3d6c91387c02a0cdaa1ff6b3c67c5", + "type": "UncategorizedText", + "element_id": "a270fb0a45b9ed73f992f73dbf0b9a3f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - 
"text": "Obtain the image segments in the block region" + "text": "Move the current block with the shift distances in x and y direction" }, { "type": "NarrativeText", - "element_id": "401c342fc214105b4a45dba74c62cae0", + "element_id": "494d23eb529015f662df16e6da39f810", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs." + "text": "Scale the current block given the ratio in x and y direction" }, { "type": "NarrativeText", - "element_id": "494d23eb529015f662df16e6da39f810", + "element_id": "d3b069f9dcc24bfac92a6de9e26f2501", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Scale the current block given the ratio in x and y direction" + "text": "Convert the absolute coordinates of block1 to relative coordinates to block2" }, { "type": "NarrativeText", - "element_id": "ec0a5482fa70f4d98212b6b3a748003a", + "element_id": "bb15ecc186d598c93a1cffa30e9e1b6e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Return the union region of block1 and block2. Coordinate type to be determined based on the inputs." + "text": "Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates" }, { "type": "NarrativeText", - "element_id": "d3b069f9dcc24bfac92a6de9e26f2501", + "element_id": "401c342fc214105b4a45dba74c62cae0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Convert the absolute coordinates of block1 to relative coordinates to block2" + "text": "Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs." 
}, { - "type": "Title", - "element_id": "526e0087cc3f254d9f86f6c7d8e23d95", + "type": "NarrativeText", + "element_id": "ec0a5482fa70f4d98212b6b3a748003a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Description" + "text": "Return the union region of block1 and block2. Coordinate type to be determined based on the inputs." }, { - "type": "NarrativeText", - "element_id": "bb15ecc186d598c93a1cffa30e9e1b6e", + "type": "Title", + "element_id": "7d52bf6c2abc8aebeda26c2400f00ddd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates" + "text": "block.crop image(image)" }, { - "type": "UncategorizedText", - "element_id": "a270fb0a45b9ed73f992f73dbf0b9a3f", + "type": "Title", + "element_id": "fdf3d6c91387c02a0cdaa1ff6b3c67c5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Move the current block with the shift distances in x and y direction" + "text": "Obtain the image segments in the block region" }, { "type": "Title", @@ -1152,34 +1152,34 @@ "text": "After the training dataset is curated, LayoutParser supports different modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are significantly different and a large training set is available. However, as suggested in Studer et al.’s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally different domains, can still boost model performance. Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets." 
}, { - "type": "FigureCaption", - "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b", + "type": "UncategorizedText", + "element_id": "4a44dc15364204a80fe80e9039455cc1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance" + "text": "10" }, { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", + "type": "NarrativeText", + "element_id": "3993b330c2b3b86513c3edbcd33afc91", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "10" + "text": "Z. Shen et al." }, { - "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "type": "FigureCaption", + "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "Z. Shen et al." 
+ "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance" }, { "type": "NarrativeText", @@ -1363,23 +1363,23 @@ }, { "type": "NarrativeText", - "element_id": "9b51c55d2dd4ffd289138fc4f66e11e6", + "element_id": "164904dc2ff256763b3e64f1b56a784e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identified and rectified via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model." 
+ "text": "To decipher the complicated layout" }, { "type": "NarrativeText", - "element_id": "164904dc2ff256763b3e64f1b56a784e", + "element_id": "9b51c55d2dd4ffd289138fc4f66e11e6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "To decipher the complicated layout" + "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identified and rectified via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model." }, { "type": "NarrativeText", @@ -1393,33 +1393,33 @@ }, { "type": "NarrativeText", - "element_id": "069379b2abcf2bed44f13bdaea90ec2d", + "element_id": "07be9fda679b805e67cf5e563eada033", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR." 
+ "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set." }, { "type": "NarrativeText", - "element_id": "d11adbfd88959ce24fbfdc7f8155e777", + "element_id": "069379b2abcf2bed44f13bdaea90ec2d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "16 This measures the overlap between the detected and ground-truth characters, and" + "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR." 
}, { "type": "NarrativeText", - "element_id": "07be9fda679b805e67cf5e563eada033", + "element_id": "d11adbfd88959ce24fbfdc7f8155e777", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set." + "text": "16 This measures the overlap between the detected and ground-truth characters, and" }, { "type": "NarrativeText", @@ -1643,33 +1643,33 @@ }, { "type": "NarrativeText", - "element_id": "ad1bf75fc53d123c878f8254f9304c9f", + "element_id": "44c5093519506610b07942b24d966d77", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)" + "text": "Hierarchical Image Database. 
In: CVPR09 (2009)" }, { "type": "NarrativeText", - "element_id": "c6e835fe03323406543926cc0f5a94de", + "element_id": "ad1bf75fc53d123c878f8254f9304c9f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)" + "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)" }, { "type": "NarrativeText", - "element_id": "44c5093519506610b07942b24d966d77", + "element_id": "c6e835fe03323406543926cc0f5a94de", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "Hierarchical Image Database. In: CVPR09 (2009)" + "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)" }, { "type": "Title", @@ -1692,164 +1692,164 @@ "text": "15" }, { - "type": "Title", - "element_id": "9b9688203e9cdea89ded788342be4032", + "type": "UncategorizedText", + "element_id": "16390873ae6b6a173fc894a873bab022", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J." + "text": "[9]" }, { "type": "NarrativeText", - "element_id": "62b12089ccbd0d2dd2f6c292cfa6a6fb", + "element_id": "068bf90a7743f50c4a00d4827035e42f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 
3431–3440 (2015)" + "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the" }, { "type": "NarrativeText", - "element_id": "890eb2d0b6b7dbf00a5e0a4ad2f82107", + "element_id": "813cac1316043d454f3c928740435736", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)" + "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)" }, { "type": "NarrativeText", - "element_id": "be647bda3f1ca1b63554ef22d1313a43", + "element_id": "2f103adde52e35a8853cbb476720a6ef", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)" + "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. 
arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)" }, { - "type": "NarrativeText", - "element_id": "09cfad31b28b1315b0bc7bd219136057", + "type": "Title", + "element_id": "4d54eb351d8fc3bfbbf7286aa15eabe3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767" + "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)" }, { "type": "NarrativeText", - "element_id": "80498c312fd32cb744e5953dfef18604", + "element_id": "124b6b55da69fccc1c06568bda34f63c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143" + "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 
770–778 (2016)" }, { - "type": "NarrativeText", - "element_id": "3e0b97d540b7b43ad61292a89a58137f", + "type": "Title", + "element_id": "9b9688203e9cdea89ded788342be4032", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)" + "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J." }, { - "type": "NarrativeText", - "element_id": "f7cfa7ca2e7175d8bdba9c0cb26a7c98", + "type": "UncategorizedText", + "element_id": "e90f44c0e10f9acb4d8f4c5895846d1e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)" + "text": "2007(159), 2 (Jul 2007)" }, { "type": "NarrativeText", - "element_id": "aae12b8f70e03a3e35015ebda5974ebe", + "element_id": "3e0b97d540b7b43ad61292a89a58137f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)" + "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. 
IEEE (2011)" }, { "type": "NarrativeText", - "element_id": "068bf90a7743f50c4a00d4827035e42f", + "element_id": "80498c312fd32cb744e5953dfef18604", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the" + "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143" }, { "type": "NarrativeText", - "element_id": "813cac1316043d454f3c928740435736", + "element_id": "09cfad31b28b1315b0bc7bd219136057", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)" + "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. 
Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767" }, { "type": "NarrativeText", - "element_id": "124b6b55da69fccc1c06568bda34f63c", + "element_id": "be647bda3f1ca1b63554ef22d1313a43", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)" + "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)" }, { - "type": "UncategorizedText", - "element_id": "16390873ae6b6a173fc894a873bab022", + "type": "NarrativeText", + "element_id": "890eb2d0b6b7dbf00a5e0a4ad2f82107", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[9]" + "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)" }, { "type": "NarrativeText", - "element_id": "2f103adde52e35a8853cbb476720a6ef", + "element_id": "62b12089ccbd0d2dd2f6c292cfa6a6fb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)" + "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)" }, { - "type": "UncategorizedText", - "element_id": "e90f44c0e10f9acb4d8f4c5895846d1e", + "type": "NarrativeText", + "element_id": "f7cfa7ca2e7175d8bdba9c0cb26a7c98", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "2007(159), 2 (Jul 2007)" + "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)" }, { - "type": "Title", - "element_id": "4d54eb351d8fc3bfbbf7286aa15eabe3", + "type": "NarrativeText", + "element_id": "aae12b8f70e03a3e35015ebda5974ebe", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)" + "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)" }, { "type": "UncategorizedText", @@ -1862,84 +1862,94 @@ "text": "16" }, { - "type": "Title", - "element_id": "21d399ba787aabbf69a8ca861cbcc4a3", + "type": "NarrativeText", + "element_id": "3993b330c2b3b86513c3edbcd33afc91", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:" + "text": "Z. Shen et al." 
}, { "type": "NarrativeText", - "element_id": "219033258f3fff3de33bed379610c8f3", + "element_id": "1abcfa28cce9b0f5194dec0d534f28e5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)" + "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)" }, { "type": "NarrativeText", - "element_id": "285ce5849d6fd9036e5d16724c024ab9", + "element_id": "f7c67eae65521c3a753337d08c5a7cc3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572–573 (2020)" + "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 
91–99 (2015)" }, { "type": "NarrativeText", - "element_id": "1abcfa28cce9b0f5194dec0d534f28e5", + "element_id": "4f43b2e563a35ae0208a8626f7e3280e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)" + "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)" }, { - "type": "NarrativeText", - "element_id": "f7c67eae65521c3a753337d08c5a7cc3", + "type": "UncategorizedText", + "element_id": "b66713d3f2d1689f9174e1cb87429eed", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91–99 (2015)" + "text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning" + }, + { + "type": "UncategorizedText", + "element_id": "10a3ff59f6157f21733e659a41031f83", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of" }, { "type": "NarrativeText", - "element_id": "a18dcb504d62cb9f8ed4641014b6eeb2", + "element_id": "219033258f3fff3de33bed379610c8f3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. 
IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)" + "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)" }, { "type": "NarrativeText", - "element_id": "4f43b2e563a35ae0208a8626f7e3280e", + "element_id": "285ce5849d6fd9036e5d16724c024ab9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)" + "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 
572–573 (2020)" }, { - "type": "UncategorizedText", - "element_id": "b66713d3f2d1689f9174e1cb87429eed", + "type": "NarrativeText", + "element_id": "a18dcb504d62cb9f8ed4641014b6eeb2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning" + "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)" }, { "type": "NarrativeText", @@ -1972,24 +1982,24 @@ "text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)" }, { - "type": "UncategorizedText", - "element_id": "10a3ff59f6157f21733e659a41031f83", + "type": "Title", + "element_id": "93d261a89a8422fb8d166e6cdf95d8f6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of" + "text": "github.com/facebookresearch/detectron2 (2019)" }, { - "type": "Title", - "element_id": "462753569cb801c6f858759742a93793", + "type": "NarrativeText", + "element_id": "9dce913bddaa63724f5de64e539b7016", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166" + "text": "based layout annotation. 
arXiv preprint arXiv:2010.01762 (2020)" }, { "type": "Title", @@ -2001,35 +2011,25 @@ }, "text": "text and layout for document image understanding (2019)" }, - { - "type": "NarrativeText", - "element_id": "9dce913bddaa63724f5de64e539b7016", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "based layout annotation. arXiv preprint arXiv:2010.01762 (2020)" - }, { "type": "Title", - "element_id": "93d261a89a8422fb8d166e6cdf95d8f6", + "element_id": "21d399ba787aabbf69a8ca861cbcc4a3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "github.com/facebookresearch/detectron2 (2019)" + "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:" }, { - "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "type": "Title", + "element_id": "462753569cb801c6f858759742a93793", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "Z. Shen et al." + "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. 
https://doi.org/10.1109/ICDAR.2019.00166" }, { "type": "Title", diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json index 0f1f15711f..06c384a72c 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json @@ -195,25 +195,25 @@ }, { "type": "Title", - "element_id": "b27e559f6c00d2bde61efba5db252e31", + "element_id": "1064dcef42380cfdb90c668aa3a670a3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1, "links": [] }, - "text": "Materials engineering" + "text": "Table and figure" }, { "type": "Title", - "element_id": "1064dcef42380cfdb90c668aa3a670a3", + "element_id": "b27e559f6c00d2bde61efba5db252e31", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1, "links": [] }, - "text": "Table and figure" + "text": "Materials engineering" }, { "type": "Title", @@ -424,6 +424,17 @@ }, "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1–3 respectively. 
It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule" }, + { + "type": "UncategorizedText", + "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2, + "links": [] + }, + "text": "30" + }, { "type": "Title", "element_id": "e28e0dc941accc8694040c63091b580c", @@ -490,17 +501,6 @@ }, "text": "i" }, - { - "type": "UncategorizedText", - "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2, - "links": [] - }, - "text": "30" - }, { "type": "UncategorizedText", "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json index bf9e4bf189..abcea312b5 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json @@ -265,37 +265,37 @@ "text": "Specifications table" }, { - "type": "NarrativeText", - "element_id": "5c3978ebc42ea4f11240c221ac3be1cf", + "type": "Title", + "element_id": "41e0fa358cefcadbb2633ec45ff2d129", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2, "links": [] }, - "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired" + "text": "Data format Experimental factors" }, { "type": "Title", - "element_id": "41e0fa358cefcadbb2633ec45ff2d129", + "element_id": "27d70c97431a2bec06d0a89368489dfb", "metadata": { "data_source": {}, 
"filetype": "application/pdf", "page_number": 2, "links": [] }, - "text": "Data format Experimental factors" + "text": "Experimental features Data source location Data accessibility Related research article" }, { - "type": "Title", - "element_id": "27d70c97431a2bec06d0a89368489dfb", + "type": "NarrativeText", + "element_id": "5c3978ebc42ea4f11240c221ac3be1cf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2, "links": [] }, - "text": "Experimental features Data source location Data accessibility Related research article" + "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired" }, { "type": "NarrativeText", @@ -336,17 +336,6 @@ }, "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" }, - { - "type": "NarrativeText", - "element_id": "7c8bc2811f71480b433eb6fee2a3bb33", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2, - "links": [] - }, - "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing" - }, { "type": "Title", "element_id": "bd7d750cb9f652c80c17a264072b8858", @@ -360,14 +349,14 @@ }, { "type": "NarrativeText", - "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb", + "element_id": "7c8bc2811f71480b433eb6fee2a3bb33", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2, "links": [] }, - "text": "(cid:2) All the problem instances are available for use without any restrictions. 
(cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" + "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing" }, { "type": "Title", @@ -382,14 +371,14 @@ }, { "type": "NarrativeText", - "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f", + "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2, "links": [] }, - "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." + "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" }, { "type": "NarrativeText", @@ -402,6 +391,17 @@ }, "text": "be used for the comparison." }, + { + "type": "NarrativeText", + "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2, + "links": [] + }, + "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." + }, { "type": "ListItem", "element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa", @@ -706,26 +706,26 @@ "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. 
The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule." }, { - "type": "NarrativeText", - "element_id": "928fa0dcad70f173bc989ee5715375c5", + "type": "Title", + "element_id": "252f10c83610ebca1a059c0bae8255eb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l" + "text": "f" }, { - "type": "Title", - "element_id": "252f10c83610ebca1a059c0bae8255eb", + "type": "NarrativeText", + "element_id": "928fa0dcad70f173bc989ee5715375c5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "f" + "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l" }, { "type": "UncategorizedText", @@ -1152,23 +1152,6 @@ }, "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. 
Toth, A branch and bound algorithm for the multiple depot vehicle scheduling" }, - { - "type": "NarrativeText", - "element_id": "19dee0a4e8fd073350e234b4352b8af6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4, - "links": [ - { - "text": "N . Kliewer , T . Mellouli , L . Suhl , Atime – spacenetworkbasedexactoptimizationmodelformulti - depotbusscheduling , Eur", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", - "start_index": 4 - } - ] - }, - "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur." - }, { "type": "UncategorizedText", "element_id": "bec40b25a277a08de3415e33284fc76d", @@ -1191,6 +1174,23 @@ }, "text": "problem, Networks 19 (5) (1989) 531–548." }, + { + "type": "NarrativeText", + "element_id": "19dee0a4e8fd073350e234b4352b8af6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4, + "links": [ + { + "text": "N . Kliewer , T . Mellouli , L . Suhl , Atime – spacenetworkbasedexactoptimizationmodelformulti - depotbusscheduling , Eur", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", + "start_index": 4 + } + ] + }, + "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur." 
+ }, { "type": "UncategorizedText", "element_id": "5f5ca82752a3220998c06ea0c44eb80e", diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index 5844d4e791..29af05f0b2 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -67,36 +67,36 @@ }, { "type": "UncategorizedText", - "element_id": "e97f1cf1c49f397732e68cf1efb2355e", + "element_id": "5ce0f6dc16582eaf81312c412e99ebb9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1, "links": [] }, - "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy" + "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford" }, { - "type": "NarrativeText", - "element_id": "1252f8d8921acac5f706e4402e504a75", + "type": "UncategorizedText", + "element_id": "e97f1cf1c49f397732e68cf1efb2355e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1, "links": [] }, - "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. 
They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. 
At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC." + "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy" }, { - "type": "UncategorizedText", - "element_id": "5ce0f6dc16582eaf81312c412e99ebb9", + "type": "NarrativeText", + "element_id": "1252f8d8921acac5f706e4402e504a75", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1, "links": [] }, - "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford" + "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. 
Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC." 
}, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index 8a2764011f..b9c9aa49b9 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -132,81 +132,81 @@ "text": "Monetary policy starts to bite. Signs are apparent that monetary policy tightening is starting to cool demand and inflation, but the full impact is unlikely to be realized before 2024. Global headline inflation appears to have peaked in the third quarter of 2022 (Figure 1). Prices of fuel and nonfuel commodities have declined, lowering headline inflation, notably in the United States, the euro area, and Latin America. But underlying (core) inflation has not yet peaked in most economies and remains well above pre-pandemic levels. It has persisted amid second-round effects from earlier cost shocks and tight labor markets with robust wage growth as consumer demand has remained resilient. Medium-term inflation expectations generally remain anchored, but some gauges are up. These developments have caused central banks to raise rates faster than expected, especially in the United States and the euro area, and to signal that rates will stay elevated for longer. Core inflation is declining in some economies that have completed their tightening cycle—such as Brazil. Financial markets are displaying high sensitivity to inflation news, with equity markets rising following recent releases of lower inflation data in anticipation of interest rate cuts (Box 1), despite central banks’ communicating their resolve to tighten policy further. 
With the peak in US headline inflation and an acceleration in rate hikes by several non-US central banks, the dollar has weakened since September but remains significantly stronger than a year ago." }, { - "type": "Title", - "element_id": "0cce65035ca66e9be782c845ddd606e2", + "type": "UncategorizedText", + "element_id": "808caaef5b114d874a25b7fec21b5516", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "Figure 1. Twin Peaks? Headline and Core Inflation (Percent, year over year)" + "text": "18 16 14 12 10 8 6 4 2 0 –2" }, { "type": "UncategorizedText", - "element_id": "808caaef5b114d874a25b7fec21b5516", + "element_id": "28a5aa3897d66de6c31caba99a4c337e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "18 16 14 12 10 8 6 4 2 0 –2" + "text": "–2" }, { "type": "UncategorizedText", - "element_id": "28a5aa3897d66de6c31caba99a4c337e", + "element_id": "c2c7be4534a60790d1d18451c91dc138", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "–2" + "text": "16 14 12 10 8 6 4 2 0" }, { - "type": "NarrativeText", - "element_id": "e26dceaba57a5f670d91ac170e8706d1", + "type": "UncategorizedText", + "element_id": "c7c72889cb49cf43d9bd1f892db1be2c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "Sources: Haver Analytics; and IMF staff calculations. Note: The figure shows the developments in headline and core inflation across 18 advanced economies and 17 emerging market and developing economies. Core inflation is the change in prices for goods and services, but excluding those for food and energy (or the closest available measure). For the euro area (and other European countries for which the data are available), energy, food, alcohol, and tobacco are excluded. The gray bands depict the 10th to 90th percentiles of inflation across economies." 
+ "text": "Jan. 2019" }, { "type": "UncategorizedText", - "element_id": "c2c7be4534a60790d1d18451c91dc138", + "element_id": "c7c72889cb49cf43d9bd1f892db1be2c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "16 14 12 10 8 6 4 2 0" + "text": "Jan. 2019" }, { - "type": "UncategorizedText", - "element_id": "c7c72889cb49cf43d9bd1f892db1be2c", + "type": "Title", + "element_id": "0cce65035ca66e9be782c845ddd606e2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "Jan. 2019" + "text": "Figure 1. Twin Peaks? Headline and Core Inflation (Percent, year over year)" }, { - "type": "UncategorizedText", - "element_id": "c7c72889cb49cf43d9bd1f892db1be2c", + "type": "NarrativeText", + "element_id": "e26dceaba57a5f670d91ac170e8706d1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3, "links": [] }, - "text": "Jan. 2019" + "text": "Sources: Haver Analytics; and IMF staff calculations. Note: The figure shows the developments in headline and core inflation across 18 advanced economies and 17 emerging market and developing economies. Core inflation is the change in prices for goods and services, but excluding those for food and energy (or the closest available measure). For the euro area (and other European countries for which the data are available), energy, food, alcohol, and tobacco are excluded. The gray bands depict the 10th to 90th percentiles of inflation across economies." 
}, { "type": "ListItem", @@ -595,25 +595,25 @@ }, { "type": "ListItem", - "element_id": "afde979c99a73646915fe253c85c5a9c", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "links": [] }, - "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" + "text": "" }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "element_id": "afde979c99a73646915fe253c85c5a9c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "links": [] }, - "text": "" + "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. 
At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" }, { "type": "NarrativeText", @@ -628,25 +628,25 @@ }, { "type": "ListItem", - "element_id": "25e2f1dc031b5421b8a234945098e58b", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6, "links": [] }, - "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. 
In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." + "text": "" }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "element_id": "25e2f1dc031b5421b8a234945098e58b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6, "links": [] }, - "text": "" + "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." 
}, { "type": "NarrativeText", @@ -1123,25 +1123,25 @@ }, { "type": "Title", - "element_id": "24af2841400373443d80b6c91180918b", + "element_id": "e30a554d7d1cbf308651f8c267ad6872", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7, "links": [] }, - "text": "Middle East and Central Asia" + "text": "Brazil Mexico" }, { "type": "Title", - "element_id": "e30a554d7d1cbf308651f8c267ad6872", + "element_id": "24af2841400373443d80b6c91180918b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7, "links": [] }, - "text": "Brazil Mexico" + "text": "Middle East and Central Asia" }, { "type": "Title", @@ -1794,25 +1794,25 @@ }, { "type": "ListItem", - "element_id": "2d14934d52ff357c52e9ae1c38f7390e", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." 
+ "text": "" }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "element_id": "2d14934d52ff357c52e9ae1c38f7390e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "" + "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." }, { "type": "ListItem", @@ -2012,17 +2012,6 @@ }, "text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). 
As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies." }, - { - "type": "NarrativeText", - "element_id": "e118be83abfed92b8969eca98bb4d53b", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 11, - "links": [] - }, - "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." - }, { "type": "Title", "element_id": "57de33ba9eaa9e5980d4cf6da83abf46", @@ -2288,180 +2277,191 @@ "text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report." 
}, { - "type": "Title", - "element_id": "6ef230728534d871e5126e2a55e12b26", + "type": "NarrativeText", + "element_id": "e118be83abfed92b8969eca98bb4d53b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" + "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." 
}, { - "type": "Title", - "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", + "type": "UncategorizedText", + "element_id": "e7f6c011776e8db7cd330b54174fd76f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "Latest" + "text": "6" }, { - "type": "Title", - "element_id": "53d79cec96694df67ce3baff95d8a2e3", + "type": "UncategorizedText", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "October 2022 GFSR" + "text": "5" }, { "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "6" + "text": "4" }, { - "type": "ListItem", - "element_id": "7d4f55875c970d850a152ba1d5ba02a5", + "type": "UncategorizedText", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "1. United States" + "text": "3" }, { - "type": "ListItem", - "element_id": "8e655408cf212df5f74df13e05cdf02c", + "type": "UncategorizedText", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "2. 
Euro area" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "5" + "text": "1" }, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "type": "Title", + "element_id": "6ef230728534d871e5126e2a55e12b26", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "4" + "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "Title", + "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "3" + "text": "Latest" }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "Title", + "element_id": "53d79cec96694df67ce3baff95d8a2e3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "2" + "text": "October 2022 GFSR" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "ListItem", + "element_id": "7d4f55875c970d850a152ba1d5ba02a5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "1" + "text": "1. United States" }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "ListItem", + "element_id": "8e655408cf212df5f74df13e05cdf02c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "Oct. 22" + "text": "2. 
Euro area" }, { - "type": "Title", - "element_id": "24a234895630131d612fc1b4605a256e", + "type": "UncategorizedText", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "Apr. 23" + "text": "5" }, { - "type": "Title", - "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", + "type": "UncategorizedText", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "Oct. 23" + "text": "4" }, { - "type": "Title", - "element_id": "d8478f45b9790d52201238244d0e9698", + "type": "UncategorizedText", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "Dec. 24" + "text": "3" }, { - "type": "Title", - "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", + "type": "UncategorizedText", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "Dec. 26" + "text": "2" + }, + { + "type": "UncategorizedText", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 11, + "links": [] + }, + "text": "1" }, { "type": "Title", @@ -2519,59 +2519,59 @@ "text": "Dec. 26" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "Title", + "element_id": "49cf8421218222b21a0fc54ffce584c9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "5" + "text": "Oct. 
22" }, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "type": "Title", + "element_id": "24a234895630131d612fc1b4605a256e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "4" + "text": "Apr. 23" }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "Title", + "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "3" + "text": "Oct. 23" }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "Title", + "element_id": "d8478f45b9790d52201238244d0e9698", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "2" + "text": "Dec. 24" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "Title", + "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11, "links": [] }, - "text": "1" + "text": "Dec. 
26" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json index 18c9c5ac9e..2751529948 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -1079,179 +1079,179 @@ }, { "type": "Title", - "element_id": "f83714d89302473e0e4f5399bd50e7a9", + "element_id": "3f79bb7b435b05321651daefd374cdc6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "W T" + "text": "e" }, { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", + "type": "UncategorizedText", + "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "e" + "text": "120" }, { - "type": "NarrativeText", - "element_id": "f9bb49945b60897227abdd75b5f8d39b", + "type": "UncategorizedText", + "element_id": "ad57366865126e55649ecb23ae1d4888", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "r e p s e i t i l" + "text": "100" }, { - "type": "Title", - "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", + "type": "UncategorizedText", + "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "a t a F" + "text": "120" }, { "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", + "element_id": "b725d20650649a5221675144bab5946e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "120" + "text": "99.5" }, { - 
"type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", + "type": "Title", + "element_id": "f83714d89302473e0e4f5399bd50e7a9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "100" + "text": "W T" }, { - "type": "UncategorizedText", - "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", + "type": "NarrativeText", + "element_id": "f9bb49945b60897227abdd75b5f8d39b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "80" + "text": "r e p s e i t i l" }, { - "type": "UncategorizedText", - "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", + "type": "Title", + "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "60" + "text": "a t a F" }, { "type": "UncategorizedText", - "element_id": "d59eced1ded07f84c145592f65bdf854", + "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "40" + "text": "80" }, { "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "20" + "text": "60" }, { "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "element_id": "d59eced1ded07f84c145592f65bdf854", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "0" + "text": "40" }, { "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", + "element_id": "ce3201efc2e495241a85e4fc84575f50", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "120" + "text": "71.9" }, { - "type": "Title", - "element_id": 
"6c25ebfc9ffd2510c4c41d4bd5cb7ea9", + "type": "UncategorizedText", + "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "C oal" + "text": "20" }, { "type": "UncategorizedText", - "element_id": "b725d20650649a5221675144bab5946e", + "element_id": "5feceb66ffc86f38d952786c6d696c79", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "99.5" + "text": "0" }, { "type": "Title", - "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", + "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "Oil" + "text": "C oal" }, { - "type": "UncategorizedText", - "element_id": "ce3201efc2e495241a85e4fc84575f50", + "type": "Title", + "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "links": [] }, - "text": "71.9" + "text": "Oil" }, { "type": "Title", @@ -1694,59 +1694,59 @@ "text": "ren. 
& waste" }, { - "type": "Title", - "element_id": "563a2980d46c81119e1d7d952b375a41", + "type": "UncategorizedText", + "element_id": "26d228663f13a88592a12d16cf9587ca", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9, "links": [] }, - "text": "h W T" + "text": "400" }, { - "type": "UncategorizedText", - "element_id": "26d228663f13a88592a12d16cf9587ca", + "type": "Title", + "element_id": "f35457739b3bd74c61625c986c844726", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9, "links": [] }, - "text": "400" + "text": " Nuclear" }, { - "type": "UncategorizedText", - "element_id": "983bd614bb5afece5ab3b6023f71147c", + "type": "Title", + "element_id": "f6e172956a9472fa43f9a895f99c2836", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9, "links": [] }, - "text": "300" + "text": " Natural gas" }, { "type": "Title", - "element_id": "f35457739b3bd74c61625c986c844726", + "element_id": "563a2980d46c81119e1d7d952b375a41", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9, "links": [] }, - "text": " Nuclear" + "text": "h W T" }, { - "type": "Title", - "element_id": "f6e172956a9472fa43f9a895f99c2836", + "type": "UncategorizedText", + "element_id": "983bd614bb5afece5ab3b6023f71147c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9, "links": [] }, - "text": " Natural gas" + "text": "300" }, { "type": "Title", diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json index 2eb819dbb6..a30000b49a 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ 
b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -155,36 +155,36 @@ }, { "type": "Title", - "element_id": "4d7c9c95f808a09f6b0bcfe8b255e537", + "element_id": "d977fff4c69c437aa4a44a5c5f4bf02e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Figure 1. Ordering of perceived risks for 30 activities and technologies1,iii" + "text": "Rank Order Laypersons" }, { "type": "Title", - "element_id": "d977fff4c69c437aa4a44a5c5f4bf02e", + "element_id": "4d7c9c95f808a09f6b0bcfe8b255e537", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Rank Order Laypersons" + "text": "Figure 1. Ordering of perceived risks for 30 activities and technologies1,iii" }, { "type": "UncategorizedText", - "element_id": "4523540f1504cd17100c4835e85b7eef", + "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "17" + "text": "30" }, { "type": "UncategorizedText", @@ -199,36 +199,36 @@ }, { "type": "UncategorizedText", - "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", + "element_id": "4523540f1504cd17100c4835e85b7eef", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "30" + "text": "17" }, { "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "" + "text": "1" }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "4" + "text": "" }, { "type": "UncategorizedText", @@ -243,36 
+243,36 @@ }, { "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "2" + "text": "" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "1" + "text": "2" }, { "type": "UncategorizedText", @@ -287,80 +287,80 @@ }, { "type": "Title", - "element_id": "1656c455012b016fbac5eac0a38397bd", + "element_id": "eda8f72476c539920d2c0e3515ba4b07", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Electric power (non-nuclear)" + "text": "Smoking" }, { "type": "Title", - "element_id": "602d25f25cca4ebb709f8b48f54d99d9", + "element_id": "2f3122790ccc9e095abe1b5ceedddf88", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Motor vehicles" + "text": "X-rays" }, { "type": "Title", - "element_id": "eda8f72476c539920d2c0e3515ba4b07", + "element_id": "ed3861e631428b9b77e2bdc0384d2cbe", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Smoking" + "text": "Vaccinations" }, { "type": "Title", - "element_id": "2f3122790ccc9e095abe1b5ceedddf88", + "element_id": "602d25f25cca4ebb709f8b48f54d99d9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "X-rays" + "text": "Motor vehicles" }, { "type": "Title", - "element_id": 
"ed3861e631428b9b77e2bdc0384d2cbe", + "element_id": "82a60569029ed9032f1b08891e8524c2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Vaccinations" + "text": "Nuclear power" }, { "type": "Title", - "element_id": "82a60569029ed9032f1b08891e8524c2", + "element_id": "f8e3740e358309bd0570d4f3ca141793", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Nuclear power" + "text": "Handguns" }, { "type": "Title", - "element_id": "f8e3740e358309bd0570d4f3ca141793", + "element_id": "1656c455012b016fbac5eac0a38397bd", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "Handguns" + "text": "Electric power (non-nuclear)" }, { "type": "Title", @@ -408,25 +408,25 @@ }, { "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "7" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "7902699be42c8a8e46fbbb4501726517", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "2" + "text": "7" }, { "type": "UncategorizedText", @@ -441,36 +441,36 @@ }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "19581e27de7ced00ff1ce50b2047e7a5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "4" + "text": "9" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "1" + "text": "2" }, { "type": "UncategorizedText", - "element_id": 
"19581e27de7ced00ff1ce50b2047e7a5", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4, "links": [] }, - "text": "9" + "text": "1" }, { "type": "UncategorizedText", @@ -672,36 +672,36 @@ }, { "type": "Title", - "element_id": "f83714d89302473e0e4f5399bd50e7a9", + "element_id": "3f79bb7b435b05321651daefd374cdc6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "links": [] }, - "text": "W T" + "text": "e" }, { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", + "type": "UncategorizedText", + "element_id": "e629fa6598d732768f7c726b4b621285", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "links": [] }, - "text": "e" + "text": "15" }, { - "type": "UncategorizedText", - "element_id": "e629fa6598d732768f7c726b4b621285", + "type": "Title", + "element_id": "f83714d89302473e0e4f5399bd50e7a9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "links": [] }, - "text": "15" + "text": "W T" }, { "type": "NarrativeText", @@ -715,26 +715,26 @@ "text": "r e p s e i t i l" }, { - "type": "Title", - "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", + "type": "UncategorizedText", + "element_id": "4a44dc15364204a80fe80e9039455cc1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "links": [] }, - "text": "a t a F" + "text": "10" }, { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", + "type": "Title", + "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "links": [] }, - "text": "10" + "text": "a t a F" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json 
b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index 6bd7f4d877..4baf9be5a6 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -685,7 +685,7 @@ }, { "type": "Title", - "element_id": "007b2203e9e86a49c3108e9ffd16fbbc", + "element_id": "babfe67b3ecc6b32db9adb9da08274bf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -699,11 +699,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Euro area" + "text": "Jan. 22" }, { "type": "Title", - "element_id": "babfe67b3ecc6b32db9adb9da08274bf", + "element_id": "007b2203e9e86a49c3108e9ffd16fbbc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -717,7 +717,7 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Jan. 22" + "text": "Euro area" }, { "type": "Title", @@ -1279,7 +1279,7 @@ }, { "type": "ListItem", - "element_id": "afde979c99a73646915fe253c85c5a9c", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1293,11 +1293,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. 
At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" + "text": "" }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "element_id": "afde979c99a73646915fe253c85c5a9c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1311,7 +1311,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "" + "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. 
The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" }, { "type": "UncategorizedText", @@ -1459,7 +1459,7 @@ }, { "type": "NarrativeText", - "element_id": "72d289ea524eebcd8f195a8afda1c223", + "element_id": "d24af8f44bd419665bb4ab6efef34fed", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1473,11 +1473,11 @@ "filetype": "application/pdf", "page_number": 6 }, - "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024––above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017–19) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024––still high, but close to the pre-pandemic average." + "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024––above pre-pandemic (2017–19) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. 
Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies." }, { "type": "NarrativeText", - "element_id": "d24af8f44bd419665bb4ab6efef34fed", + "element_id": "72d289ea524eebcd8f195a8afda1c223", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1491,7 +1491,7 @@ "filetype": "application/pdf", "page_number": 6 }, - "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024––above pre-pandemic (2017–19) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies." + "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024––above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017–19) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024––still high, but close to the pre-pandemic average." 
}, { "type": "Title", @@ -1602,8 +1602,8 @@ "text": "Table 1. Overview of the World Economic Outlook Projections (Percent change, unless noted otherwise)" }, { - "type": "Table", - "element_id": "63bdc79def2500227001ac95d78727ab", + "type": "Title", + "element_id": "d11a1c04bd3a9891350b4bd94104df58", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1617,7 +1617,43 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Difference from October 2022 Q4 over Q4 2/ Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . 
Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices Oil 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2 World Consumer Prices 8/ 47 88 6.6 43 04 0.2 9.2 5.0 3.5 Advanced Economies 9/ 34 73 46 26 0.2 02 78 31 23 Emerging Market and Developing Economies 8/ 5.9 99 84 5.5 0.0 02 10.4 66 45," + "text": "Year over Year" + }, + { + "type": "Title", + "element_id": "aa22eb2e58c7cf45c528550d68e15c51", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", + "version": 265756457651539296174748931590365722430, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" + }, + "date_modified": "2023-02-14T07:31:28" + }, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "Difference from October 2022" + }, + { + "type": "Title", + "element_id": "8c327a62ae0e925498f5c68b819b32b4", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", + "version": 265756457651539296174748931590365722430, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" + }, + 
"date_modified": "2023-02-14T07:31:28" + }, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "Q4 over Q4 2/" }, { "type": "Title", @@ -1693,7 +1729,7 @@ }, { "type": "Title", - "element_id": "ad1094978303f5aa32665083ee1ed934", + "element_id": "b2800ff802361713acee893ebae272f6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1707,11 +1743,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Latin America and the Caribbean" + "text": "Saudi Arabia Sub-Saharan Africa" }, { "type": "Title", - "element_id": "24af2841400373443d80b6c91180918b", + "element_id": "6185fd66a4e106814e65c047c15dfb1f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1725,11 +1761,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Middle East and Central Asia" + "text": "Advanced Economies United States Euro Area" }, { "type": "Title", - "element_id": "b2800ff802361713acee893ebae272f6", + "element_id": "24af2841400373443d80b6c91180918b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1743,11 +1779,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Saudi Arabia Sub-Saharan Africa" + "text": "Middle East and Central Asia" }, { "type": "Title", - "element_id": "a4ca51cd6c74adf51f6e9ce60165d047", + "element_id": "7559320d044a32fbb21a7a8da25e9045", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1761,7 +1797,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Emerging Market and Developing Economies Emerging and Developing Asia" + "text": "Japan United Kingdom Canada Other Advanced Economies 3/" }, { "type": "Title", @@ -1783,7 +1819,7 @@ }, { "type": "Title", - "element_id": "6185fd66a4e106814e65c047c15dfb1f", + "element_id": 
"ad1094978303f5aa32665083ee1ed934", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1797,7 +1833,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Advanced Economies United States Euro Area" + "text": "Latin America and the Caribbean" }, { "type": "UncategorizedText", @@ -1819,7 +1855,7 @@ }, { "type": "Title", - "element_id": "7559320d044a32fbb21a7a8da25e9045", + "element_id": "a4ca51cd6c74adf51f6e9ce60165d047", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1833,7 +1869,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Japan United Kingdom Canada Other Advanced Economies 3/" + "text": "Emerging Market and Developing Economies Emerging and Developing Asia" }, { "type": "Title", @@ -1855,7 +1891,7 @@ }, { "type": "Title", - "element_id": "05704f84f4326b5f53a04d62f7ad62fc", + "element_id": "e30a554d7d1cbf308651f8c267ad6872", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1869,7 +1905,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Nigeria South Africa" + "text": "Brazil Mexico" }, { "type": "Title", @@ -1891,7 +1927,7 @@ }, { "type": "Title", - "element_id": "e30a554d7d1cbf308651f8c267ad6872", + "element_id": "18231df9f753f2eca887585247231761", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1905,11 +1941,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Brazil Mexico" + "text": "Germany France Italy Spain" }, { "type": "Title", - "element_id": "18231df9f753f2eca887585247231761", + "element_id": "05704f84f4326b5f53a04d62f7ad62fc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1923,7 +1959,25 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": 
"Germany France Italy Spain" + "text": "Nigeria South Africa" + }, + { + "type": "Table", + "element_id": "63bdc79def2500227001ac95d78727ab", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", + "version": 265756457651539296174748931590365722430, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" + }, + "date_modified": "2023-02-14T07:31:28" + }, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "Difference from October 2022 Q4 over Q4 2/ Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . 
Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices Oil 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2 World Consumer Prices 8/ 47 88 6.6 43 04 0.2 9.2 5.0 3.5 Advanced Economies 9/ 34 73 46 26 0.2 02 78 31 23 Emerging Market and Developing Economies 8/ 5.9 99 84 5.5 0.0 02 10.4 66 45," }, { "type": "UncategorizedText", @@ -1981,7 +2035,7 @@ }, { "type": "UncategorizedText", - "element_id": "9db439c530ed3425c0a68724de199942", + "element_id": "a7143daa9de8af6e0c465ca1354d45b6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1995,11 +2049,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "4.7 3.1 5.9" + "text": "6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9" }, { "type": "UncategorizedText", - "element_id": "2a9680555d457b6da4b6748492bb6f3d", + "element_id": "69dfc187e2e6d907a0546f7e76f8ee3f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2013,11 +2067,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3" + "text": "6.2" }, { "type": "UncategorizedText", - "element_id": 
"a7143daa9de8af6e0c465ca1354d45b6", + "element_id": "2a9680555d457b6da4b6748492bb6f3d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2031,11 +2085,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9" + "text": "5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3" }, { "type": "UncategorizedText", - "element_id": "69dfc187e2e6d907a0546f7e76f8ee3f", + "element_id": "dbc6d298b0672b8176de90a623844b7f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2049,11 +2103,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "6.2" + "text": "6.0 5.5 3.8 4.1 7.0 4.1" }, { "type": "UncategorizedText", - "element_id": "dbc6d298b0672b8176de90a623844b7f", + "element_id": "9db439c530ed3425c0a68724de199942", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2067,7 +2121,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "6.0 5.5 3.8 4.1 7.0 4.1" + "text": "4.7 3.1 5.9" }, { "type": "Title", @@ -2125,7 +2179,7 @@ }, { "type": "UncategorizedText", - "element_id": "6976f35f9f91b539b46743f37d94014a", + "element_id": "b7948d6976e997e76e343161b4b5d864", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2139,11 +2193,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8" + "text": "8.8 7.3 9.9" }, { "type": "UncategorizedText", - "element_id": "743f3bc42f087068035515a8dec4f85a", + "element_id": "72d73db944cf6d9a5f11d6c073c1dce0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2157,11 +2211,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.1 3.7 5.2 5.4 3.8 4.9" + "text": "3.4" }, { 
"type": "UncategorizedText", - "element_id": "72d73db944cf6d9a5f11d6c073c1dce0", + "element_id": "e352203d837b1096ee96e1977f1c3d0b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2175,11 +2229,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.4" + "text": "5.4 6.6 3.4" }, { "type": "UncategorizedText", - "element_id": "b7948d6976e997e76e343161b4b5d864", + "element_id": "743f3bc42f087068035515a8dec4f85a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2193,11 +2247,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "8.8 7.3 9.9" + "text": "3.1 3.7 5.2 5.4 3.8 4.9" }, { "type": "UncategorizedText", - "element_id": "e352203d837b1096ee96e1977f1c3d0b", + "element_id": "6976f35f9f91b539b46743f37d94014a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2211,7 +2265,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "5.4 6.6 3.4" + "text": "2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8" }, { "type": "UncategorizedText", @@ -2267,24 +2321,6 @@ }, "text": "1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0" }, - { - "type": "UncategorizedText", - "element_id": "e7ac421147471fe341ae242e7544a44c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "6.6 4.6 8.1" - }, { "type": "UncategorizedText", "element_id": "1ea8f3c3db2cb6c75f21ebf26acc28a5", @@ -2305,7 +2341,7 @@ }, { "type": "UncategorizedText", - "element_id": "96ccb4fe1ec705d9944d1c1ecf0938ab", + 
"element_id": "098d858ff74b2740723330ff6e43edf8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2319,11 +2355,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.4 0.7 4.3 3.2 4.0 4.9" + "text": "2.4 2.3 2.6" }, { "type": "UncategorizedText", - "element_id": "f491e65f8d4b8dbec7621fcedaf1b7a4", + "element_id": "e7ac421147471fe341ae242e7544a44c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2337,11 +2373,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.9" + "text": "6.6 4.6 8.1" }, { "type": "UncategorizedText", - "element_id": "098d858ff74b2740723330ff6e43edf8", + "element_id": "96ccb4fe1ec705d9944d1c1ecf0938ab", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2355,11 +2391,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.4 2.3 2.6" + "text": "2.4 0.7 4.3 3.2 4.0 4.9" }, { - "type": "Title", - "element_id": "d11a1c04bd3a9891350b4bd94104df58", + "type": "UncategorizedText", + "element_id": "f491e65f8d4b8dbec7621fcedaf1b7a4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2373,7 +2409,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Year over Year" + "text": "2.9" }, { "type": "UncategorizedText", @@ -2413,25 +2449,7 @@ }, { "type": "UncategorizedText", - "element_id": "123157612cd26d61b4760a5ecd1f4bfc", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - 
"text": "2.5 1.8 4.7 3.5 4.1 5.6" - }, - { - "type": "UncategorizedText", - "element_id": "9d1bc5abd6f3e9c4c6ccb572ae521387", + "element_id": "4b48b0469ba9682a3e385ee7fbb6bbed", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2445,11 +2463,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3" + "text": "4.3 2.6 5.5" }, { "type": "UncategorizedText", - "element_id": "7fdc64e781146808df57eac112860f9b", + "element_id": "777e0063772d428bf1c04383b8ad058e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2463,7 +2481,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.4 2.7 4.6" + "text": "1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4" }, { "type": "UncategorizedText", @@ -2485,7 +2503,7 @@ }, { "type": "UncategorizedText", - "element_id": "4b48b0469ba9682a3e385ee7fbb6bbed", + "element_id": "123157612cd26d61b4760a5ecd1f4bfc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2499,11 +2517,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "4.3 2.6 5.5" + "text": "2.5 1.8 4.7 3.5 4.1 5.6" }, { "type": "UncategorizedText", - "element_id": "777e0063772d428bf1c04383b8ad058e", + "element_id": "7fdc64e781146808df57eac112860f9b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2517,11 +2535,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4" + "text": "3.4 2.7 4.6" }, { - "type": "Title", - "element_id": "aa22eb2e58c7cf45c528550d68e15c51", + "type": "UncategorizedText", + "element_id": "9d1bc5abd6f3e9c4c6ccb572ae521387", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2535,7 
+2553,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Difference from October 2022" + "text": "4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3" }, { "type": "Title", @@ -2575,7 +2593,7 @@ }, { "type": "UncategorizedText", - "element_id": "effb80722a72ecff482b7a0d4a027e78", + "element_id": "84bc47d0d0703878a250620230630525", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2589,11 +2607,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "0.3 0.0 –0.2 –0.4 0.4 0.0" + "text": "–3.3 –0.1" }, { "type": "UncategorizedText", - "element_id": "d35a737537febb07f01925c873444cbc", + "element_id": "effb80722a72ecff482b7a0d4a027e78", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2607,7 +2625,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.1 0.0 –0.3" + "text": "0.3 0.0 –0.2 –0.4 0.4 0.0" }, { "type": "UncategorizedText", @@ -2629,7 +2647,7 @@ }, { "type": "UncategorizedText", - "element_id": "84bc47d0d0703878a250620230630525", + "element_id": "d35a737537febb07f01925c873444cbc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2643,7 +2661,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–3.3 –0.1" + "text": "–0.1 0.0 –0.3" }, { "type": "UncategorizedText", @@ -2719,7 +2737,7 @@ }, { "type": "UncategorizedText", - "element_id": "4d702c47ea48fa0dca98ce691995cc1b", + "element_id": "037023840d334f9f357a6c3da2b058ff", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2733,7 +2751,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0" + "text": "–0.1 –0.3 –0.2 0.2 0.0 0.1" }, { "type": "UncategorizedText", @@ -2755,7 +2773,7 @@ }, { "type": "UncategorizedText", - 
"element_id": "037023840d334f9f357a6c3da2b058ff", + "element_id": "ebb1568088af8b7c7b98878b895decaf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2769,7 +2787,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.1 –0.3 –0.2 0.2 0.0 0.1" + "text": "–0.9 0.3" }, { "type": "UncategorizedText", @@ -2791,7 +2809,7 @@ }, { "type": "UncategorizedText", - "element_id": "ebb1568088af8b7c7b98878b895decaf", + "element_id": "4d702c47ea48fa0dca98ce691995cc1b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2805,7 +2823,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.9 0.3" + "text": "–0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0" }, { "type": "UncategorizedText", @@ -2879,24 +2897,6 @@ }, "text": "9.2 7.8 10.4" }, - { - "type": "UncategorizedText", - "element_id": "d7b26ee43ca5481505ca9eb7c3b29b2c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0" - }, { "type": "UncategorizedText", "element_id": "3d5c2c97e00e0c5be2a870cf1cbaac06", @@ -2917,7 +2917,7 @@ }, { "type": "UncategorizedText", - "element_id": "708c57a76a5cf81dc197cc1bd612adb2", + "element_id": "d7b26ee43ca5481505ca9eb7c3b29b2c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2931,11 +2931,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": ". . . . . . . . ." + "text": "2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . 
. 4.6 . . . 2.6 3.0" }, { "type": "UncategorizedText", - "element_id": "eae9d4d60a1fe2df23f7b65ae3d76ca8", + "element_id": "eca06fdd26e513a7b8510c8660228504", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2949,11 +2949,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4" + "text": "1.9" }, { "type": "UncategorizedText", - "element_id": "eca06fdd26e513a7b8510c8660228504", + "element_id": "4d5d14d8c932363fe84036564c6c582b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2967,11 +2967,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "1.9" + "text": "1.7 1.8 3.7 . . . 2.5 . . ." }, { "type": "UncategorizedText", - "element_id": "4d5d14d8c932363fe84036564c6c582b", + "element_id": "708c57a76a5cf81dc197cc1bd612adb2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2985,11 +2985,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "1.7 1.8 3.7 . . . 2.5 . . ." + "text": ". . . . . . . . ." 
}, { - "type": "Title", - "element_id": "8c327a62ae0e925498f5c68b819b32b4", + "type": "UncategorizedText", + "element_id": "eae9d4d60a1fe2df23f7b65ae3d76ca8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3003,7 +3003,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Q4 over Q4 2/" + "text": "1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4" }, { "type": "Title", @@ -3097,7 +3097,7 @@ }, { "type": "UncategorizedText", - "element_id": "3135d2d71bff77be4838a7102bbac5b8", + "element_id": "708c57a76a5cf81dc197cc1bd612adb2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3111,11 +3111,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.2" + "text": ". . . . . . . . ." }, { "type": "UncategorizedText", - "element_id": "708c57a76a5cf81dc197cc1bd612adb2", + "element_id": "3135d2d71bff77be4838a7102bbac5b8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3129,7 +3129,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": ". . . . . . . . ." + "text": "3.2" }, { "type": "UncategorizedText", @@ -3187,7 +3187,7 @@ }, { "type": "UncategorizedText", - "element_id": "07adb8acdd66b5d2490e542ae0604b71", + "element_id": "39b99440eae2f9ee75cf98100c285787", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3201,11 +3201,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8" + "text": "2.5 2.0 4.0 . . . 4.1 . . ." 
}, { "type": "UncategorizedText", - "element_id": "39b99440eae2f9ee75cf98100c285787", + "element_id": "708c57a76a5cf81dc197cc1bd612adb2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3219,7 +3219,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.5 2.0 4.0 . . . 4.1 . . ." + "text": ". . . . . . . . ." }, { "type": "UncategorizedText", @@ -3241,7 +3241,7 @@ }, { "type": "UncategorizedText", - "element_id": "a416ea84421fa7e1351582da48235bac", + "element_id": "1776cf91dccdf2cce268fcee416b28f6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3255,11 +3255,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.0" + "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2" }, { "type": "UncategorizedText", - "element_id": "1776cf91dccdf2cce268fcee416b28f6", + "element_id": "07adb8acdd66b5d2490e542ae0604b71", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3273,11 +3273,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2" + "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8" }, { "type": "UncategorizedText", - "element_id": "708c57a76a5cf81dc197cc1bd612adb2", + "element_id": "a416ea84421fa7e1351582da48235bac", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3291,11 +3291,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": ". . . . . . . . ." 
+ "text": "3.0" }, { "type": "NarrativeText", - "element_id": "df59a495ef85c5f70c5ba5356caf764a", + "element_id": "dd295fca8aff81058c48312a022b69b2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3309,11 +3309,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Upside risks—Plausible upside risks include more favorable surprises to domestic spending—as in the third quarter of 2022—which, however, would increase inflation further. At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:" + "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 
7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024." }, { "type": "NarrativeText", - "element_id": "dd295fca8aff81058c48312a022b69b2", + "element_id": "df59a495ef85c5f70c5ba5356caf764a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3327,7 +3327,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 
5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024." + "text": "Upside risks—Plausible upside risks include more favorable surprises to domestic spending—as in the third quarter of 2022—which, however, would increase inflation further. At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:" }, { "type": "ListItem", @@ -3510,8 +3510,8 @@ "text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital" }, { - "type": "ListItem", - "element_id": "42ac57e394bf7c98d908745cefce0b80", + "type": "NarrativeText", + "element_id": "1bbcee85386321e6e8235a64d4c34d73", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3525,11 +3525,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of" + "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." 
}, { - "type": "NarrativeText", - "element_id": "1bbcee85386321e6e8235a64d4c34d73", + "type": "ListItem", + "element_id": "42ac57e394bf7c98d908745cefce0b80", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3543,7 +3543,7 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." + "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of" }, { "type": "NarrativeText", @@ -3565,7 +3565,7 @@ }, { "type": "ListItem", - "element_id": "2d14934d52ff357c52e9ae1c38f7390e", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3579,11 +3579,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. 
Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." + "text": "" }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "element_id": "2d14934d52ff357c52e9ae1c38f7390e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3597,7 +3597,7 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "" + "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." 
}, { "type": "ListItem", @@ -3960,8 +3960,8 @@ "text": "Strengthening multilateral cooperation—Urgent action is needed to limit the risks stemming from geopolitical fragmentation and to ensure cooperation on fundamental areas of common interest:" }, { - "type": "NarrativeText", - "element_id": "cb704f1b6d23bfe23f6b4109c471ac8b", + "type": "ListItem", + "element_id": "bd7674df887463bc9f05c8030a151dea", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3975,11 +3975,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." + "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global" }, { - "type": "ListItem", - "element_id": "bd7674df887463bc9f05c8030a151dea", + "type": "NarrativeText", + "element_id": "cb704f1b6d23bfe23f6b4109c471ac8b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3993,7 +3993,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global" + "text": "distribution of vaccines and treatments. 
Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." }, { "type": "ListItem", @@ -4014,8 +4014,8 @@ "text": "Strengthening global trade: Strengthening the global trading system would address risks associated" }, { - "type": "NarrativeText", - "element_id": "e6f343736720ae4f9bf5202294c7c9fc", + "type": "Title", + "element_id": "0695b563acde461fc2f8d9aebccf35c7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4029,11 +4029,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." + "text": "with" }, { - "type": "Title", - "element_id": "0695b563acde461fc2f8d9aebccf35c7", + "type": "NarrativeText", + "element_id": "e6f343736720ae4f9bf5202294c7c9fc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4047,7 +4047,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "with" + "text": "trade fragmentation. 
This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." }, { "type": "ListItem", @@ -4194,8 +4194,8 @@ "text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies." }, { - "type": "NarrativeText", - "element_id": "261bebc8fb9b3ed5146d23644639bc26", + "type": "UncategorizedText", + "element_id": "a43f5d32a34c9b54fe96097c3d491389", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4209,11 +4209,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need © —— Sources: Bloomberg Finance L.P.; and IMF staff calculations. 
Note: GFSR = Global Financial Stability Report. to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess theit outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." + "text": "–3" }, { - "type": "NarrativeText", - "element_id": "e118be83abfed92b8969eca98bb4d53b", + "type": "UncategorizedText", + "element_id": "28a5aa3897d66de6c31caba99a4c337e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4227,11 +4227,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." 
+ "text": "–2" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "467792e5d9b6bec26f556875e9ccab10", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4245,11 +4245,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "–1" }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4263,11 +4263,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "1" }, { "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4281,11 +4281,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4299,11 +4299,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "3" }, { "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "element_id": "5feceb66ffc86f38d952786c6d696c79", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4317,11 +4317,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "0" }, { "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4335,11 +4335,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "6" + "text": "5" }, { - "type": "Title", - "element_id": "6ef230728534d871e5126e2a55e12b26", + "type": "UncategorizedText", + "element_id": "e7f6c011776e8db7cd330b54174fd76f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4353,11 +4353,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" + "text": "6" }, { - "type": "Title", - "element_id": "57de33ba9eaa9e5980d4cf6da83abf46", + "type": "UncategorizedText", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4371,11 +4371,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "467792e5d9b6bec26f556875e9ccab10", + "element_id": "7902699be42c8a8e46fbbb4501726517", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4389,11 +4389,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "–1" + "text": "7" }, { "type": "UncategorizedText", - "element_id": "28a5aa3897d66de6c31caba99a4c337e", + "element_id": "4108466a9a52ce87e39eb1836a42f6f2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4407,11 +4407,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "–2" + "text": "2006 08 08" }, { - "type": "UncategorizedText", - "element_id": "a43f5d32a34c9b54fe96097c3d491389", + "type": "Title", + "element_id": "57de33ba9eaa9e5980d4cf6da83abf46", "metadata": { "data_source": { 
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4425,7 +4425,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "–3" + "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)" }, { "type": "NarrativeText", @@ -4446,8 +4446,8 @@ "text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report." }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "UncategorizedText", + "element_id": "aacd834b5cdc64a329e27649143406dd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4461,11 +4461,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "06" }, { "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "element_id": "785329d8f1c63e8d0cdeedba9e6bc2ea", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4479,11 +4479,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "7" + "text": "10 10" }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "1e46bf7c5134da75e3a2aae852d7bddf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4497,11 +4497,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "12 12" }, { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "type": "Title", + "element_id": "4255f2d53f6408c450b02b249d53c220", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4515,11 +4515,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "6" + 
"text": "United States Euro area China Other AEs Other EMs" }, { "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "element_id": "c81a1234a265c680bbc9e96e73073acd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4533,11 +4533,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "14 16 14" }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "b17ef6d19c7a5b1ee83b907c595526dc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4551,11 +4551,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "16" }, { "type": "UncategorizedText", - "element_id": "4108466a9a52ce87e39eb1836a42f6f2", + "element_id": "99cb7a0185216a0acb0ed918e7058868", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4569,11 +4569,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2006 08 08" + "text": "18 18" }, { "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "element_id": "0c5e98c11d7bb005adbaf731ebfbbb2c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4587,11 +4587,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "0" + "text": "20 22 22" }, { "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4605,11 +4605,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "20" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "Title", + "element_id": 
"53d79cec96694df67ce3baff95d8a2e3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4623,11 +4623,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "October 2022 GFSR" }, { - "type": "UncategorizedText", - "element_id": "aacd834b5cdc64a329e27649143406dd", + "type": "NarrativeText", + "element_id": "e118be83abfed92b8969eca98bb4d53b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4641,11 +4641,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "06" + "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." }, { - "type": "Title", - "element_id": "24a234895630131d612fc1b4605a256e", + "type": "NarrativeText", + "element_id": "261bebc8fb9b3ed5146d23644639bc26", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4659,11 +4659,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 
23" + "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need © —— Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report. to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess theit outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." }, { - "type": "ListItem", - "element_id": "7d4f55875c970d850a152ba1d5ba02a5", + "type": "UncategorizedText", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4677,11 +4677,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1. United States" + "text": "3" }, { - "type": "Title", - "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", + "type": "UncategorizedText", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4695,11 +4695,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 
23" + "text": "1" }, { - "type": "Title", - "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", + "type": "UncategorizedText", + "element_id": "e7f6c011776e8db7cd330b54174fd76f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4713,11 +4713,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Latest" + "text": "6" }, { "type": "UncategorizedText", - "element_id": "785329d8f1c63e8d0cdeedba9e6bc2ea", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4731,11 +4731,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "10 10" + "text": "2" }, { - "type": "Title", - "element_id": "d8478f45b9790d52201238244d0e9698", + "type": "UncategorizedText", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4749,11 +4749,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 24" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "1e46bf7c5134da75e3a2aae852d7bddf", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4767,11 +4767,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "12 12" + "text": "5" }, { "type": "Title", - "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", + "element_id": "49cf8421218222b21a0fc54ffce584c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4785,11 +4785,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 26" + "text": "Oct. 
22" }, { "type": "Title", - "element_id": "4255f2d53f6408c450b02b249d53c220", + "element_id": "6ef230728534d871e5126e2a55e12b26", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4803,11 +4803,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "United States Euro area China Other AEs Other EMs" + "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" }, { - "type": "UncategorizedText", - "element_id": "c81a1234a265c680bbc9e96e73073acd", + "type": "Title", + "element_id": "24a234895630131d612fc1b4605a256e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4821,11 +4821,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "14 16 14" + "text": "Apr. 23" }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "ListItem", + "element_id": "7d4f55875c970d850a152ba1d5ba02a5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4839,11 +4839,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "1. United States" }, { "type": "Title", - "element_id": "53d79cec96694df67ce3baff95d8a2e3", + "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4857,11 +4857,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "October 2022 GFSR" + "text": "Oct. 23" }, { - "type": "ListItem", - "element_id": "8e655408cf212df5f74df13e05cdf02c", + "type": "Title", + "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4875,11 +4875,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2. 
Euro area" + "text": "Latest" }, { - "type": "UncategorizedText", - "element_id": "b17ef6d19c7a5b1ee83b907c595526dc", + "type": "Title", + "element_id": "d8478f45b9790d52201238244d0e9698", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4893,11 +4893,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "16" + "text": "Dec. 24" }, { "type": "Title", - "element_id": "24a234895630131d612fc1b4605a256e", + "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4911,11 +4911,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 23" + "text": "Dec. 26" }, { - "type": "UncategorizedText", - "element_id": "99cb7a0185216a0acb0ed918e7058868", + "type": "Title", + "element_id": "49cf8421218222b21a0fc54ffce584c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4929,11 +4929,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "18 18" + "text": "Oct. 22" }, { "type": "Title", - "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", + "element_id": "53d79cec96694df67ce3baff95d8a2e3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4947,11 +4947,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 23" + "text": "October 2022 GFSR" }, { - "type": "UncategorizedText", - "element_id": "0c5e98c11d7bb005adbaf731ebfbbb2c", + "type": "ListItem", + "element_id": "8e655408cf212df5f74df13e05cdf02c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4965,11 +4965,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "20 22 22" + "text": "2. 
Euro area" }, { "type": "Title", - "element_id": "d8478f45b9790d52201238244d0e9698", + "element_id": "24a234895630131d612fc1b4605a256e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4983,11 +4983,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 24" + "text": "Apr. 23" }, { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "type": "Title", + "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5001,11 +5001,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "20" + "text": "Oct. 23" }, { "type": "Title", - "element_id": "53d79cec96694df67ce3baff95d8a2e3", + "element_id": "d8478f45b9790d52201238244d0e9698", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5019,7 +5019,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "October 2022 GFSR" + "text": "Dec. 
24" }, { "type": "Title", @@ -5041,7 +5041,7 @@ }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5055,11 +5055,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "5" }, { "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5073,11 +5073,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5091,11 +5091,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "3" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5109,11 +5109,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5127,7 +5127,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "1" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json 
b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json index b5153e745f..ef0bcde4d8 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -199,7 +199,7 @@ }, { "type": "ListItem", - "element_id": "3cc3e847449fed4fa13bbd94f86e43a9", + "element_id": "9c4387f669c689e9af0a712fd494b2d7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -213,11 +213,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "The need to create a level playing field that values reliability and energy security" + "text": "The need for harmony in the nuclear regulatory environment" }, { "type": "ListItem", - "element_id": "9c4387f669c689e9af0a712fd494b2d7", + "element_id": "93e7dedc9d334470067ad2de1f9ee788", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -231,11 +231,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "The need for harmony in the nuclear regulatory environment" + "text": "The need for a holistic safety paradigm for the whole electricity system." }, { "type": "ListItem", - "element_id": "93e7dedc9d334470067ad2de1f9ee788", + "element_id": "3cc3e847449fed4fa13bbd94f86e43a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -249,7 +249,7 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "The need for a holistic safety paradigm for the whole electricity system." 
+ "text": "The need to create a level playing field that values reliability and energy security" }, { "type": "UncategorizedText", @@ -342,8 +342,8 @@ "text": " Marine" }, { - "type": "Title", - "element_id": "563a2980d46c81119e1d7d952b375a41", + "type": "UncategorizedText", + "element_id": "9925953f1faef050547e5f7b811c3f7d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -357,11 +357,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "h W T" + "text": "40,000" }, { - "type": "UncategorizedText", - "element_id": "9925953f1faef050547e5f7b811c3f7d", + "type": "Title", + "element_id": "d04999bf99ea28fc8a6b20318caac58c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -375,11 +375,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "40,000" + "text": " CSP" }, { - "type": "UncategorizedText", - "element_id": "4ebe55cc1aee6dd892d7182d797d105a", + "type": "Title", + "element_id": "563a2980d46c81119e1d7d952b375a41", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -393,11 +393,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "35,000" + "text": "h W T" }, { "type": "UncategorizedText", - "element_id": "422f240e43a3226f329ba4a0236f587c", + "element_id": "4ebe55cc1aee6dd892d7182d797d105a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -411,11 +411,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "30,000" + "text": "35,000" }, { "type": "UncategorizedText", - "element_id": "c7e6673590d2426f635c9be70bd8f057", + "element_id": "422f240e43a3226f329ba4a0236f587c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -429,11 +429,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "25,000" + "text": "30,000" }, { 
"type": "UncategorizedText", - "element_id": "b6b53b7d4224992f9aa86411bbc3f74b", + "element_id": "c7e6673590d2426f635c9be70bd8f057", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -447,11 +447,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "20,000" + "text": "25,000" }, { "type": "UncategorizedText", - "element_id": "b2ee3509c1fa4f9741f894e592bda9ac", + "element_id": "b6b53b7d4224992f9aa86411bbc3f74b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -465,11 +465,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "15,000" + "text": "20,000" }, { "type": "UncategorizedText", - "element_id": "28ec039832f5bc96c2be0eaee016dafe", + "element_id": "b2ee3509c1fa4f9741f894e592bda9ac", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -483,11 +483,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "10,000" + "text": "15,000" }, { "type": "UncategorizedText", - "element_id": "b2008c37ee3a7cf7ba87f5ad50dd9e11", + "element_id": "28ec039832f5bc96c2be0eaee016dafe", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -501,11 +501,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "5,000" + "text": "10,000" }, { "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "element_id": "b2008c37ee3a7cf7ba87f5ad50dd9e11", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -519,7 +519,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "0" + "text": "5,000" }, { "type": "Title", @@ -648,8 +648,8 @@ "text": "__" }, { - "type": "UncategorizedText", - "element_id": "81a83544cf93c245178cbc1620030f11", + "type": "Title", + "element_id": "8af26217282646d0f64d3e3211f47512", "metadata": { "data_source": { 
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -663,11 +663,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2000" + "text": " Solar PV" }, { - "type": "UncategorizedText", - "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37", + "type": "Title", + "element_id": "6e28663850f2b50ee6af2d4477b410be", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -681,11 +681,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2010" + "text": " Geothermal" }, { - "type": "UncategorizedText", - "element_id": "73a2af8864fc500fa49048bf3003776c", + "type": "Title", + "element_id": "7e2f430d44cfb03dca12ffde615c36ec", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -699,11 +699,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2020" + "text": " Wind" }, { - "type": "UncategorizedText", - "element_id": "8e1f192fe25ad49be764c3f55c68beb3", + "type": "Title", + "element_id": "bde9df80639b681edb85ace46b4d4600", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -717,11 +717,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2030" + "text": " Bioenergy" }, { - "type": "UncategorizedText", - "element_id": "df34d853f2f2f1f14b92359f695426dc", + "type": "Title", + "element_id": "b449cd843dc44ab907e1e9ed9c30d92e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -735,11 +735,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2040" + "text": " Hydro" }, { "type": "Title", - "element_id": "d04999bf99ea28fc8a6b20318caac58c", + "element_id": "f35457739b3bd74c61625c986c844726", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -753,11 +753,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": 
" CSP" + "text": " Nuclear" }, { "type": "Title", - "element_id": "8af26217282646d0f64d3e3211f47512", + "element_id": "0f3341ae76e0d4d7816d3620bd915110", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -771,11 +771,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Solar PV" + "text": " Gas" }, { "type": "Title", - "element_id": "6e28663850f2b50ee6af2d4477b410be", + "element_id": "b001a2374d44e3085e712bb40f66270e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -789,11 +789,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Geothermal" + "text": " Oil" }, { "type": "Title", - "element_id": "7e2f430d44cfb03dca12ffde615c36ec", + "element_id": "90ad0c8c14253135efd14645e0156145", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -807,11 +807,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Wind" + "text": " Coal" }, { - "type": "Title", - "element_id": "bde9df80639b681edb85ace46b4d4600", + "type": "UncategorizedText", + "element_id": "5feceb66ffc86f38d952786c6d696c79", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -825,11 +825,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Bioenergy" + "text": "0" }, { - "type": "Title", - "element_id": "b449cd843dc44ab907e1e9ed9c30d92e", + "type": "UncategorizedText", + "element_id": "81a83544cf93c245178cbc1620030f11", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -843,11 +843,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Hydro" + "text": "2000" }, { - "type": "Title", - "element_id": "f35457739b3bd74c61625c986c844726", + "type": "UncategorizedText", + "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37", "metadata": { "data_source": { 
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -861,11 +861,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Nuclear" + "text": "2010" }, { - "type": "Title", - "element_id": "0f3341ae76e0d4d7816d3620bd915110", + "type": "UncategorizedText", + "element_id": "73a2af8864fc500fa49048bf3003776c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -879,11 +879,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Gas" + "text": "2020" }, { - "type": "Title", - "element_id": "b001a2374d44e3085e712bb40f66270e", + "type": "UncategorizedText", + "element_id": "8e1f192fe25ad49be764c3f55c68beb3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -897,11 +897,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Oil" + "text": "2030" }, { - "type": "Title", - "element_id": "90ad0c8c14253135efd14645e0156145", + "type": "UncategorizedText", + "element_id": "df34d853f2f2f1f14b92359f695426dc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -915,7 +915,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Coal" + "text": "2040" }, { "type": "NarrativeText", @@ -972,8 +972,8 @@ "text": "Despite the very considerable efforts to decarbonize the economy and the countless billions spent, our world remains heavily addicted to fossil fuels. The trend is clear – instead of reducing our dependence on fossil fuels, we are increasing it (Figure 2). As a direct result, greenhouse gas emissions continue to rise when they need to drastically fall." 
}, { - "type": "Title", - "element_id": "a5d60fc4dbbd484074d8389c35703cf7", + "type": "UncategorizedText", + "element_id": "ebc18f485dc347b842b3d248d011ce6c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -987,11 +987,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "h W G" + "text": "30,000,000" }, { - "type": "UncategorizedText", - "element_id": "ebc18f485dc347b842b3d248d011ce6c", + "type": "Title", + "element_id": "a5d60fc4dbbd484074d8389c35703cf7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1005,7 +1005,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "30,000,000" + "text": "h W G" }, { "type": "UncategorizedText", @@ -1098,8 +1098,8 @@ "text": "5,000,000" }, { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "type": "Title", + "element_id": "e3cf3e34001852adb7a17cf424bda9fc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1113,11 +1113,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "0" + "text": " High-carbon  Low-carbon" }, { - "type": "Title", - "element_id": "e3cf3e34001852adb7a17cf424bda9fc", + "type": "UncategorizedText", + "element_id": "5feceb66ffc86f38d952786c6d696c79", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1131,7 +1131,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " High-carbon  Low-carbon" + "text": "0" }, { "type": "UncategorizedText", @@ -1908,8 +1908,8 @@ "text": "140" }, { - "type": "Title", - "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", + "type": "NarrativeText", + "element_id": "e11247712b3df61756970b45f019ad68", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1923,11 +1923,11 @@ "filetype": "application/pdf", 
"page_number": 8 }, - "text": "a t a F" + "text": "r a e y" }, { - "type": "NarrativeText", - "element_id": "e11247712b3df61756970b45f019ad68", + "type": "Title", + "element_id": "3f79bb7b435b05321651daefd374cdc6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1941,11 +1941,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "r a e y" + "text": "e" }, { - "type": "Title", - "element_id": "f83714d89302473e0e4f5399bd50e7a9", + "type": "UncategorizedText", + "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1959,11 +1959,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "W T" + "text": "120" }, { - "type": "NarrativeText", - "element_id": "f9bb49945b60897227abdd75b5f8d39b", + "type": "UncategorizedText", + "element_id": "ad57366865126e55649ecb23ae1d4888", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1977,11 +1977,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "r e p s e i t i l" + "text": "100" }, { "type": "UncategorizedText", - "element_id": "380918b946a526640a40df5dced65167", + "element_id": "5bddd069fd77ec5699d9ab00c00f47c4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1995,11 +1995,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "=" + "text": "1 :" }, { "type": "UncategorizedText", - "element_id": "911bc18af1665a604b4fa4a97d47f477", + "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2013,11 +2013,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "“99 :" + "text": "120" }, { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", + "type": "UncategorizedText", + 
"element_id": "e7ac0786668e0ff0f02b62bd04f45ff6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2031,11 +2031,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "e" + "text": ":" }, { "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", + "element_id": "b725d20650649a5221675144bab5946e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2049,11 +2049,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "120" + "text": "99.5" }, { - "type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", + "type": "Title", + "element_id": "f83714d89302473e0e4f5399bd50e7a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2067,11 +2067,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "100" + "text": "W T" }, { "type": "UncategorizedText", - "element_id": "d59eced1ded07f84c145592f65bdf854", + "element_id": "380918b946a526640a40df5dced65167", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2085,11 +2085,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "40" + "text": "=" }, { - "type": "UncategorizedText", - "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", + "type": "NarrativeText", + "element_id": "f9bb49945b60897227abdd75b5f8d39b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2103,11 +2103,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "60" + "text": "r e p s e i t i l" }, { "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2121,11 +2121,11 @@ "filetype": 
"application/pdf", "page_number": 8 }, - "text": "20" + "text": "80" }, { "type": "UncategorizedText", - "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", + "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2139,11 +2139,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "80" + "text": "60" }, { "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "element_id": "ce3201efc2e495241a85e4fc84575f50", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2157,11 +2157,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "0" + "text": "71.9" }, { "type": "UncategorizedText", - "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2175,11 +2175,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": ":" + "text": "1" }, { - "type": "UncategorizedText", - "element_id": "5bddd069fd77ec5699d9ab00c00f47c4", + "type": "Title", + "element_id": "1b16b1df538ba12dc3f97edbb85caa70", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2193,11 +2193,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "1 :" + "text": "n" }, { - "type": "Title", - "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", + "type": "UncategorizedText", + "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2211,11 +2211,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "C oal" + "text": "." 
}, { "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2229,11 +2229,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "120" + "text": "1" }, { - "type": "UncategorizedText", - "element_id": "b725d20650649a5221675144bab5946e", + "type": "Title", + "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2247,11 +2247,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "99.5" + "text": "a t a F" }, { - "type": "Title", - "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", + "type": "UncategorizedText", + "element_id": "d59eced1ded07f84c145592f65bdf854", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2265,11 +2265,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Oil" + "text": "40" }, { - "type": "Title", - "element_id": "4fabb98454d019811a732c4a09f31bf0", + "type": "UncategorizedText", + "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2283,11 +2283,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "N atural gas" + "text": ":" }, { "type": "UncategorizedText", - "element_id": "ce3201efc2e495241a85e4fc84575f50", + "element_id": "911bc18af1665a604b4fa4a97d47f477", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2301,11 +2301,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "71.9" + "text": "“99 :" }, { - "type": "Title", - "element_id": "593cbe414f10662e62c0da03ce3302b8", + "type": "UncategorizedText", + "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", "metadata": { 
"data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2319,11 +2319,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "fe)" + "text": "20" }, { - "type": "Title", - "element_id": "77cf83b127020f3a465005abc747e63f", + "type": "UncategorizedText", + "element_id": "5feceb66ffc86f38d952786c6d696c79", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2337,11 +2337,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Offshore wind" + "text": "0" }, { - "type": "UncategorizedText", - "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6", + "type": "Title", + "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2355,11 +2355,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": ":" + "text": "C oal" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "Title", + "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2373,11 +2373,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "1" + "text": "Oil" }, { "type": "Title", - "element_id": "1b16b1df538ba12dc3f97edbb85caa70", + "element_id": "4fabb98454d019811a732c4a09f31bf0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2391,11 +2391,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "n" + "text": "N atural gas" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "Title", + "element_id": "593cbe414f10662e62c0da03ce3302b8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2409,11 +2409,11 @@ "filetype": "application/pdf", "page_number": 8 }, - 
"text": "1" + "text": "fe)" }, { - "type": "UncategorizedText", - "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa", + "type": "Title", + "element_id": "77cf83b127020f3a465005abc747e63f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2427,7 +2427,7 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "." + "text": "Offshore wind" }, { "type": "UncategorizedText", @@ -2646,8 +2646,8 @@ "text": "100" }, { - "type": "UncategorizedText", - "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63", + "type": "Title", + "element_id": "90ad0c8c14253135efd14645e0156145", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2661,11 +2661,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "90" + "text": " Coal" }, { - "type": "Title", - "element_id": "90ad0c8c14253135efd14645e0156145", + "type": "UncategorizedText", + "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2679,7 +2679,7 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": " Coal" + "text": "90" }, { "type": "Title", @@ -3187,7 +3187,7 @@ }, { "type": "Title", - "element_id": "563a2980d46c81119e1d7d952b375a41", + "element_id": "f6e172956a9472fa43f9a895f99c2836", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3201,11 +3201,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "h W T" + "text": " Natural gas" }, { - "type": "UncategorizedText", - "element_id": "983bd614bb5afece5ab3b6023f71147c", + "type": "Title", + "element_id": "563a2980d46c81119e1d7d952b375a41", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3219,11 +3219,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "300" + "text": "h W T" }, { "type": 
"UncategorizedText", - "element_id": "27badc983df1780b60c2b3fa9d3a19a0", + "element_id": "983bd614bb5afece5ab3b6023f71147c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3237,11 +3237,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "200" + "text": "300" }, { "type": "UncategorizedText", - "element_id": "0b06ee5051e3d7dd686665a41ae1f2d9", + "element_id": "27badc983df1780b60c2b3fa9d3a19a0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3255,11 +3255,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "y ——" + "text": "200" }, { - "type": "ListItem", - "element_id": "bda050585a00f0f6cb502350559d7553", + "type": "UncategorizedText", + "element_id": "0b06ee5051e3d7dd686665a41ae1f2d9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3273,7 +3273,7 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "—" + "text": "y ——" }, { "type": "ListItem", @@ -3293,24 +3293,6 @@ }, "text": "—" }, - { - "type": "Title", - "element_id": "f6e172956a9472fa43f9a895f99c2836", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": " Natural gas" - }, { "type": "Title", "element_id": "b449cd843dc44ab907e1e9ed9c30d92e", @@ -3474,8 +3456,8 @@ "text": "Figure 6. 
The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand x" }, { - "type": "FigureCaption", - "element_id": "eeda9f9210dfe4be7e82b4385290d3ca", + "type": "NarrativeText", + "element_id": "4f5cc927b953f3c49c562a22c88f863f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3489,11 +3471,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "One fuel pellet contains as much energy as a tonne of coal" + "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times." }, { - "type": "NarrativeText", - "element_id": "4f5cc927b953f3c49c562a22c88f863f", + "type": "FigureCaption", + "element_id": "eeda9f9210dfe4be7e82b4385290d3ca", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3507,7 +3489,7 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times." 
+ "text": "One fuel pellet contains as much energy as a tonne of coal" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json index a2f18f4d0d..a0b176312b 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -307,7 +307,7 @@ }, { "type": "UncategorizedText", - "element_id": "4523540f1504cd17100c4835e85b7eef", + "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -321,7 +321,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "17" + "text": "30" }, { "type": "UncategorizedText", @@ -343,7 +343,7 @@ }, { "type": "UncategorizedText", - "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", + "element_id": "4523540f1504cd17100c4835e85b7eef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -357,7 +357,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "30" + "text": "17" }, { "type": "UncategorizedText", @@ -415,7 +415,7 @@ }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -429,11 +429,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2" + "text": "1" }, { "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -447,11 +447,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "3" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -465,7 +465,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "1" + "text": "3" }, { "type": "UncategorizedText", @@ -505,7 +505,7 @@ }, { "type": "Title", - "element_id": "1656c455012b016fbac5eac0a38397bd", + "element_id": "eda8f72476c539920d2c0e3515ba4b07", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -519,11 +519,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Electric power (non-nuclear)" + "text": "Smoking" }, { "type": "Title", - "element_id": "ed3861e631428b9b77e2bdc0384d2cbe", + "element_id": "f8e3740e358309bd0570d4f3ca141793", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -537,11 +537,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Vaccinations" + "text": "Handguns" }, { "type": "Title", - "element_id": "eda8f72476c539920d2c0e3515ba4b07", + "element_id": "ed3861e631428b9b77e2bdc0384d2cbe", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -555,11 +555,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Smoking" + "text": "Vaccinations" }, { "type": "Title", - "element_id": "f8e3740e358309bd0570d4f3ca141793", + "element_id": "602d25f25cca4ebb709f8b48f54d99d9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -573,11 +573,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": 
"Handguns" + "text": "Motor vehicles" }, { "type": "Title", - "element_id": "602d25f25cca4ebb709f8b48f54d99d9", + "element_id": "82a60569029ed9032f1b08891e8524c2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -591,11 +591,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Motor vehicles" + "text": "Nuclear power" }, { "type": "Title", - "element_id": "82a60569029ed9032f1b08891e8524c2", + "element_id": "1656c455012b016fbac5eac0a38397bd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -609,7 +609,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Nuclear power" + "text": "Electric power (non-nuclear)" }, { "type": "Title", @@ -703,7 +703,7 @@ }, { "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -717,11 +717,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "19581e27de7ced00ff1ce50b2047e7a5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -735,11 +735,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "4" + "text": "9" }, { "type": "UncategorizedText", - "element_id": "19581e27de7ced00ff1ce50b2047e7a5", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -753,11 +753,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "9" + "text": "1" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": 
"7902699be42c8a8e46fbbb4501726517", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -771,11 +771,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "1" + "text": "7" }, { "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -789,11 +789,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "7" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -807,7 +807,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2" + "text": "" }, { "type": "NarrativeText", @@ -1116,8 +1116,8 @@ "text": "r a e y" }, { - "type": "UncategorizedText", - "element_id": "dca468ba69cda6650ce03d976c274c66", + "type": "Title", + "element_id": "3f79bb7b435b05321651daefd374cdc6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1131,11 +1131,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "S15" + "text": "e" }, { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", + "type": "UncategorizedText", + "element_id": "e629fa6598d732768f7c726b4b621285", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1149,11 +1149,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "e" + "text": "15" }, { "type": "UncategorizedText", - "element_id": "e629fa6598d732768f7c726b4b621285", + "element_id": "dca468ba69cda6650ce03d976c274c66", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1167,7 +1167,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "15" + "text": "S15" }, { "type": "Title", @@ -1206,8 +1206,8 @@ "text": "r e p s e i t i l" }, { - "type": "Title", - "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", + "type": "UncategorizedText", + "element_id": "4a44dc15364204a80fe80e9039455cc1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1221,11 +1221,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "a t a F" + "text": "10" }, { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", + "type": "Title", + "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1239,7 +1239,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "10" + "text": "a t a F" }, { "type": "UncategorizedText", @@ -1261,7 +1261,7 @@ }, { "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "element_id": "8bf40d0515e8461bd30866c2eb8ac250", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1275,11 +1275,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "0" + "text": "4.6" }, { "type": "UncategorizedText", - "element_id": "8bf40d0515e8461bd30866c2eb8ac250", + "element_id": "c020bad937ece011339d7447ee0ac9fa", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1293,11 +1293,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "4.6" + "text": "2.8" }, { - "type": "Title", - "element_id": "51229f9593cbcb7c8e25059c004d67b0", + "type": "UncategorizedText", + "element_id": "5feceb66ffc86f38d952786c6d696c79", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1311,11 +1311,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "|| es" + "text": "0" }, { "type": "Title", - "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", + "element_id": "51229f9593cbcb7c8e25059c004d67b0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1329,11 +1329,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "C oal" + "text": "|| es" }, { "type": "Title", - "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", + "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1347,11 +1347,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Oil" + "text": "C oal" }, { "type": "Title", - "element_id": "3a21fb0158c2ea04834163deee74a836", + "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1365,11 +1365,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Bio m ass" + "text": "Oil" }, { "type": "Title", - "element_id": "4fabb98454d019811a732c4a09f31bf0", + "element_id": "3a21fb0158c2ea04834163deee74a836", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1383,11 +1383,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "N atural gas" + "text": "Bio m ass" }, { - "type": "UncategorizedText", - "element_id": "c020bad937ece011339d7447ee0ac9fa", + "type": "Title", + "element_id": "4fabb98454d019811a732c4a09f31bf0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1401,7 +1401,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "2.8" + "text": "N atural gas" }, { "type": 
"Title", @@ -2179,7 +2179,7 @@ }, { "type": "NarrativeText", - "element_id": "d85940c91ae6b53fc4b41bd5137e7371", + "element_id": "e72fdf383c0b4d8cba0284d4f7ff06d5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2193,11 +2193,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-" + "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" }, { - "type": "NarrativeText", - "element_id": "26a84724035df76d7d8a6610a6fa4627", + "type": "Title", + "element_id": "5d7f49449ab22deac22d767b89549c55", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2211,11 +2211,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/" + "text": "ii" }, { - "type": "NarrativeText", - "element_id": "94178a8c2e84bf4b8f2eed9c79d7cfd5", + "type": "Title", + "element_id": "f5557d4fcf727a981a3c315aca733eef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2229,11 +2229,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-" + "text": "iii" }, { - "type": "NarrativeText", - "element_id": "794a96b3ab9a3e860f65549c3a106704", + "type": "Title", + "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2247,11 +2247,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "viii National Cancer Institute (2020). 
Cancer statistics. Available at: https://www.cancer.gov/about-cancer/" + "text": "v" }, { - "type": "NarrativeText", - "element_id": "9a236889bced20048d1619798291d194", + "type": "Title", + "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2265,7 +2265,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a" + "text": "vi" }, { "type": "NarrativeText", @@ -2286,8 +2286,8 @@ "text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP." }, { - "type": "Title", - "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", + "type": "NarrativeText", + "element_id": "794a96b3ab9a3e860f65549c3a106704", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2301,11 +2301,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "v" + "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/" }, { - "type": "Title", - "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", + "type": "NarrativeText", + "element_id": "94178a8c2e84bf4b8f2eed9c79d7cfd5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2319,7 +2319,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "vi" + "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-" }, { "type": "NarrativeText", @@ -2340,8 +2340,8 @@ "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). 
Report of the United Nations Scientific" }, { - "type": "Title", - "element_id": "f5557d4fcf727a981a3c315aca733eef", + "type": "NarrativeText", + "element_id": "d85940c91ae6b53fc4b41bd5137e7371", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2355,11 +2355,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "iii" + "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-" }, { - "type": "Title", - "element_id": "5d7f49449ab22deac22d767b89549c55", + "type": "NarrativeText", + "element_id": "9a236889bced20048d1619798291d194", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2373,11 +2373,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "ii" + "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a" }, { "type": "NarrativeText", - "element_id": "b6c39a9b3890b5132e4310c83d06b310", + "element_id": "26a84724035df76d7d8a6610a6fa4627", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2391,11 +2391,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper." + "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/" }, { - "type": "NarrativeText", - "element_id": "c328c06c32c00c43471cd3c9d257c68b", + "type": "Title", + "element_id": "6e98dee26ce2439cd4b8af82426e894e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2409,11 +2409,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "International Energy Agency (2020). 
Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" + "text": "understanding/statistics" }, { - "type": "NarrativeText", - "element_id": "6bbd046b939157389606adf4059fe1f3", + "type": "Title", + "element_id": "759772833f6756e511150b2a49233864", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2427,11 +2427,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" + "text": "professional/cancer-statistics/risk" }, { - "type": "NarrativeText", - "element_id": "2ef1e8614bc32af635d2a0c894b2ed3c", + "type": "Title", + "element_id": "86c0a0cef7faa217f386f75ead17dbec", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2445,11 +2445,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." + "text": "sheets/detail/climate-change-and-health" }, { - "type": "NarrativeText", - "element_id": "d5658e2a49995a2f4ca4b45d95f2058b", + "type": "Title", + "element_id": "7267222b91f507e040c69dad9af7941f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2463,11 +2463,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "global assessment of the burden of disease from environmental risks’. 
Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" + "text": "the-full-costs-of-electricity-provision?details=true" }, { "type": "NarrativeText", - "element_id": "e4d7c811a799c3c8e706125556f8a370", + "element_id": "2ef1e8614bc32af635d2a0c894b2ed3c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2481,11 +2481,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712" + "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." }, { - "type": "Title", - "element_id": "6e98dee26ce2439cd4b8af82426e894e", + "type": "NarrativeText", + "element_id": "e4d7c811a799c3c8e706125556f8a370", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2499,11 +2499,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "understanding/statistics" + "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712" }, { - "type": "Title", - "element_id": "759772833f6756e511150b2a49233864", + "type": "NarrativeText", + "element_id": "98e5f594de0e79990a0650489fdf295c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2517,11 +2517,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "professional/cancer-statistics/risk" + "text": "Committee on the Effects of Atomic Radiation. 
Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" }, { - "type": "Title", - "element_id": "7267222b91f507e040c69dad9af7941f", + "type": "NarrativeText", + "element_id": "d5658e2a49995a2f4ca4b45d95f2058b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2535,11 +2535,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "the-full-costs-of-electricity-provision?details=true" + "text": "global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" }, { "type": "NarrativeText", - "element_id": "e72fdf383c0b4d8cba0284d4f7ff06d5", + "element_id": "c328c06c32c00c43471cd3c9d257c68b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2553,11 +2553,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" + "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" }, { - "type": "Title", - "element_id": "86c0a0cef7faa217f386f75ead17dbec", + "type": "NarrativeText", + "element_id": "6bbd046b939157389606adf4059fe1f3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2571,11 +2571,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "sheets/detail/climate-change-and-health" + "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). 
Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" }, { "type": "NarrativeText", - "element_id": "98e5f594de0e79990a0650489fdf295c", + "element_id": "b6c39a9b3890b5132e4310c83d06b310", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2589,7 +2589,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" + "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper." }, { "type": "UncategorizedText", diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py index 0c5382f75d..3607a21cde 100644 --- a/unstructured/partition/utils/sorting.py +++ b/unstructured/partition/utils/sorting.py @@ -1,18 +1,62 @@ -from typing import List +import os +from typing import List, Tuple import numpy as np from unstructured.documents.elements import CoordinatesMetadata, Element from unstructured.logger import trace_logger -from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT +from unstructured.partition.utils.constants import ( + SORT_MODE_BASIC, + SORT_MODE_XY_CUT, +) from unstructured.partition.utils.xycut import recursive_xy_cut -def coordinates_to_bbox(coordinates: CoordinatesMetadata) -> List[int]: +def coordinates_to_bbox(coordinates: CoordinatesMetadata) -> Tuple[int, int, int, int]: + """ + Convert coordinates to a bounding box representation. + + Parameters: + coordinates (CoordinatesMetadata): Metadata containing points to represent the bounding box. + + Returns: + Tuple[int, int, int, int]: A tuple representing the bounding box in the format + (left, top, right, bottom). 
+ """ + points = coordinates.points left, top = points[0] right, bottom = points[2] - return [int(left), int(top), int(right), int(bottom)] + return int(left), int(top), int(right), int(bottom) + + +def shrink_bbox(bbox: Tuple[int, int, int, int], shrink_factor) -> Tuple[int, int, int, int]: + """ + Shrink a bounding box by a given shrink factor while maintaining its center. + + Parameters: + bbox (Tuple[int, int, int, int]): The original bounding box represented by + (left, top, right, bottom). + shrink_factor (float): The factor by which to shrink the bounding box (0.0 to 1.0). + + Returns: + Tuple[int, int, int, int]: The shrunken bounding box represented by + (left, top, right, bottom). + """ + + left, top, right, bottom = bbox + width = right - left + height = bottom - top + new_width = width * shrink_factor + new_height = height * shrink_factor + dw = (width - new_width) / 2 + dh = (height - new_height) / 2 + + new_left = left + dw + new_right = right - dw + new_top = top + dh + new_bottom = bottom - dh + return int(new_left), int(new_top), int(new_right), int(new_bottom) def coord_has_valid_points(coordinates: CoordinatesMetadata) -> bool: @@ -37,6 +81,7 @@ def coord_has_valid_points(coordinates: CoordinatesMetadata) -> bool: def sort_page_elements( page_elements: List[Element], sort_mode: str = SORT_MODE_XY_CUT, + shrink_factor: float = 0.9, ) -> List[Element]: """ Sorts a list of page elements based on the specified sorting mode. @@ -57,6 +102,10 @@ def sort_page_elements( - List[Element]: A list of sorted page elements. 
""" + shrink_factor = float( + os.environ.get("UNSTRUCTURED_XY_CUT_BBOX_SHRINK_FACTOR", shrink_factor), + ) + if not page_elements: return [] @@ -82,9 +131,18 @@ def _coords_ok(strict_points: bool): if sort_mode == SORT_MODE_XY_CUT: if not _coords_ok(strict_points=True): return page_elements - boxes = [coordinates_to_bbox(coords) for coords in coordinates_list] + shrunken_bboxes = [] + for coords in coordinates_list: + bbox = coordinates_to_bbox(coords) + shrunken_bbox = shrink_bbox(bbox, shrink_factor) + shrunken_bboxes.append(shrunken_bbox) + res: List[int] = [] - recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res) + recursive_xy_cut( + np.asarray(shrunken_bboxes).astype(int), + np.arange(len(shrunken_bboxes)), + res, + ) sorted_page_elements = [page_elements[i] for i in res] elif sort_mode == SORT_MODE_BASIC: if not _coords_ok(strict_points=False): From 44f5605ef3b54febdf6252ecd6fb6626f2c40e7e Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 28 Sep 2023 21:48:19 -0700 Subject: [PATCH 14/31] build(image): call python3 not python for image compat (#1574) Fixes docker exec unstructured-smoke-test /bin/bash -c /home/notebook-user/test_unstructured_ingest/test-ingest-wikipedia.sh /home/notebook-user/test_unstructured_ingest/test-ingest-wikipedia.sh: line 10: python: command not found in https://github.com/Unstructured-IO/unstructured/blob/6ad4971/scripts/docker-smoke-test.sh#L43 that was preventing docker images from being built. 
--- test_unstructured_ingest/test-ingest-against-api.sh | 2 +- test_unstructured_ingest/test-ingest-airtable-diff.sh | 2 +- test_unstructured_ingest/test-ingest-airtable-large.sh | 2 +- test_unstructured_ingest/test-ingest-azure.sh | 2 +- test_unstructured_ingest/test-ingest-biomed-api.sh | 2 +- test_unstructured_ingest/test-ingest-biomed-path.sh | 2 +- test_unstructured_ingest/test-ingest-box.sh | 2 +- test_unstructured_ingest/test-ingest-confluence-diff.sh | 2 +- test_unstructured_ingest/test-ingest-confluence-large.sh | 2 +- test_unstructured_ingest/test-ingest-delta-table.sh | 2 +- test_unstructured_ingest/test-ingest-discord.sh | 2 +- test_unstructured_ingest/test-ingest-dropbox.sh | 2 +- test_unstructured_ingest/test-ingest-elasticsearch.sh | 2 +- test_unstructured_ingest/test-ingest-gcs.sh | 2 +- test_unstructured_ingest/test-ingest-github.sh | 2 +- test_unstructured_ingest/test-ingest-gitlab.sh | 2 +- test_unstructured_ingest/test-ingest-google-drive.sh | 2 +- test_unstructured_ingest/test-ingest-jira.sh | 2 +- .../test-ingest-local-single-file-with-encoding.sh | 2 +- ...t-ingest-local-single-file-with-pdf-infer-table-structure.sh | 2 +- test_unstructured_ingest/test-ingest-local-single-file.sh | 2 +- test_unstructured_ingest/test-ingest-local.sh | 2 +- test_unstructured_ingest/test-ingest-notion.sh | 2 +- test_unstructured_ingest/test-ingest-onedrive.sh | 2 +- test_unstructured_ingest/test-ingest-outlook.sh | 2 +- test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh | 2 +- test_unstructured_ingest/test-ingest-s3.sh | 2 +- test_unstructured_ingest/test-ingest-salesforce.sh | 2 +- test_unstructured_ingest/test-ingest-sharepoint.sh | 2 +- test_unstructured_ingest/test-ingest-slack.sh | 2 +- test_unstructured_ingest/test-ingest-wikipedia.sh | 2 +- 31 files changed, 31 insertions(+), 31 deletions(-) diff --git a/test_unstructured_ingest/test-ingest-against-api.sh b/test_unstructured_ingest/test-ingest-against-api.sh index 15200172d6..3f7a43d807 100755 --- 
a/test_unstructured_ingest/test-ingest-against-api.sh +++ b/test_unstructured_ingest/test-ingest-against-api.sh @@ -10,7 +10,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=api-ingest-output OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-airtable-diff.sh b/test_unstructured_ingest/test-ingest-airtable-diff.sh index e7b557a64d..11727e298a 100755 --- a/test_unstructured_ingest/test-ingest-airtable-diff.sh +++ b/test_unstructured_ingest/test-ingest-airtable-diff.sh @@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=airtable-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-airtable-large.sh b/test_unstructured_ingest/test-ingest-airtable-large.sh index 242dc36f47..a5a26be1cf 100755 --- a/test_unstructured_ingest/test-ingest-airtable-large.sh +++ b/test_unstructured_ingest/test-ingest-airtable-large.sh @@ -11,7 +11,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=airtable-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-azure.sh b/test_unstructured_ingest/test-ingest-azure.sh index 6d177bf43a..38e27294d9 100755 --- a/test_unstructured_ingest/test-ingest-azure.sh +++ b/test_unstructured_ingest/test-ingest-azure.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=azure OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-biomed-api.sh b/test_unstructured_ingest/test-ingest-biomed-api.sh index 6137901914..0f09757d62 100755 --- a/test_unstructured_ingest/test-ingest-biomed-api.sh +++ b/test_unstructured_ingest/test-ingest-biomed-api.sh @@ -8,7 +8,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=biomed-api OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-biomed-path.sh b/test_unstructured_ingest/test-ingest-biomed-path.sh index 9915d38d8e..49d2f2f72c 100755 --- a/test_unstructured_ingest/test-ingest-biomed-path.sh +++ b/test_unstructured_ingest/test-ingest-biomed-path.sh @@ -8,7 +8,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=biomed-path OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-box.sh b/test_unstructured_ingest/test-ingest-box.sh index 20a167c862..08e6803066 100755 --- a/test_unstructured_ingest/test-ingest-box.sh +++ b/test_unstructured_ingest/test-ingest-box.sh @@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=box OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-confluence-diff.sh b/test_unstructured_ingest/test-ingest-confluence-diff.sh index 52ba2d5954..d785ff3a18 100755 --- a/test_unstructured_ingest/test-ingest-confluence-diff.sh +++ b/test_unstructured_ingest/test-ingest-confluence-diff.sh @@ -9,7 +9,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=confluence-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-confluence-large.sh b/test_unstructured_ingest/test-ingest-confluence-large.sh index cd686fe0a8..7a5114e340 100755 --- a/test_unstructured_ingest/test-ingest-confluence-large.sh +++ b/test_unstructured_ingest/test-ingest-confluence-large.sh @@ -11,7 +11,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=confluence-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-delta-table.sh b/test_unstructured_ingest/test-ingest-delta-table.sh index a952fc18fb..d019017a7a 100755 --- a/test_unstructured_ingest/test-ingest-delta-table.sh +++ b/test_unstructured_ingest/test-ingest-delta-table.sh @@ -8,7 +8,7 @@ OUTPUT_FOLDER_NAME=delta-table OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." diff --git a/test_unstructured_ingest/test-ingest-discord.sh b/test_unstructured_ingest/test-ingest-discord.sh index d845d76f1e..b55e37ab9f 100755 --- a/test_unstructured_ingest/test-ingest-discord.sh +++ b/test_unstructured_ingest/test-ingest-discord.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=discord OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-dropbox.sh b/test_unstructured_ingest/test-ingest-dropbox.sh index 52514cb3d0..e58f5c6389 100755 --- a/test_unstructured_ingest/test-ingest-dropbox.sh +++ b/test_unstructured_ingest/test-ingest-dropbox.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=dropbox OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-elasticsearch.sh b/test_unstructured_ingest/test-ingest-elasticsearch.sh index 71dcf26a35..33a755b222 100755 --- a/test_unstructured_ingest/test-ingest-elasticsearch.sh +++ b/test_unstructured_ingest/test-ingest-elasticsearch.sh @@ -8,7 +8,7 @@ echo "SCRIPT_DIR: $SCRIPT_DIR" OUTPUT_FOLDER_NAME=elasticsearch OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh index df87bb5c76..5827105dfb 100755 --- a/test_unstructured_ingest/test-ingest-gcs.sh +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -7,7 +7,7 @@ cd 
"$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=gcs OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-github.sh b/test_unstructured_ingest/test-ingest-github.sh index 08cd9216f3..a81be26732 100755 --- a/test_unstructured_ingest/test-ingest-github.sh +++ b/test_unstructured_ingest/test-ingest-github.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=github OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-gitlab.sh b/test_unstructured_ingest/test-ingest-gitlab.sh index 537ceedc80..d8e7ce5fe9 100755 --- a/test_unstructured_ingest/test-ingest-gitlab.sh +++ b/test_unstructured_ingest/test-ingest-gitlab.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=gitlab OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-google-drive.sh b/test_unstructured_ingest/test-ingest-google-drive.sh index a259451bed..12d802fe48 100755 --- a/test_unstructured_ingest/test-ingest-google-drive.sh +++ b/test_unstructured_ingest/test-ingest-google-drive.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=google-drive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-jira.sh b/test_unstructured_ingest/test-ingest-jira.sh index 54f0930d11..3982141cb8 100755 --- a/test_unstructured_ingest/test-ingest-jira.sh +++ b/test_unstructured_ingest/test-ingest-jira.sh @@ -8,7 +8,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=jira-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh index 1e93cce011..6442eec0b3 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh @@ -6,7 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local-single-file-with-encoding OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh index b5aa2ce7a2..9d15a0e55c 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh @@ -6,7 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-local-single-file.sh b/test_unstructured_ingest/test-ingest-local-single-file.sh index 090c29f87b..24954c1821 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file.sh @@ -6,7 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local-single-file OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-local.sh b/test_unstructured_ingest/test-ingest-local.sh index 5278d2812d..0e1b3856c1 100755 --- a/test_unstructured_ingest/test-ingest-local.sh +++ b/test_unstructured_ingest/test-ingest-local.sh @@ -6,7 +6,7 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")") cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=local OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-notion.sh b/test_unstructured_ingest/test-ingest-notion.sh index 72e435cf02..b7e9c399f4 100755 --- a/test_unstructured_ingest/test-ingest-notion.sh +++ b/test_unstructured_ingest/test-ingest-notion.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=notion OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 41f7210f90..6e683351ed 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=onedrive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-outlook.sh b/test_unstructured_ingest/test-ingest-outlook.sh index 2be6b88b75..fdc3e90bfc 100755 --- a/test_unstructured_ingest/test-ingest-outlook.sh +++ b/test_unstructured_ingest/test-ingest-outlook.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=outlook OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh index 46bfd32b19..a17c91b806 100755 --- a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh @@ -8,7 +8,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=pdf-fast-reprocess OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME INPUT_PATH=$SCRIPT_DIR/download -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-s3.sh b/test_unstructured_ingest/test-ingest-s3.sh index 4b28a00106..c48941ab12 100755 --- a/test_unstructured_ingest/test-ingest-s3.sh +++ b/test_unstructured_ingest/test-ingest-s3.sh @@ -8,7 +8,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=s3 OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-salesforce.sh b/test_unstructured_ingest/test-ingest-salesforce.sh index 13b8018b76..a9ee1a106c 100755 --- a/test_unstructured_ingest/test-ingest-salesforce.sh +++ b/test_unstructured_ingest/test-ingest-salesforce.sh @@ -10,7 +10,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=salesforce OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh index 15913cdbcb..46fd041a8d 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=Sharepoint OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-slack.sh b/test_unstructured_ingest/test-ingest-slack.sh index 86a5b6a73d..e8974e1502 100755 --- a/test_unstructured_ingest/test-ingest-slack.sh +++ b/test_unstructured_ingest/test-ingest-slack.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 OUTPUT_FOLDER_NAME=slack OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh diff --git a/test_unstructured_ingest/test-ingest-wikipedia.sh b/test_unstructured_ingest/test-ingest-wikipedia.sh index 5cb127acbb..eb168aa731 100755 --- a/test_unstructured_ingest/test-ingest-wikipedia.sh +++ b/test_unstructured_ingest/test-ingest-wikipedia.sh @@ -7,7 +7,7 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=wikipedia OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")} +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh trap 'cleanup_dir "$OUTPUT_DIR"' EXIT From e0e329c68fd8f7f850a91ef55052659ff7ae7cd5 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 28 Sep 2023 22:26:23 -0700 Subject: [PATCH 15/31] build(release): cut release for 0.10.7 (#1575) --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5b030b315..793b5196cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev17 +## 0.10.17 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2f8c77de8b..ce80d06e56 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev17" # pragma: no cover +__version__ = "0.10.17" # pragma: no cover From 5b994f37ae441b37e3cc3e3e9b5e5bd14b8c890a Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 28 Sep 2023 23:09:18 -0700 Subject: [PATCH 16/31] build(release): actually make the release 0.10.18 (#1576) Workaround a publication issue to pypi. No code changes, 0.10.18 is the new 0.10.17. 
--- CHANGELOG.md | 2 +- setup.py | 7 +------ unstructured/__version__.py | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 793b5196cb..653464a23e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17 +## 0.10.18 ### Enhancements diff --git a/setup.py b/setup.py index 7b0b900c46..cbde874f62 100644 --- a/setup.py +++ b/setup.py @@ -106,12 +106,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List entry_points={ "console_scripts": ["unstructured-ingest=unstructured.ingest.main:main"], }, - install_requires=[ - # (Trevor): This is a simple hello world package that is used to track - # download count for this package using scarf. - 'scarf @ https://packages.unstructured.io/scarf.tgz', - load_requirements() - ], + install_requires=load_requirements(), extras_require={ # Document specific extra requirements "all-docs": all_doc_reqs, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce80d06e56..795e1f92bd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17" # pragma: no cover +__version__ = "0.10.18" # pragma: no cover From af7639e23f9ecedf6466915702edc14d70c20bf6 Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 29 Sep 2023 13:42:21 -0500 Subject: [PATCH 17/31] ci: add retry to elastic search ingest test (#1581) Occasionally the es test can fail because the index fail to be created on the first try. Experiments show adding timeout doesn't help but add retry mitigates the issue. 
See history of commits in branch: yao/bump-inference-to-0.6.6 https://github.com/Unstructured-IO/unstructured/pull/1563 --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet --- CHANGELOG.md | 2 ++ .../create_and_fill_es.py | 6 +++-- .../jira-diff/JCTP2/10010.json | 22 +++++++++---------- unstructured/__version__.py | 2 +- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 653464a23e..f0fd3dd6c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +## 0.10.19-dev0 + ## 0.10.18 ### Enhancements diff --git a/scripts/elasticsearch-test-helpers/create_and_fill_es.py b/scripts/elasticsearch-test-helpers/create_and_fill_es.py index a63bd222d2..796e2187a8 100644 --- a/scripts/elasticsearch-test-helpers/create_and_fill_es.py +++ b/scripts/elasticsearch-test-helpers/create_and_fill_es.py @@ -10,12 +10,14 @@ ) print("Connecting to the Elasticsearch cluster.") -es = Elasticsearch(CLUSTER_URL) +es = Elasticsearch(CLUSTER_URL, request_timeout=30) print(es.info()) df = pd.read_csv(DATA_PATH).dropna().reset_index() print("Creating an Elasticsearch index for testing elasticsearch ingest.") -es.indices.create(index=INDEX_NAME, mappings=MAPPINGS) +response = es.options(max_retries=5).indices.create(index=INDEX_NAME, mappings=MAPPINGS) +if response.meta.status != 200: + raise RuntimeError("failed to create index") print("Loading data into the index.") bulk_data = [] diff --git a/test_unstructured_ingest/expected-structured-output/jira-diff/JCTP2/10010.json b/test_unstructured_ingest/expected-structured-output/jira-diff/JCTP2/10010.json index 371718d0b3..e6cb0c425c 100644 --- a/test_unstructured_ingest/expected-structured-output/jira-diff/JCTP2/10010.json +++ b/test_unstructured_ingest/expected-structured-output/jira-diff/JCTP2/10010.json @@ -10,7 +10,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": 
"2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -30,7 +30,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -50,7 +50,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -70,7 +70,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -90,7 +90,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -110,7 +110,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -130,7 +130,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -150,7 +150,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -170,7 +170,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": 
"text/plain", "languages": [ @@ -190,7 +190,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ @@ -210,7 +210,7 @@ "issue_key": "JCTP2-8" }, "date_created": "2023-08-22T11:35:48.407+0000", - "date_modified": "2023-08-29T11:46:18.193+0000" + "date_modified": "2023-09-29T05:55:11.066+0000" }, "filetype": "text/plain", "languages": [ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 795e1f92bd..3703d5d96a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.18" # pragma: no cover +__version__ = "0.10.19-dev0" # pragma: no cover From ad59a879cc2a9b1074d58d80345a29b541389b6d Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 29 Sep 2023 14:09:57 -0500 Subject: [PATCH 18/31] chore: bump inference to 0.6.6 (#1563) - bump `unstructured-inference` to `0.6.6` - specify default model name for element detection to be `detectron2_onnx` to keep current behavior - NOTE: the updated inference package by default would use yolox as element detection model; this will be evaluated and enabled in a separated PR --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet --- CHANGELOG.md | 11 +- requirements/constraints.in | 7 +- requirements/dev.txt | 12 +- requirements/extra-paddleocr.txt | 7 +- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 9 +- requirements/huggingface.txt | 2 +- requirements/ingest-airtable.txt | 2 +- requirements/ingest-azure.txt | 2 +- requirements/ingest-box.txt | 2 +- requirements/ingest-gcs.txt | 3 +- requirements/ingest-github.txt | 2 +- requirements/ingest-google-drive.txt | 3 +- requirements/ingest-notion.txt | 22 +- requirements/ingest-onedrive.txt | 2 +- requirements/ingest-openai.txt | 17 +- requirements/ingest-outlook.txt 
| 2 +- requirements/ingest-salesforce.txt | 2 +- requirements/ingest-sharepoint.txt | 2 +- requirements/test.txt | 4 +- .../create-and-check-es.sh | 2 +- .../partition/pdf-image/test_image.py | 2 +- .../partition/pdf-image/test_pdf.py | 4 +- test_unstructured/partition/test_auto.py | 2 +- ...iomedical-Data-Scientists-2-pages.pdf.json | 1298 +---------------- .../azure/IRS-form-1987.png.json | 19 - .../biomed-api/65/11/main.PMC6312790.pdf.json | 208 +-- .../biomed-api/75/29/main.PMC6312793.pdf.json | 462 +----- .../layout-parser-paper.pdf.json | 584 +------- .../2023-Jan-economic-outlook.pdf.json | 758 ++-------- .../small-pdf-set/Silent-Giant-(1).pdf.json | 270 +--- .../recalibrating-risk-report.pdf.json | 410 +----- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 6 +- 34 files changed, 333 insertions(+), 3809 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0fd3dd6c3..264cf9f44e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,13 @@ -## 0.10.19-dev0 +## 0.10.19-dev1 + +### Enhancements + +* **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. 
+ +### Features + +### Fixes + ## 0.10.18 diff --git a/requirements/constraints.in b/requirements/constraints.in index 59f1d35dc2..19a6775177 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -39,5 +39,8 @@ matplotlib==3.7.2 # NOTE(crag) - pin to available pandas for python 3.8 (at least in CI) fsspec==2023.9.1 pandas<2.0.4 -# langchain limits this to 3.1.7 -anyio==3.1.7 +# langchain limits anyio to below 4.0 +anyio<4.0 +# pinned in unstructured paddleocr +opencv-python==4.8.0.76 +opencv-contrib-python==4.8.0.76 diff --git a/requirements/dev.txt b/requirements/dev.txt index f785ea00bd..b90b4776d6 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -4,8 +4,10 @@ # # pip-compile requirements/dev.in # -anyio==4.0.0 - # via jupyter-server +anyio==3.7.1 + # via + # -c requirements/constraints.in + # jupyter-server appnope==0.1.3 # via # ipykernel @@ -42,7 +44,7 @@ certifi==2023.7.22 # -c requirements/constraints.in # -c requirements/test.txt # requests -cffi==1.15.1 +cffi==1.16.0 # via argon2-cffi-bindings cfgv==3.4.0 # via pre-commit @@ -151,7 +153,7 @@ jupyter-client==8.3.1 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.3.1 +jupyter-core==5.3.2 # via # -c requirements/constraints.in # ipykernel @@ -393,7 +395,7 @@ urllib3==1.26.16 # requests virtualenv==20.24.5 # via pre-commit -wcwidth==0.2.6 +wcwidth==0.2.7 # via prompt-toolkit webcolors==1.13 # via jsonschema diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index ada01fb2a6..1f028530d2 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -33,7 +33,7 @@ cssselect==1.2.0 # via premailer cssutils==2.7.1 # via premailer -cycler==0.11.0 +cycler==0.12.0 # via matplotlib cython==3.0.2 # via unstructured-paddleocr @@ -112,9 +112,12 @@ numpy==1.24.4 # unstructured-paddleocr # visualdl opencv-contrib-python==4.8.0.76 - # via unstructured-paddleocr + # via + # -c requirements/constraints.in + # 
unstructured-paddleocr opencv-python==4.8.0.76 # via + # -c requirements/constraints.in # imgaug # unstructured-paddleocr openpyxl==3.1.2 diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index f22311f875..fa9cbcda5a 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -5,7 +5,7 @@ pdf2image pdfminer.six # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.5.31 +unstructured-inference==0.6.6 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index aebc5d8dee..679ffef54c 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via @@ -24,7 +24,7 @@ contourpy==1.1.1 # via matplotlib cryptography==41.0.4 # via pdfminer-six -cycler==0.11.0 +cycler==0.12.0 # via matplotlib effdet==0.4.1 # via layoutparser @@ -95,6 +95,7 @@ onnxruntime==1.16.0 # via unstructured-inference opencv-python==4.8.0.76 # via + # -c requirements/constraints.in # layoutparser # unstructured-inference packaging==23.1 @@ -213,7 +214,7 @@ tqdm==4.66.1 # huggingface-hub # iopath # transformers -transformers==4.33.2 +transformers==4.33.3 # via unstructured-inference typing-extensions==4.8.0 # via @@ -224,7 +225,7 @@ typing-extensions==4.8.0 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.5.31 +unstructured-inference==0.6.6 # via -r requirements/extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 03fdd36ce2..00ba71293a 100644 --- a/requirements/huggingface.txt 
+++ b/requirements/huggingface.txt @@ -97,7 +97,7 @@ tqdm==4.66.1 # huggingface-hub # sacremoses # transformers -transformers==4.33.2 +transformers==4.33.3 # via -r requirements/huggingface.in typing-extensions==4.8.0 # via diff --git a/requirements/ingest-airtable.txt b/requirements/ingest-airtable.txt index 1b535a5db6..db7e92a6a1 100644 --- a/requirements/ingest-airtable.txt +++ b/requirements/ingest-airtable.txt @@ -21,7 +21,7 @@ inflection==0.5.1 # via pyairtable pyairtable==2.1.0.post1 # via -r requirements/ingest-airtable.in -pydantic==1.10.12 +pydantic==1.10.13 # via # -c requirements/constraints.in # pyairtable diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index e9eadb8deb..e682d29422 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -30,7 +30,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via # azure-datalake-store # cryptography diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index bc022a226c..79268b6b3d 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -15,7 +15,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 5533294fbc..4f6d048137 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -47,7 +47,7 @@ google-api-core==2.12.0 # via # google-cloud-core # google-cloud-storage -google-auth==2.23.0 +google-auth==2.23.2 # via # gcsfs # google-api-core @@ -107,7 +107,6 @@ urllib3==1.26.16 # via # -c requirements/base.txt # -c requirements/constraints.in - # google-auth # requests yarl==1.9.2 # via aiohttp diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index 865778e014..ad5ac2a7a0 100644 --- a/requirements/ingest-github.txt 
+++ b/requirements/ingest-github.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via # cryptography # pynacl diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 993a01d200..9f90bcc9ca 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -19,7 +19,7 @@ google-api-core==2.12.0 # via google-api-python-client google-api-python-client==2.101.0 # via -r requirements/ingest-google-drive.in -google-auth==2.23.0 +google-auth==2.23.2 # via # google-api-core # google-api-python-client @@ -63,5 +63,4 @@ urllib3==1.26.16 # via # -c requirements/base.txt # -c requirements/constraints.in - # google-auth # requests diff --git a/requirements/ingest-notion.txt b/requirements/ingest-notion.txt index fadccceea2..b200c2562e 100644 --- a/requirements/ingest-notion.txt +++ b/requirements/ingest-notion.txt @@ -4,33 +4,35 @@ # # pip-compile requirements/ingest-notion.in # -certifi==2023.7.22 +anyio==3.7.1 # via - # -c requirements/base.txt # -c requirements/constraints.in - # httpx -charset-normalizer==3.2.0 + # httpcore +certifi==2023.7.22 # via # -c requirements/base.txt + # -c requirements/constraints.in + # httpcore # httpx -h11==0.12.0 +exceptiongroup==1.1.3 + # via anyio +h11==0.14.0 # via httpcore htmlbuilder==1.0.0 # via -r requirements/ingest-notion.in -httpcore==0.13.3 +httpcore==0.18.0 # via httpx -httpx==0.20.0 +httpx==0.25.0 # via notion-client idna==3.4 # via # -c requirements/base.txt + # anyio # httpx - # rfc3986 notion-client==2.0.0 # via -r requirements/ingest-notion.in -rfc3986[idna2008]==1.5.0 - # via httpx sniffio==1.3.0 # via + # anyio # httpcore # httpx diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index cb5c5903cb..2d9627f1d4 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -15,7 +15,7 @@ certifi==2023.7.22 # -c 
requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index a2854493a0..d7846c0a08 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -10,6 +10,10 @@ aiohttp==3.8.5 # openai aiosignal==1.3.1 # via aiohttp +anyio==3.7.1 + # via + # -c requirements/constraints.in + # langchain async-timeout==4.0.3 # via # aiohttp @@ -30,6 +34,8 @@ dataclasses-json==0.6.1 # via # -c requirements/base.txt # langchain +exceptiongroup==1.1.3 + # via anyio frozenlist==1.4.0 # via # aiohttp @@ -37,9 +43,14 @@ frozenlist==1.4.0 idna==3.4 # via # -c requirements/base.txt + # anyio # requests # yarl -langchain==0.0.298 +jsonpatch==1.33 + # via langchain +jsonpointer==2.4 + # via jsonpatch +langchain==0.0.304 # via -r requirements/ingest-openai.in langsmith==0.0.41 # via langchain @@ -69,7 +80,7 @@ packaging==23.1 # via # -c requirements/base.txt # marshmallow -pydantic==1.10.12 +pydantic==1.10.13 # via # -c requirements/constraints.in # langchain @@ -87,6 +98,8 @@ requests==2.31.0 # langsmith # openai # tiktoken +sniffio==1.3.0 + # via anyio sqlalchemy==2.0.21 # via langchain tenacity==8.2.3 diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index 508d7573dd..ccef36d349 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt index 116a00eab0..a6c31b1014 100644 --- a/requirements/ingest-salesforce.txt +++ b/requirements/ingest-salesforce.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # 
via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt index 7a209b1042..99d1efbfde 100644 --- a/requirements/ingest-sharepoint.txt +++ b/requirements/ingest-sharepoint.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/test.txt b/requirements/test.txt index fe8dc02504..98d40fd188 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -74,7 +74,7 @@ pluggy==1.3.0 # via pytest pycodestyle==2.11.0 # via flake8 -pydantic==1.10.12 +pydantic==1.10.13 # via # -c requirements/constraints.in # -r requirements/test.in @@ -113,7 +113,7 @@ types-click==7.1.8 # via -r requirements/test.in types-markdown==3.4.2.10 # via -r requirements/test.in -types-requests==2.31.0.5 +types-requests==2.31.0.6 # via -r requirements/test.in types-tabulate==0.9.0.3 # via -r requirements/test.in diff --git a/scripts/elasticsearch-test-helpers/create-and-check-es.sh b/scripts/elasticsearch-test-helpers/create-and-check-es.sh index 62f7cb6b66..44fca2f7d3 100755 --- a/scripts/elasticsearch-test-helpers/create-and-check-es.sh +++ b/scripts/elasticsearch-test-helpers/create-and-check-es.sh @@ -9,7 +9,7 @@ docker run -d --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" - echo "Waiting for Elasticsearch container to start..." 
sleep 1 -url="http://localhost:9200/_cluster/health" +url="http://localhost:9200/_cluster/health?wait_for_status=green&timeout=50s" status_code=0 retry_count=0 max_retries=6 diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py index ce026767a1..e2c9496356 100644 --- a/test_unstructured/partition/pdf-image/test_image.py +++ b/test_unstructured/partition/pdf-image/test_image.py @@ -440,7 +440,7 @@ def test_partition_image_formats_languages_for_tesseract(): ocr_languages="jpn_vert", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", ) diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index d5dfcb8189..6cf4e93894 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -407,7 +407,7 @@ def test_partition_pdf_with_dpi(): ocr_languages="eng", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", pdf_image_dpi=100, ) @@ -858,7 +858,7 @@ def test_partition_pdf_formats_languages_for_tesseract(): ocr_languages="eng", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", ) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index f3e91e6a6b..dcacf01ba2 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -381,7 +381,7 @@ def test_auto_partition_formats_languages_for_tesseract(): ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", ) diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json 
b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 7cbf4decf9..b37185fd27 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Title", - "element_id": "cf66bb0e9e68e3a82a99b5621e4394f8", + "element_id": "0b8804afbc4722108e877480e28462a6", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -16,30 +16,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Core Skills for Biomedical Data" + "text": "Core Skills for Biomedical Data Scientists" }, { - "type": "Title", - "element_id": "733383a5f0f5bdea71d6d48805365e6f", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Scientists" - }, - { - "type": "Title", - "element_id": "64b2134f054446d473fce1b05d4d4c94", + "type": "NarrativeText", + "element_id": "46b1e4dae5ffd7cdcb2a6ed9f206a8ee", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -77,159 +58,7 @@ }, { "type": "Title", - "element_id": "f089eaef57aba315bc0e1455985c0c8e", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - 
"remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Michael" - }, - { - "type": "UncategorizedText", - "element_id": "fd0a559e715a134218c73276dc57d463", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "F." - }, - { - "type": "UncategorizedText", - "element_id": "44be44eccd482217c097571ddfa61f49", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Huerta," - }, - { - "type": "Title", - "element_id": "394df19f0626f36d12da449624b691f9", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "PhD, 
Associate" - }, - { - "type": "Title", - "element_id": "4f5a6389c571b0d01690b1db0349c1b4", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Director of" - }, - { - "type": "Title", - "element_id": "aecfc6e5b6c0de37a2c06c2fb1d71c82", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "NLM" - }, - { - "type": "Title", - "element_id": "237622d8c80fbdbe790b92d500aa7b00", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "for Program Development and" - }, - { - "type": "Title", - "element_id": "aecfc6e5b6c0de37a2c06c2fb1d71c82", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - 
"record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "NLM" - }, - { - "type": "Title", - "element_id": "ba490653e1ad81f341c35ae470c1b825", + "element_id": "d9644fb4b85468d186b132c91ca64f31", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -244,7 +73,7 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Coordinator of Data Science and Open Science Initiatives" + "text": "Michael F. Huerta, PhD, Associate Director of NLM for Program Development and NLM Coordinator of Data Science and Open Science Initiatives" }, { "type": "Title", @@ -265,28 +94,9 @@ }, "text": "Executive Summary" }, - { - "type": "Title", - "element_id": "6712d87f1d156abf6171f700e2875889", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "biomedical" - }, { "type": "NarrativeText", - "element_id": "2364a6d2f9a3858d51d91b817732e6c9", + "element_id": "d6df9cd66da09d30c16d194e877766ca", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -301,11 +111,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "This report provides recommendations for a scientists based on analysis that draws on opinions of data scientists, curricula for existing science requirements science jobs." 
+ "text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:" }, { - "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "type": "ListItem", + "element_id": "d94c6241299e6eff20ee6499cb9f64de", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -320,68 +130,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "data" + "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science; 2. Programming language expertise: biomedical data scientists should be fluent in at least one programming language (typically R and/or Python); 3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science; 4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science. 5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy." 
}, { "type": "UncategorizedText", - "element_id": "50e891aa619a7ccbeab043789ca5dd1a", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "programs," - }, - { - "type": "Title", - "element_id": "6201111b83a0cb5b0922cb37cc442b9a", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "and" - }, - { - "type": "Title", - "element_id": "a703788f832056626d71b7db4d805524", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "minimal" - }, - { - "type": "Title", - "element_id": "6ee0eb490ff832101cf82a3d387c35f2", + "element_id": "34b28172088bba51c6764df6d4e87674", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -396,11 +149,11 @@ "filetype": 
"application/pdf", "page_number": 1 }, - "text": "set" + "text": "The report further details specific skills and expertise relevant to biomedical data scientists." }, { "type": "Title", - "element_id": "10c22bcf4c768b515be4e94bcafc71bf", + "element_id": "89b1f4c3df983454e25b233320781610", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -415,11 +168,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "for" + "text": "Motivation" }, { - "type": "Title", - "element_id": "28391d3bc64ec15cbb090426b04aa6b7", + "type": "NarrativeText", + "element_id": "cfe4cc76625dc82267d95ec1dc7e7813", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -434,11 +187,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "of" + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. 
In line with" }, { - "type": "Title", - "element_id": "6712d87f1d156abf6171f700e2875889", + "type": "UncategorizedText", + "element_id": "68431de56564c6ad6aa3e6c02b78c89c", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -451,13 +204,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "biomedical" + "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________" }, { "type": "Title", - "element_id": "0d45f5fd462b8c70bffb10021ac1bcff", + "element_id": "edd5f2f5a60a83c8899e533ac8bcd03c", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -470,13 +223,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "core" + "text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce." 
}, { "type": "Title", - "element_id": "50c5080f67ea1f9eff473e46e6314fd2", + "element_id": "3c36cd10b2e64b9f2169f05abddd4981", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -489,13 +242,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "skills for biomedical" + "text": "Methodology" }, { "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "element_id": "987542acede56f098db655f02fb814a7", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -508,13 +261,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "data" + "text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:" }, { - "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "type": "ListItem", + "element_id": "fdd38e2d80cc964e9bf3c7e09a760e21", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -527,982 +280,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "data" + "text": "a) Responses to a 2017 Kaggle' survey’ of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use. b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2kK-funded training programs. 
Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A. c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (83.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The" }, { "type": "NarrativeText", - "element_id": "18f107bf25f694db07b6aba0a5aaf321", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Suggested high-level core skills include:" - }, - { - "type": "ListItem", - "element_id": "8f90f5970c85f335b1bf50af611ce5c5", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "1. 
General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;" - }, - { - "type": "ListItem", - "element_id": "d1a5bb898aee8de0fbdf048c7a9fb01d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "2. Programming language expertise: biomedical data scientists should be fluent in at" - }, - { - "type": "Title", - "element_id": "18e42d24d6449a9b52fc65fc3f9710b4", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "least one programming language (typically R and/or Python);" - }, - { - "type": "ListItem", - "element_id": "c6be5389b7bd00746d39b7bac468dea0", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - 
"page_number": 1 - }, - "text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;" - }, - { - "type": "ListItem", - "element_id": "1b8039583cbc15f654c89f2141eb6e10", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science." - }, - { - "type": "ListItem", - "element_id": "2f87757b1d497a32c077be543632ed7d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy." 
- }, - { - "type": "UncategorizedText", - "element_id": "34b28172088bba51c6764df6d4e87674", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "The report further details specific skills and expertise relevant to biomedical data scientists." - }, - { - "type": "Title", - "element_id": "89b1f4c3df983454e25b233320781610", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Motivation" - }, - { - "type": "NarrativeText", - "element_id": "3d8fbacaba9067faef48850d43801268", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. 
That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2k) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with" - }, - { - "type": "UncategorizedText", - "element_id": "68431de56564c6ad6aa3e6c02b78c89c", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________" - }, - { - "type": "NarrativeText", - "element_id": "326e7d081e9418423ea62bf3802caaa3", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "this commitment, recent report to the NLM Director recommended working across identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce." 
- }, - { - "type": "Title", - "element_id": "ca978112ca1bbdcafac231b39a23dc4d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "a" - }, - { - "type": "Title", - "element_id": "acc8586a874eb74f10c3f90620f20617", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "NIH to" - }, - { - "type": "Title", - "element_id": "f26d07e6b71e42596791a241e2417931", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Methodology" - }, - { - "type": "Title", - "element_id": "b344d80e24a3679999fa964450b34bc2", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": 
"abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The" - }, - { - "type": "Title", - "element_id": "aa3b88196a6407c3866c85acdcc8c981", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Workforce" - }, - { - "type": "NarrativeText", - "element_id": "cdc3773cb12cf99d302b9f00c48ae1e8", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "required of" - }, - { - "type": "NarrativeText", - "element_id": "b72b62f1295c66f199256c1190177ce6", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": 
"three-pronged approach biomedical data scientist (BDS), drawing from:" - }, - { - "type": "Title", - "element_id": "3d366201f5b88bcbfafb078aee5f2a55", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Excellence" - }, - { - "type": "Title", - "element_id": "ca8b22d0db83a22db163b560b3e4e515", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "team" - }, - { - "type": "NarrativeText", - "element_id": "e0a6230e370d20dece7ca96c77611cb0", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "took" - }, - { - "type": "Title", - "element_id": "ca978112ca1bbdcafac231b39a23dc4d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 
167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "a" - }, - { - "type": "Title", - "element_id": "663ea1bfffe5038f3f0cf667f14c4257", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "to" - }, - { - "type": "NarrativeText", - "element_id": "a5bed2020bd1f4ea3eca933398c4f0d0", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "identifying" - }, - { - "type": "Title", - "element_id": "0d45f5fd462b8c70bffb10021ac1bcff", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - 
"filetype": "application/pdf", - "page_number": 2 - }, - "text": "core" - }, - { - "type": "Title", - "element_id": "32c1cf49a2feee269ed74dd860f72644", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "skills" - }, - { - "type": "NarrativeText", - "element_id": "a24acaf1cb5d6f8a0a0af0e81949765b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use." 
- }, - { - "type": "Title", - "element_id": "301d35f1042e1eac9fdef8839fd13a4e", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "b)" - }, - { - "type": "Title", - "element_id": "6b847a0ed0b2c484c73f2749e29b4db5", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "into" - }, - { - "type": "NarrativeText", - "element_id": "1117af46b0a22dd02d3869ab9738a8a8", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis applied to the descriptions of required offered under the BD2kK-funded training programs. 
Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A." - }, - { - "type": "NarrativeText", - "element_id": "b63b99f6383ba713b57ddfc77737c5f7", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "was" - }, - { - "type": "Title", - "element_id": "936e5cc5021d8a075f91b7864bf0cec8", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "courses" - }, - { - "type": "UncategorizedText", - "element_id": "6b51d431df5d7f141cbececcf79edf3d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": 
"2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "12" - }, - { - "type": "Title", - "element_id": "2d2e9ceb1db2bc94a266f3e8b24b8f55", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "c)" - }, - { - "type": "Title", - "element_id": "6d0607a7a2ac9823f9fb2a62ea2b7385", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Desired" - }, - { - "type": "NarrativeText", - "element_id": "f9c94ebffe2ab721a096cf42b7a9cff9", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "important skills that were mentioned multiple times in" - }, - { - "type": "NarrativeText", - "element_id": "961a38da2886c3cc25091d912769aa0d", - 
"metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "job job government (8.5%), (42.4%), industry (83.9%), and nonprofit (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting single ad." - }, - { - "type": "Title", - "element_id": "32c1cf49a2feee269ed74dd860f72644", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "skills" - }, - { - "type": "NarrativeText", - "element_id": "a486fbc90cd5a32fe44275f5948b2066", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - 
"page_number": 2 - }, - "text": "identified" - }, - { - "type": "Title", - "element_id": "de98e5ea566225a14a9a6b3086253f6d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "academia" - }, - { - "type": "Title", - "element_id": "75857a45899985be4c4d941e90b6b396", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "from" - }, - { - "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "data" - }, - { - "type": "Title", - "element_id": "8b3a4555f5297c340e5fdff392fe5a5b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 
167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "science-related" - }, - { - "type": "Title", - "element_id": "26f8fe3e12ff690c91f73b24bb45ed01", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "utilized" - }, - { - "type": "Title", - "element_id": "b510c96f289ebcf388da7d2dea6a1e73", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "ads." 
- }, - { - "type": "Title", - "element_id": "b9776d7ddf459c9ad5b0e1d6ac61e27b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "the" - }, - { - "type": "UncategorizedText", - "element_id": "3e1e967e9b793e908f8eae83c74dba9b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "59" - }, - { - "type": "Title", - "element_id": "788eb2efc52660fe41472319f0d2c623", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "ads" - }, - { - "type": "Title", - "element_id": "9d5d7fcf3aa35a4809f92551aed1f26e", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": 
"abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "sector" - }, - { - "type": "Title", - "element_id": "75857a45899985be4c4d941e90b6b396", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "from" - }, - { - "type": "Title", - "element_id": "9f25a5b0f5e247294ebcf6723c2169b2", + "element_id": "3f14cc0782485365bad0539f7b1bbb22", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1517,11 +301,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "for core skills necessary for" + "text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad." 
}, { "type": "NarrativeText", - "element_id": "f7f4976ebe430b482f073e28add58182", + "element_id": "c2e95867ed0f25e3d9fe1a6b97447ab9", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1536,11 +320,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations competitive biomedical data scientist." + "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist." }, { "type": "NarrativeText", - "element_id": "4a99b0f26eb7267230c6994d9ab7d60b", + "element_id": "8e6dc8d9bc74e032451cc1a6a0da4d10", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1555,7 +339,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "' Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com ? In August 2017, Kaggle conducted an industry-wide survey to gain clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. 
Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017" + "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index 5afaa3fefc..2034680177 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -360,25 +360,6 @@ }, "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods. —Unless the Service has published a regulation or procedure to the contrary, all other changes in accounting methods required by the Act are automatically considered to be approved by the Commissioner. 
Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of mcome attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." }, - { - "type": "NarrativeText", - "element_id": "7685df2334a5f6c8c8099dea61a8f1b4", - "metadata": { - "data_source": { - "url": "abfs://container1/IRS-form-1987.png", - "version": 328871203465633719836776597535876541325, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/IRS-form-1987.png" - }, - "date_created": "2023-03-10T09:44:55+00:00", - "date_modified": "2023-03-10T09:44:55+00:00" - }, - "filetype": "image/png", - "page_number": 1 - }, - "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed." 
- }, { "type": "Title", "element_id": "5756fb398995bb6518a87637f24f426e", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 9c98b4af47..35d4a581e4 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -30,8 +30,8 @@ "text": "Data in Brief" }, { - "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "type": "Title", + "element_id": "0ca3f075fdccf9232449ff461b63ceb9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -161,63 +161,23 @@ }, { "type": "Title", - "element_id": "b877cc5d670d770084dcc0bb41ac73a0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Subject area More specific subject area Type of data" - }, - { - "type": "Title", - "element_id": "b27e559f6c00d2bde61efba5db252e31", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Materials engineering" - }, - { - "type": "Title", - "element_id": "a2c3879ecb580742973c6a914fb905bb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Surface science and engineering" - }, - { - "type": "Title", - "element_id": "1064dcef42380cfdb90c668aa3a670a3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Table and figure" - }, - { - "type": "Title", - "element_id": "e4359c72057b318ddf5a64f9b97539c4", + "element_id": "ac89a2886224c42ad15982cd34421ff8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "n Corresponding author. 
tayo.sanni@yahoo.com; SanniO@tut.ac.za" + "text": "Subject area More specific subject area Surface science and engineering Type of data" }, { - "type": "Title", - "element_id": "e102dc7c1db28c29d5e4bde8062592ed", + "type": "NarrativeText", + "element_id": "0a789b33a0101a46f5a01d22d9a6ce2b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "E-mail address: tayo.sanni@yahoo.com (O. Sanni)." + "text": "* Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za E-mail address: tayo.sanni@yahoo.com (O. Sanni)." }, { "type": "NarrativeText", @@ -310,44 +270,14 @@ "text": "Value of the data" }, { - "type": "NarrativeText", - "element_id": "682e6210329b84f8b00548088196ffc9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment." - }, - { - "type": "NarrativeText", - "element_id": "1d61e3468bc681ba1a7e647000c6828c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments." 
- }, - { - "type": "NarrativeText", - "element_id": "39b6040280a179e1f8e4f4fb5ec4ae05", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:1) The data can be used to examine the relationship between the process variable as it affect the" - }, - { - "type": "Title", - "element_id": "1ddde62c3188f81dfc835b6f036f1734", + "type": "ListItem", + "element_id": "7def44ffc91f3f064b85dc04b23767ec", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "nature of inhibition of metals." + "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment. © The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments. © The data can be used to examine the relationship between the process variable as it affect the nature of inhibition of metals." }, { "type": "Title", @@ -529,25 +459,15 @@ }, "text": "Exposure Time (Hours)" }, - { - "type": "UncategorizedText", - "element_id": "25db7b1d2f5780559e1034d72bcb4050", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Fig. 1. Weight loss versus exposure time for stainless steel presence of ES." - }, { "type": "NarrativeText", - "element_id": "cbd563dd2fcd7d0b5a0b2173465fd328", + "element_id": "45cd54c64e38abe8c1128a5979ca8cd5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "immersed in 0.5 M H2SO4 solution in the absence and" + "text": "Fig. 1. 
Weight loss versus exposure time for stainless steel immersed in 0.5M H2SO, solution in the absence and presence of ES." }, { "type": "NarrativeText", @@ -1080,14 +1000,14 @@ "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, { - "type": "FigureCaption", - "element_id": "27b45633a0f31b9e01d179d70d7dc282", + "type": "Image", + "element_id": "b5ee6af3d776b0bbd2e581a3ab2ab2e1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "5 1 os = — 10; =o ° © —\" 205 i —~é é —ip a5 — Control -2 — & 2.5 T T T 0.0000001 + —-0.00001 0.001 O14 Current Density (A/cm2)" + "text": "Potential (Vv)nm°in°}aryT T T0.00001 0.001 olCurrent Density (A/cm2)" }, { "type": "UncategorizedText", @@ -1450,34 +1370,34 @@ "text": "455" }, { - "type": "FigureCaption", - "element_id": "273fb301b173075f79b2cbdab962e2ff", + "type": "Image", + "element_id": "caa364fead90039aae1f13d64dcb8b37", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "SEM HV: Q0KY WD: 14.89 rmrm ‘9EM MAO: 209 x Det: DOE Pectomsence In nanospact" + "text": "SEM HV: Q0KY WD: 14.89 rmrm‘DEM MAO: 209 x ‘Dor Pecforsence In nenospact" }, { - "type": "FigureCaption", - "element_id": "520d1da08c86ce165cd2843e2dc27f98", + "type": "Image", + "element_id": "a0463ca888a6f2c8c3ba40ba47be0f2f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "SEMHV: 20.0KV WD: 15.54 mm EM ING: ACO x Dei: OSE" + "text": "gEOOwaeSemny. z00RV | WD: 1424 renn rtirint VEoa3 Tescan20 yin Fertormaros in nancepace|" }, { - "type": "FigureCaption", - "element_id": "d04d110c16a4ebc184fa130f09b8d423", + "type": "Image", + "element_id": "88301d6b47b17df03b78789b9890a6f1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "Sem ny. 
200 Rv" + "text": "°@¢Naafe«MgsSEM HY: 20.0KV 7 ETOP LU ULL UL OCT 0BEM IAAG: 400 x a" }, { "type": "NarrativeText", @@ -1530,7 +1450,7 @@ "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, { - "type": "ListItem", + "type": "Title", "element_id": "a80826543c9e0d0e9f6c2108ae3c3f73", "metadata": { "data_source": {}, @@ -1560,17 +1480,7 @@ "text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9." }, { - "type": "FigureCaption", - "element_id": "060e14f01e484ba252e902cd5c6f94f9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "ou H,;COCHNY OH" - }, - { - "type": "Title", + "type": "NarrativeText", "element_id": "1dc2692eee9b01e9a960f80c4dabe07b", "metadata": { "data_source": {}, @@ -1890,74 +1800,14 @@ "text": "References" }, { - "type": "NarrativeText", - "element_id": "d844a31ead19b2e2fae786d2a5495072", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "[1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution" - }, - { - "type": "NarrativeText", - "element_id": "d0be94eaaf9c0f43bc51381f031e1381", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "using eco-friendly waste product, Results Phys. 9 (2018) 225–230." - }, - { - "type": "NarrativeText", - "element_id": "7e9cfcc1c32c353e319aae7d9be537bd", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "[2] O. Sanni, A.P.I. Popoola, A. 
Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion" - }, - { - "type": "NarrativeText", - "element_id": "c00e8be0806aa2ded72da0ef746a4291", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1–15." - }, - { - "type": "NarrativeText", - "element_id": "1d76a4bb6ba7984cea4548ab574beb8f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "[3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel" - }, - { - "type": "NarrativeText", - "element_id": "ffd9e4babdf76600a881851ebbf35d3f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "corrosion in chloride solution, Def. Technol. 14 (2018) 463–468." - }, - { - "type": "NarrativeText", - "element_id": "dd7f4838500dd709556225fa3f6b7339", + "type": "ListItem", + "element_id": "86174db2f99ff948055caeda83334bb7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "[4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1–17. https://doi.org/10.1007/ s13632-018-0495-5." + "text": "[1] 0. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results Phys. 9 (2018) 225-230. [2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1-15. [3] O. Sanni, A.P.I. Popoola, O.S.I. 
Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel corrosion in chloride solution, Def. Technol. 14 (2018) 463-468. [4] O. Sanni, A.P.I. Popoola, 0.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1-17. https://doi.org/10.1007/ $13632-018-0495-5, [5] O. Sanni, A-P.I. Popoola, O.S.1. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. (lnttps://doi.org/10.7449/2018/MST_2018_254 261)." }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index d7bdce8ec2..5d4295f490 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -30,8 +30,8 @@ "text": "Data in Brief" }, { - "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "type": "Title", + "element_id": "0ca3f075fdccf9232449ff461b63ceb9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -250,77 +250,27 @@ "text": "Value of the data" }, { - "type": "NarrativeText", - "element_id": "f2fdefc49840022ffb3a88bd4a3512d0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" - }, - { - "type": "Title", - "element_id": "bd7d750cb9f652c80c17a264072b8858", + "type": "ListItem", + "element_id": "510d0bce379a0d3ba5ff46d536bdb7c5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - 
"text": "performance of the algorithms for the MDVSP." + "text": "© The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the performance of the algorithms for the MDVSP. © The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations. e All the problem instances are available for use without any restrictions. e The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison. © The dataset includes a program that can generate similar problem instances of different sizes." }, { "type": "NarrativeText", - "element_id": "7c8bc2811f71480b433eb6fee2a3bb33", + "element_id": "f2fdefc49840022ffb3a88bd4a3512d0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing" + "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" }, { "type": "Title", - "element_id": "68d39f7bcfe99749cc221fa901314626", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "mathematical formulations." - }, - { - "type": "NarrativeText", - "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" - }, - { - "type": "NarrativeText", - "element_id": "24d7f2ed4386a169639b93a5bf03fd79", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "be used for the comparison." 
- }, - { - "type": "NarrativeText", - "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." - }, - { - "type": "ListItem", "element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa", "metadata": { "data_source": {}, @@ -330,134 +280,24 @@ "text": "1. Data" }, { - "type": "NarrativeText", - "element_id": "41ce7670e476aaf9a595bc28c13dbba0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number" - }, - { - "type": "Title", - "element_id": "10c22bcf4c768b515be4e94bcafc71bf", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "for" - }, - { - "type": "NarrativeText", - "element_id": "a18c70d23b71c51ddfe33311232c241c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, (m,n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." 
- }, - { - "type": "UncategorizedText", - "element_id": "aea66a7c89c6de4d3e3ed6c1ada31104", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "the size," - }, - { - "type": "UncategorizedText", - "element_id": "e0feab8a8888b2955af1cc1a2acff883", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "‘ðm; nÞ’," - }, - { - "type": "UncategorizedText", - "element_id": "0b113c91aaaf031e5d7b74747e1b4153", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "respectively. For example," - }, - { - "type": "UncategorizedText", - "element_id": "6dd3e9101394a1fbacb451c4c9ba03b9", + "type": "ListItem", + "element_id": "86e53159056da85c215281a9c68d46b9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "the problem instance," + "text": "For each problem instance, the following information is provided: The number of depots (m), The number of trips (n), The number of locations (I), The number of vehicles at each depot, For each tripie 1,2,...,n,a start time, ft}, an end time, ff, a start location, i, and an end location, i, and" }, { "type": "NarrativeText", - "element_id": "33d26eae1edf215a9677101c7147d671", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "For each problem instance, the following information is provided: The number of depots mð The number of trips ðnÞ, The number of locations ðlÞ, The number of vehicles at each depot, For each trip i A 1; 2; …; n, a start time, ts" - }, - { - "type": "UncategorizedText", - "element_id": "c6490fc185478150e7816c45ef8a48d5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Þ," - }, - { - "type": "Title", - "element_id": "5a15b4000add06e52b66591cd8cac950", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", 
- "page_number": 2 - }, - "text": "i , an end time, te" - }, - { - "type": "Title", - "element_id": "7798ae4daad9264de38e67c98f2bd624", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "i , a start location, ls" - }, - { - "type": "UncategorizedText", - "element_id": "801a0d00a5b76dbd0f039368ee45eda3", + "element_id": "07732da32c53fed3ffd5342c61ab643b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "i , and an end location, le i ," - }, - { - "type": "Title", - "element_id": "6201111b83a0cb5b0922cb37cc442b9a", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "and" + "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number ‘RN-8–1500-01.dat’, for is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." }, { "type": "NarrativeText", @@ -511,13 +351,13 @@ }, { "type": "NarrativeText", - "element_id": "faee1001fc912565a74ea2d69fa0d689", + "element_id": "694b9c582265698bf49806b056c64adc", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "travel empty from —¢). Aschedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. 
The following requirements must be satisfied:" + "text": "j , the vehicle must travel empty from le j (cid:3)te i Þ. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satisfied:" }, { "type": "NarrativeText", @@ -529,76 +369,6 @@ }, "text": "A trip j can be covered after trip i by the same vehicle, if ts j" }, - { - "type": "NarrativeText", - "element_id": "3e549e73bba49a63f20841b5821cfda9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "i to ls" - }, - { - "type": "NarrativeText", - "element_id": "43dad32a26a446c5a2c74f3f2328b849", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": ". If le i ls le i j , otherwise, the vehicle may require waiting at le i for the duration of ðts" - }, - { - "type": "Title", - "element_id": "3feb623147ddb3265b5968ce2efb8f6b", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "Z te" - }, - { - "type": "NarrativeText", - "element_id": "5201e1037409ea15055e320409a9f5eb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "i þδ" - }, - { - "type": "Title", - "element_id": "189f40034be7a199f1fa9891668ee3ab", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "j" - }, - { - "type": "Title", - "element_id": "a10959d132f2b0d3723ae6b8b77f86b7", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "a ls" - }, - { - "type": "Title", - "element_id": "4137b01e139589b7a1d3b3fc4da031d8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "must" - }, { "type": "ListItem", "element_id": 
"2d6b506bd58a7dd7bbf1c8599ef630c8", @@ -629,65 +399,25 @@ }, "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule." }, - { - "type": "Title", - "element_id": "252f10c83610ebca1a059c0bae8255eb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "f" - }, { "type": "NarrativeText", - "element_id": "928fa0dcad70f173bc989ee5715375c5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. 
The next l lines present the travel times between any two locations, i; jA 1; …; l" - }, - { - "type": "UncategorizedText", - "element_id": "89507815c6b4a6f31e6d3da7fca6b561", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "(cid:1)" - }, - { - "type": "UncategorizedText", - "element_id": "33a2b57b388470db1cb13defbe73dc18", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "(cid:3)" - }, - { - "type": "UncategorizedText", - "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa", + "element_id": "e731dc92fddc0512e142bfb2bed62bbf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "." + "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." }, { "type": "NarrativeText", - "element_id": "e731dc92fddc0512e142bfb2bed62bbf", + "element_id": "92b491d0e108ec13f263b16646ecac65", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." + "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots (m), the number of trips, (n), and the number of locations (I), in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, ie{1,...,n}, and provides the start location, the start time, the end location, and the end time of trip i. The next | lines present the travel times between any two locations, i,j e {1, wal}. 
The dataset also includes a program ‘Generatelnstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots (m), the number of trips (n), and the" }, { "type": "UncategorizedText", @@ -850,34 +580,14 @@ "text": "Table 2 Description of file format for each problem instance." }, { - "type": "Title", - "element_id": "151e509ce97fe40eecae3822c78adcf5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "Number of lines" - }, - { - "type": "Title", - "element_id": "0d42fdb9458af19413eee0a1227f415c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "Number of columns in each line" - }, - { - "type": "Title", - "element_id": "526e0087cc3f254d9f86f6c7d8e23d95", + "type": "NarrativeText", + "element_id": "444f48f6d4f0ee6d3a04b7bf76218980", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Description" + "text": "Number of Number of columns in Description lines each line" }, { "type": "UncategorizedText", @@ -920,47 +630,17 @@ "text": "l" }, { - "type": "Title", - "element_id": "336074805fc853987abe6f7fe3ad97a6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "time" - }, - { - "type": "NarrativeText", - "element_id": "78f6ff03dfac8dfb7f319de1e369590d", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rg at each depot d. One line for each trip, i= 1,2, ...,n. Each line provides the start location and the end time ¢¢ for the corresponding trip. Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j." 
- }, - { - "type": "Title", - "element_id": "8ee69286d5f681913dbfdeb60bedc572", + "type": "ListItem", + "element_id": "f096a8499e50cac1f45ceb8340dace5a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "i , the end location le" + "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j." }, { "type": "Title", - "element_id": "08238905e7bba7115b7d7d58fef13ec6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "i , the start" - }, - { - "type": "ListItem", "element_id": "764eef872135149aaf95224bab69c844", "metadata": { "data_source": {}, @@ -1031,102 +711,12 @@ }, { "type": "NarrativeText", - "element_id": "5a1d84f7d74fc4ceeacb634d524cc041", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling" - }, - { - "type": "UncategorizedText", - "element_id": "bec40b25a277a08de3415e33284fc76d", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "problem, Networks 19 (5) (1989) 531–548." - }, - { - "type": "NarrativeText", - "element_id": "19dee0a4e8fd073350e234b4352b8af6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur." 
- }, - { - "type": "UncategorizedText", - "element_id": "5f5ca82752a3220998c06ea0c44eb80e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "J. Oper. Res. 175 (3) (2006) 1616–1627." - }, - { - "type": "UncategorizedText", - "element_id": "64cd13c78330953bd999d37dacbeaf0e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic" - }, - { - "type": "NarrativeText", - "element_id": "c4f2c64b5f38feaa921647abceebaec8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487." - }, - { - "type": "NarrativeText", - "element_id": "16c341408703257ff517dcc76140e2c0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling" - }, - { - "type": "UncategorizedText", - "element_id": "aa252076bc877d1ba2b95aa13b73ff72", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "problem, J. Sched. 12 (1) (2009) 17." - }, - { - "type": "UncategorizedText", - "element_id": "2e00441177bee9377583470218bea299", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1)" - }, - { - "type": "UncategorizedText", - "element_id": "4b1b8c9df00f25e26176a85d84c8c927", + "element_id": "ba0af0b44e7cc27de119a1771c07dfc2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "(1994) 41–52." 
+ "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling problem, Networks 19 (5) (1989) 531-548. [2] N. Kliewer, T. Mellouli, L. Suhl, A time-space network based exact optimization model for multi-depot bus scheduling, Eur. J. Oper. Res. 175 (3) (2006) 1616-1627. [3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457-487. [4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling problem, J. Sched. 12 (1) (2009) 17. [5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1) (1994) 41-52." } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 30302a3ffa..99b11a3a14 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -169,16 +169,6 @@ }, "text": "37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual specification of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and benefit a broad spectrum of large-scale document digitization projects." 
}, - { - "type": "NarrativeText", - "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." - }, { "type": "NarrativeText", "element_id": "c1f1ba1630bc19bd24c1dfbc1548f2d8", @@ -201,73 +191,23 @@ }, { "type": "ListItem", - "element_id": "074b2bd4ba1bf0caf3dbf1973217416a", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "1. An off-the-shelf toolkit for applying DL models for layout detection, character" - }, - { - "type": "ListItem", - "element_id": "569ce8891b02bc38f50a0cde0039e951", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "2. A rich repository of pre-trained neural network models (Model Zoo) that" - }, - { - "type": "ListItem", - "element_id": "18dcbc2839f9783d2c91cbce75d3e685", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "3. Comprehensive tools for efficient document image data annotation and model" - }, - { - "type": "ListItem", - "element_id": "efe6ba3afae54e3c7a05d81583543296", + "element_id": "dc2c331204369d29f5bdcd8dc88a8174", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "4. 
A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)" - }, - { - "type": "Title", - "element_id": "50f59772d4134ececeaf37069d480784", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "underlies the off-the-shelf usage" - }, - { - "type": "Title", - "element_id": "c7f4b9a2c7b93fdcc32112de7d9563ba", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "recognition, and other DIA tasks (Section 3)" + "text": "1. An off-the-shelf toolkit for applying DL models for recognition, and other DIA tasks (Section Bp ayout det ection, character 2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage 3. Comprehensive tools for efficient document image tuning to support different levels of customization 4. A DL model hub and community platform for t tion, and discussion of DIA models and pipeline: reproducibility, and extensibility (Section [4) ne easy S. ata annotation and model haring, distribu- s, to promote reusability," }, { "type": "NarrativeText", - "element_id": "9a576fe6eb4355cdf1e772cf462a9eb7", + "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "tuning to support different levels of customization" + "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." 
}, { "type": "NarrativeText", @@ -496,7 +436,7 @@ "data_source": {}, "filetype": "application/pdf", "page_number": 5, - "text_as_html": "
Dataset| Base Model'|Large ModelNotes
PubLayNet B8]|F/MMLayouts of modern scientific documents
M-Layouts of scanned modern magazines and scientific reports
F-Layouts of scanned US newspapers from the 20th century
TableBankFFnd business document. Table region on modern scientific
HJDatasetF/M-Layouts of history Japanese documents
" + "text_as_html": "
Dataset| Base Model'|Large Model| Notes
PubLayNet B8]|F/MMLayouts of modern scientific documents
M-Layouts of scanned modern magazines and scientific reports
F-Layouts of scanned US newspapers from the 20th century
TableBankFFnd business document. Table region on modern scientific
HJDatasetF/M-Layouts of history Japanese documents
" }, "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents" }, @@ -591,34 +531,14 @@ "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." }, { - "type": "NarrativeText", - "element_id": "65f9f864775ddef6f9895c53e16c50d4", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "1 import layoutparser as lp 2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel (" - }, - { - "type": "Title", - "element_id": "61b33f079528d200f91471f41645cdc6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "4 5 layout = model . 
detect ( image )" - }, - { - "type": "NarrativeText", - "element_id": "6cd3a9e132c1264a05ec11a2df6b8066", + "type": "ListItem", + "element_id": "e416e69991bf6a4b338df18ebdb6e712", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "\" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" )" + "text": "import layoutparser as lp image = cv2.imread(\"image_file\") # load images model = lp.Detectron2LayoutModel ( \"1p://PubLayNet/faster_rcnn_R_50_FPN_3x/config\") layout = model.detect (image)" }, { "type": "NarrativeText", @@ -651,14 +571,14 @@ "text": "Z. Shen et al." }, { - "type": "FigureCaption", - "element_id": "185e67615d123b35d38ea72e0cdb6d99", + "type": "Image", + "element_id": "2f498bdd91739a7083490999507420a5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff" + "text": "33§3 fectange vada8883 Coordinate83 +*Block | [Block | [Read8 Extra features Tet | [Tye | [oder[ coordinatel textblock1 |» , see383 , textblock2 , layout] ]4A list of the layout elementsThe same transformation and operation APIs" }, { "type": "Title", @@ -1102,14 +1022,14 @@ "text": "9" }, { - "type": "FigureCaption", - "element_id": "975d6cb141cb0a0313375630ae063fa8", + "type": "Image", + "element_id": "6df6057f894a166cf24fd34f64267f09", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": "x09 Burpunog uayor Aeydsiq 1 vondo 10g Guypunog usyoy apir:z uondo Mode I: Showing Layout on the Original Image Mode Il: Drawing OCR'd Text at the Correspoding Position" + "text": "a ESStee eaeoooMode I: Showing Layout on the Original ImageMode Il: Drawing OCR'd Text at the Correspoding Position10g Bpunog 
vayoy feyds1q :1 vondo‘xog Burpunog vay apiH z word" }, { "type": "NarrativeText", @@ -1172,14 +1092,14 @@ "text": "Z. Shen et al." }, { - "type": "FigureCaption", - "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b", + "type": "Image", + "element_id": "cd0055b04f6049e9d9bf49a4f309f7e9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance" + "text": "Text‘Token CategoriestieAddress(Numberig:3pio Bupeas uwunjog(a) Illustration of the original Japanese document with detected layout elements highlighted in colored boxesColumn CategoriesCRE) OR REKER te setPikes enceee+41ybiay pamoyy wnwrxey(b) Illustration of the recreated document with dense text structure for better OCR performance" }, { "type": "NarrativeText", @@ -1302,14 +1222,14 @@ "text": "The digitization of historical documents can unlock valuable data that can shed light on many important social, economic, and historical questions. Yet due to scan noises, page wearing, and the prevalence of complicated layout structures, ob- taining a structured representation of historical document scans is often extremely complicated. In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese firm financial ta- bles with complicated layouts. The pipeline applies two layout models to identify different levels of document structures and two customized OCR engines for optimized character recog- nition accuracy." 
}, { - "type": "FigureCaption", - "element_id": "b33b2bc3b9c416673c7f74c6a00c49d8", + "type": "Image", + "element_id": "d32d5d93079c0053b7ef655185e47bb4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "(spe peepee, ‘Active Learning Layout Annotate Layout Dataset | + ‘Annotation Toolkit ¥ a Deep Leaming Layout Model Training & Inference, ¥ ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <—— Default ane Customized ¥ ee Layout Structure Visualization & Export | <—— | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules" + "text": "Annotate Layout Dataset(spe peepee,Active Learning LayoutAnnotation Toolkit4Layout Detection<—Deep Learning LayoutModel Training & Inference,4Post-processin Handy Data Structures &pl 9 APIs for Layout DataText Recognition Default and Customized: r OCR ModelsVisualization & Export |], bayou StructureVisualization & StorageThe Japanese DocumentDigitization PipelineHelpful LayoutParserModules" }, { "type": "NarrativeText", @@ -1323,23 +1243,13 @@ }, { "type": "NarrativeText", - "element_id": "4005fd5e1a8a65c8e989071255cd7386", + "element_id": "de8f09a4156ca73defac521bb354a297", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "15 A document page consists of eight rows like this. For simplicity we skip the row" - }, - { - "type": "Title", - "element_id": "5d0786de7b188a10caffb32c951327a2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 11 - }, - "text": "segmentation discussion and refer readers to the source code when available." + "text": "& document page consists of eight rows like this. For simplicity we skip the row segmentation discussion and refer readers to the source code when available." 
}, { "type": "UncategorizedText", @@ -1412,44 +1322,14 @@ "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR." }, { - "type": "NarrativeText", - "element_id": "d11adbfd88959ce24fbfdc7f8155e777", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 12 - }, - "text": "16 This measures the overlap between the detected and ground-truth characters, and" - }, - { - "type": "NarrativeText", - "element_id": "5b6b4f6a5766bdb4f09f0a0387a3a373", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 12 - }, - "text": "the maximum is 1." - }, - { - "type": "NarrativeText", - "element_id": "48033291e6d72fefde1a56827e6dacfb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 12 - }, - "text": "17 This measures the number of edits from the ground-truth text to the predicted text," - }, - { - "type": "NarrativeText", - "element_id": "5737ba23368c5333b0c39f7e8e474d03", + "type": "ListItem", + "element_id": "122f0a4bde97c6e10e95c6e54479e34e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "and lower is better." + "text": "16 This measures the overlap between the detected and ground-truth characters, and the maximum is 1. '7 This measures the number of edits from the ground-truth text to the predicted text, and lower is better." 
}, { "type": "Title", @@ -1472,14 +1352,14 @@ "text": "13" }, { - "type": "FigureCaption", - "element_id": "7d42bb6af1404a95a6e8870d5c4d07bf", + "type": "Image", + "element_id": "f58d47bde7ebddd81c4a678c918a8f1b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 13 }, - "text": "(@) Partial table at the bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line" + "text": "(2) Partial table atthe bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line" }, { "type": "NarrativeText", @@ -1592,84 +1472,14 @@ "text": "References" }, { - "type": "UncategorizedText", - "element_id": "b5bf13691648f2be7e686436513a7366", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man´e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensorflow.org" - }, - { - "type": "NarrativeText", - "element_id": "098ca0ae774b51e7eba5dbe98641da88", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[2] Alberti, M., Pondenkandath, V., W¨ursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423–428. 
IEEE (2018)" - }, - { - "type": "NarrativeText", - "element_id": "0054c11c9691968349806c35f6aa5f0f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296–300. IEEE (2009)" - }, - { - "type": "NarrativeText", - "element_id": "607a64b13da109e96c62ecaedce91c4f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365–9374 (2019)" - }, - { - "type": "UncategorizedText", - "element_id": "9409d20f2ee25336c2566bda8d8bb83c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[5] Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale" - }, - { - "type": "NarrativeText", - "element_id": "44c5093519506610b07942b24d966d77", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "Hierarchical Image Database. In: CVPR09 (2009)" - }, - { - "type": "NarrativeText", - "element_id": "ad1bf75fc53d123c878f8254f9304c9f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. 
PMLR (2017)" - }, - { - "type": "NarrativeText", - "element_id": "c6e835fe03323406543926cc0f5a94de", + "type": "ListItem", + "element_id": "af2a971baba0e022d1e53fc0e44b1d94", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)" + "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, ot G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Mané, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Viégas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), software available from tensorflow.org Alberti, M., Pondenkandath, V., Wiirsch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423-428. IEEE (2018) Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296-300. IEEE (2009) Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365-9374 (2019) Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale Hierarchical Image Database. In: CVPRO9 (2009) Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. 
In: International Conference on Machine Learning. pp. 980-989. PMLR (2017) Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180-1189. PMLR (2015)" }, { "type": "Title", @@ -1692,164 +1502,14 @@ "text": "15" }, { - "type": "UncategorizedText", - "element_id": "16390873ae6b6a173fc894a873bab022", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[9]" - }, - { - "type": "NarrativeText", - "element_id": "068bf90a7743f50c4a00d4827035e42f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the" - }, - { - "type": "NarrativeText", - "element_id": "813cac1316043d454f3c928740435736", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)" - }, - { - "type": "NarrativeText", - "element_id": "2f103adde52e35a8853cbb476720a6ef", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. 
arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)" - }, - { - "type": "Title", - "element_id": "4d54eb351d8fc3bfbbf7286aa15eabe3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)" - }, - { - "type": "NarrativeText", - "element_id": "124b6b55da69fccc1c06568bda34f63c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)" - }, - { - "type": "Title", - "element_id": "9b9688203e9cdea89ded788342be4032", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J." - }, - { - "type": "UncategorizedText", - "element_id": "e90f44c0e10f9acb4d8f4c5895846d1e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "2007(159), 2 (Jul 2007)" - }, - { - "type": "NarrativeText", - "element_id": "3e0b97d540b7b43ad61292a89a58137f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. 
IEEE (2011)" - }, - { - "type": "NarrativeText", - "element_id": "80498c312fd32cb744e5953dfef18604", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143" - }, - { - "type": "NarrativeText", - "element_id": "09cfad31b28b1315b0bc7bd219136057", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767" - }, - { - "type": "NarrativeText", - "element_id": "be647bda3f1ca1b63554ef22d1313a43", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)" - }, - { - "type": "NarrativeText", - "element_id": "890eb2d0b6b7dbf00a5e0a4ad2f82107", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. 
Springer (2014)" - }, - { - "type": "NarrativeText", - "element_id": "62b12089ccbd0d2dd2f6c292cfa6a6fb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)" - }, - { - "type": "NarrativeText", - "element_id": "f7cfa7ca2e7175d8bdba9c0cb26a7c98", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)" - }, - { - "type": "NarrativeText", - "element_id": "aae12b8f70e03a3e35015ebda5974ebe", + "type": "ListItem", + "element_id": "ab02ce354f7464ee1d53d58faa93745f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)" + "text": "17 18 19 20 Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) Lukasz Garncarek, Powalski, R., Stanistawek, T., Topolski, B., Halama, P., Graliriski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020) Graves, A., Fernandez, $., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. 
In: Proceedings of the 23rd international conference on Machine learning. pp. 369-376 (2006) Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991-995. IEEE (2015) He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 2961-2969 (2017) He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770-778 (2016) Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J. 2007(159), 2 (Jul 2007) Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42-47. IEEE (2011) Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120-122. UIST 20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https: //doi.org/10.1145/3379350.3416143 Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055-3062. Association for Computing Machinery, New York, NY, USA (2020), Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019) Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740-755. 
Springer (2014) Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431-3440 (2015) Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, $., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161-168 (2011) Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7-12. IEEE (2018)" }, { "type": "UncategorizedText", @@ -1872,183 +1532,13 @@ "text": "Z. Shen et al." }, { - "type": "NarrativeText", - "element_id": "1abcfa28cce9b0f5194dec0d534f28e5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)" - }, - { - "type": "NarrativeText", - "element_id": "f7c67eae65521c3a753337d08c5a7cc3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91–99 (2015)" - }, - { - "type": "NarrativeText", - "element_id": "4f43b2e563a35ae0208a8626f7e3280e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. 
In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)" - }, - { - "type": "UncategorizedText", - "element_id": "b66713d3f2d1689f9174e1cb87429eed", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning" - }, - { - "type": "UncategorizedText", - "element_id": "10a3ff59f6157f21733e659a41031f83", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of" - }, - { - "type": "NarrativeText", - "element_id": "219033258f3fff3de33bed379610c8f3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)" - }, - { - "type": "NarrativeText", - "element_id": "285ce5849d6fd9036e5d16724c024ab9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 
572–573 (2020)" - }, - { - "type": "NarrativeText", - "element_id": "a18dcb504d62cb9f8ed4641014b6eeb2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)" - }, - { - "type": "NarrativeText", - "element_id": "da6733a53c75743361e9edcc1d36a20c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)" - }, - { - "type": "NarrativeText", - "element_id": "385c241b43ef196663b8d30a6b8768ed", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) [35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. 
https://" - }, - { - "type": "NarrativeText", - "element_id": "d207e2724a17741e3ae1986d63cb5636", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)" - }, - { - "type": "Title", - "element_id": "93d261a89a8422fb8d166e6cdf95d8f6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "github.com/facebookresearch/detectron2 (2019)" - }, - { - "type": "NarrativeText", - "element_id": "9dce913bddaa63724f5de64e539b7016", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "based layout annotation. arXiv preprint arXiv:2010.01762 (2020)" - }, - { - "type": "Title", - "element_id": "2625b6830768eac986cfee208c0270de", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "text and layout for document image understanding (2019)" - }, - { - "type": "Title", - "element_id": "21d399ba787aabbf69a8ca861cbcc4a3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:" - }, - { - "type": "Title", - "element_id": "462753569cb801c6f858759742a93793", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166" - }, - { - "type": "Title", - "element_id": "c7fc0ade487926854bb602bca85fad60", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "layout analysis." 
- }, - { - "type": "UncategorizedText", - "element_id": "96c49c3fbbb585f8062778e9a404b00f", + "type": "ListItem", + "element_id": "993f472d953f5d0e4054f1d4ad6fc4f0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "largest dataset ever for doc- In: 2019 International Conference on Document IEEE (Sep 2019)." + "text": "23 github. com/facebookresearch/detectron2) (2019) Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257-260. IEEE (2010) Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572-573 (2020) Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142-147. IEEE (2019) Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91-99 (2015) Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. 
IEEE transactions on neural networks 20(1), 61-80 (2008) Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017) Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548-549 (2020) Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning based layout annotation. arXiv preprint arXiv:2010.01762 (2020) Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720-725. IEEE (2019) Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020) Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of text and layout for document image understanding (2019) Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (Sep 2019). 
https: //doi.org/10.1109/ICDAR.2019.00166" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index 4baf9be5a6..a8cc14e267 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -91,7 +91,7 @@ }, { "type": "ListItem", - "element_id": "f1d5f4ed63a14db581e985bf15416cdd", + "element_id": "4f0cdff19ccd9010b64eff87ced8e0b7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -105,79 +105,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000–19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way for a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017–19) levels of about 3.5 percent." 
- }, - { - "type": "ListItem", - "element_id": "c4e0168ffab999611a92e8ebd8fe48a9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022" - }, - { - "type": "NarrativeText", - "element_id": "74180a93b38b6808f8cff7439e5d16d2", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress." 
- }, - { - "type": "ListItem", - "element_id": "5e9b501fc056965a744f6598d022f31d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With" - }, - { - "type": "NarrativeText", - "element_id": "9f5a3fe548f011e304fda9067caa0824", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." + "text": "© Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. 
The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000-19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way Jor a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017-19) levels of about 3.5 percent. © = The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress. © In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." 
}, { "type": "Title", @@ -1097,117 +1025,9 @@ }, "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, - { - "type": "NarrativeText", - "element_id": "70f05b9620aa1b7236058898e7e59192", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "economies. There is a 0.4 percentage point upward revision for annual growth in 2023, reflecting carryover effects from domestic demand resilience in 2022, but a 0.2 percentage point downward revision of growth in 2024 due to the steeper path of Federal Reserve rate hikes, to a peak of about 5.1 percent in 2023." - }, - { - "type": "ListItem", - "element_id": "fd6c549473e196512c076844988f465c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6" - }, - { - "type": "NarrativeText", - "element_id": "cdcaed7d1296edd658256d603cb3828c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": 
"application/pdf", - "page_number": 5 - }, - "text": "percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers." - }, - { - "type": "ListItem", - "element_id": "3be6554964c172468cceaee89294f59d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point" - }, - { - "type": "NarrativeText", - "element_id": "7e32067b6a4662d72b1244a3aac91be5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets." 
- }, { "type": "ListItem", - "element_id": "b24771387a5318eeda21adaa49629186", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal" - }, - { - "type": "NarrativeText", - "element_id": "f8b94e8d9a593a1debae96fce2040db7", + "element_id": "becf96ae2fa1045c14996c3de7a05bb8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1221,7 +1041,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." + "text": "economies. There is a 0.4 percentage point upward revision for annual growth in 2023, reflecting carryover effects from domestic demand resilience in 2022, but a 0.2 percentage point downward revision of growth in 2024 due to the steeper path of Federal Reserve rate hikes, to a peak of about 5.1 percent in 2023. Growth in the ero area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers. 
Growth in the United Kingdom is projected to be —0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets. Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." }, { "type": "NarrativeText", @@ -1259,45 +1079,9 @@ }, "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2" }, - { - "type": "NarrativeText", - "element_id": "237bc02ecaaf27f074be0c466b31cc09", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. 
Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024." - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "" - }, { "type": "ListItem", - "element_id": "afde979c99a73646915fe253c85c5a9c", + "element_id": "bba948699d4f21aaf5001520bb796e17", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1311,7 +1095,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. 
In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" + "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2 percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to tise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024. Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Rwssia in 2022 (estimated at —2.2 percent compared with a predicted —3.4 percent) followed by modestly positive growth in 2023. 
At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgtades of 0.2 percentage point for Brazi/ and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" }, { "type": "UncategorizedText", @@ -1367,63 +1151,9 @@ }, "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, - { - "type": "NarrativeText", - "element_id": "e7a8e30d6d49ffbca56f87cd6883c9a0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth." 
- }, - { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "e" - }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "" - }, - { - "type": "NarrativeText", - "element_id": "25e2f1dc031b5421b8a234945098e58b", + "element_id": "e0fc62fcfa1add3cf912fbaf3e0c9ba1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1437,7 +1167,7 @@ "filetype": "application/pdf", "page_number": 6 }, - "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. 
The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." + "text": " Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. 
In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." }, { "type": "Title", @@ -3420,8 +3150,8 @@ "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { - "type": "NarrativeText", - "element_id": "d379a79a55cecddeed62b21eb6a0ff00", + "type": "ListItem", + "element_id": "79a6a9353dc2a500e2e50e720cf8ab7c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3435,11 +3165,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China." + "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China. e Faster disinflation: An easing in labor market pressures in some advanced economies due to falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening." 
}, { - "type": "ListItem", - "element_id": "2bbe57e6c291db638d3fcddca9e0199a", + "type": "NarrativeText", + "element_id": "a2f806b25a06969405637298b4c85139", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3453,11 +3183,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Faster disinflation: An easing in labor market pressures in some advanced economies due to" + "text": "Downside risks—Numerous downside risks continue to weigh on the global outlook, lowering growth while, in a number of cases, adding further to inflation:" }, { - "type": "NarrativeText", - "element_id": "3f9155fad634c620bd9b820132e20935", + "type": "ListItem", + "element_id": "e9fbac47e4ed0c2d153022a284a77919", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3471,191 +3201,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening." + "text": "© = China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems. e = =War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of vulnerability, particularly for Europe and lower-income countries. 
Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing ptice spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase. e Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. e = Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy. e = Sudden financial market repricing: A prematute easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. 
For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy. © Geopolitical fragmentation: The wat in Ukraine and the related international sanctions aimed at e pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute." }, { - "type": "NarrativeText", - "element_id": "a2f806b25a06969405637298b4c85139", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Downside risks—Numerous downside risks continue to weigh on the global outlook, lowering growth while, in a number of cases, adding further to inflation:" - }, - { - "type": "ListItem", - "element_id": "90a90e12a4c6b8b74d3c8d20a76f22dc", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital" - }, - { - "type": "NarrativeText", - "element_id": "1bbcee85386321e6e8235a64d4c34d73", - "metadata": { - "data_source": { - "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." - }, - { - "type": "ListItem", - "element_id": "42ac57e394bf7c98d908745cefce0b80", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of" - }, - { - "type": "NarrativeText", - "element_id": "fdb59d523afa92db3942dabc88d94fc4", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "vulnerability, particularly for Europe and lower-income countries. 
Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase." - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "" - }, - { - "type": "ListItem", - "element_id": "2d14934d52ff357c52e9ae1c38f7390e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. 
About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." - }, - { - "type": "ListItem", - "element_id": "33ccff3014b460178e62d9c8021fd728", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Sudden financial market repricing: A premature easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy." 
- }, - { - "type": "ListItem", - "element_id": "75bd22ee0ba778cc3a616ed0a9b42292", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at  pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing" - }, - { - "type": "NarrativeText", - "element_id": "810e5a86eae657e179ac8da86f317a62", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "earlier geopolitical tensions, such as those associated with the US-China trade dispute." 
- }, - { - "type": "Title", - "element_id": "8ae18586f23aa212e66aeb12a5638609", + "type": "Title", + "element_id": "8ae18586f23aa212e66aeb12a5638609", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3961,97 +3511,7 @@ }, { "type": "ListItem", - "element_id": "bd7674df887463bc9f05c8030a151dea", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global" - }, - { - "type": "NarrativeText", - "element_id": "cb704f1b6d23bfe23f6b4109c471ac8b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. 
Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." - }, - { - "type": "ListItem", - "element_id": "af6eef18ec41f4980c1a4cbb5b7d4fec", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Strengthening global trade: Strengthening the global trading system would address risks associated" - }, - { - "type": "Title", - "element_id": "0695b563acde461fc2f8d9aebccf35c7", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "with" - }, - { - "type": "NarrativeText", - "element_id": "e6f343736720ae4f9bf5202294c7c9fc", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "trade fragmentation. 
This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." - }, - { - "type": "ListItem", - "element_id": "d6f6afcf055ed3084a0fac1093458c88", + "element_id": "8dbc8ad2da37799a3719a01d44d2e506", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4065,43 +3525,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks." 
- }, - { - "type": "ListItem", - "element_id": "089c5759e7030e34a3b537d9e20bcd13", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly" - }, - { - "type": "NarrativeText", - "element_id": "77ac1fdd449fba59a90d978745964463", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." + "text": "e = Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. e = Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. 
It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non— Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes. e — Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system. e Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks. e Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." }, { "type": "Title", @@ -4644,8 +4068,8 @@ "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. 
However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." }, { - "type": "NarrativeText", - "element_id": "261bebc8fb9b3ed5146d23644639bc26", + "type": "UncategorizedText", + "element_id": "e7f6c011776e8db7cd330b54174fd76f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4659,11 +4083,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need © —— Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report. to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess theit outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." 
+ "text": "6" }, { "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4677,11 +4101,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "5" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4695,11 +4119,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4713,7 +4137,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "6" + "text": "3" }, { "type": "UncategorizedText", @@ -4735,7 +4159,7 @@ }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4749,11 +4173,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "1" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "Title", + "element_id": "6ef230728534d871e5126e2a55e12b26", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4767,11 +4191,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "Figure 1.2. 
Market-Implied Expectations of Policy Rates (Percent)" }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "UncategorizedText", + "element_id": "3e48114b7946f4dd7a12ae0b2c1121af", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4785,11 +4209,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "© ——" }, { - "type": "Title", - "element_id": "6ef230728534d871e5126e2a55e12b26", + "type": "ListItem", + "element_id": "7d4f55875c970d850a152ba1d5ba02a5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4803,11 +4227,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" + "text": "1. United States" }, { "type": "Title", - "element_id": "24a234895630131d612fc1b4605a256e", + "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4821,11 +4245,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 23" + "text": "Latest" }, { - "type": "ListItem", - "element_id": "7d4f55875c970d850a152ba1d5ba02a5", + "type": "Title", + "element_id": "53d79cec96694df67ce3baff95d8a2e3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4839,11 +4263,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1. United States" + "text": "October 2022 GFSR" }, { - "type": "Title", - "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", + "type": "ListItem", + "element_id": "8e655408cf212df5f74df13e05cdf02c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4857,11 +4281,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 
23" + "text": "2. Euro area" }, { - "type": "Title", - "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", + "type": "UncategorizedText", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4875,11 +4299,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Latest" + "text": "5" }, { - "type": "Title", - "element_id": "d8478f45b9790d52201238244d0e9698", + "type": "UncategorizedText", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4893,11 +4317,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 24" + "text": "4" }, { - "type": "Title", - "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", + "type": "UncategorizedText", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4911,11 +4335,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 26" + "text": "3" }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "UncategorizedText", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4929,11 +4353,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 
22" + "text": "2" }, { - "type": "Title", - "element_id": "53d79cec96694df67ce3baff95d8a2e3", + "type": "UncategorizedText", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4947,11 +4371,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "October 2022 GFSR" + "text": "1" }, { - "type": "ListItem", - "element_id": "8e655408cf212df5f74df13e05cdf02c", + "type": "Title", + "element_id": "49cf8421218222b21a0fc54ffce584c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4965,7 +4389,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2. Euro area" + "text": "Oct. 22" }, { "type": "Title", @@ -5040,8 +4464,8 @@ "text": "Dec. 26" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "Title", + "element_id": "49cf8421218222b21a0fc54ffce584c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5055,11 +4479,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "Oct. 22" }, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "type": "Title", + "element_id": "24a234895630131d612fc1b4605a256e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5073,11 +4497,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "Apr. 23" }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "Title", + "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5091,11 +4515,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "Oct. 
23" }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "Title", + "element_id": "d8478f45b9790d52201238244d0e9698", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5109,11 +4533,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "Dec. 24" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "Title", + "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5127,7 +4551,25 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "Dec. 26" + }, + { + "type": "NarrativeText", + "element_id": "2dd1b91ebd6543b4902626a579552919", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", + "version": 265756457651539296174748931590365722430, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" + }, + "date_modified": "2023-02-14T07:31:28" + }, + "filetype": "application/pdf", + "page_number": 11 + }, + "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess their outlook for monetary policy given central bank communications. 
Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json index ef0bcde4d8..fc40f88495 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -199,7 +199,7 @@ }, { "type": "ListItem", - "element_id": "9c4387f669c689e9af0a712fd494b2d7", + "element_id": "e18242a460d9d495ea7cffee38c1e647", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -213,43 +213,7 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "The need for harmony in the nuclear regulatory environment" - }, - { - "type": "ListItem", - "element_id": "93e7dedc9d334470067ad2de1f9ee788", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "The need for a holistic safety paradigm for the whole electricity system." 
- }, - { - "type": "ListItem", - "element_id": "3cc3e847449fed4fa13bbd94f86e43a9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "The need to create a level playing field that values reliability and energy security" + "text": "° The need to create a level playing field that values reliability and energy security ° The need for harmony in the nuclear regulatory environment ° The need for a holistic safety paradigm for the whole electricity system." }, { "type": "UncategorizedText", @@ -3439,7 +3403,7 @@ }, { "type": "NarrativeText", - "element_id": "338d3e15917414641f2b559473f168f8", + "element_id": "0ad07326f56e66781da5dbb9488eaa67", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3453,7 +3417,7 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand x" + "text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand”" }, { "type": "NarrativeText", @@ -3474,8 +3438,8 @@ "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. 
By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times." }, { - "type": "FigureCaption", - "element_id": "eeda9f9210dfe4be7e82b4385290d3ca", + "type": "Image", + "element_id": "36ca9b7cdbbcba729a46487cf86c07eb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3636,224 +3600,8 @@ "text": "i" }, { - "type": "Title", - "element_id": "5d7f49449ab22deac22d767b89549c55", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ii" - }, - { - "type": "Title", - "element_id": "f5557d4fcf727a981a3c315aca733eef", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iii" - }, - { - "type": "Title", - "element_id": "0ab306823035661bb8dba21cc2535231", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iv" - }, - { - "type": "Title", - "element_id": 
"d3fc2842ddfad4c8d3859f84d4439bfd", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Vv" - }, - { - "type": "Title", - "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "v" - }, - { - "type": "Title", - "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vi" - }, - { - "type": "Title", - "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vi" - }, - { - "type": "Title", - "element_id": 
"c1d2906220d1eef1b17422b7132872a8", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vii" - }, - { - "type": "NarrativeText", - "element_id": "de72de35f0092bdd3107011f3be18dc0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "International Energy Agency (2018), World Energy Outlook 2018. Data accessed from https://www.iea.org/weo/ – Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the results likely to stem from the implementation of announced policy intentions – with visual modification by World Nuclear Association. International Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=WORLD&year=2016&category=Electricity&indicator=ElecGenByFuel&mode =chart&dataTable=ELECTRICITYANDHEAT – with visual modifications by World Nuclear Association. International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 °C. Accessed from: https://www.ipcc.ch/sr15/ International Energy Agency (2019), Nuclear Power in a Clean Energy System. 
Accessed from: https://www.iea.org/ publications/nuclear/ International Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs of generating Electricity – 2015 Edition. Accessed from: https://www.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf International Atomic Energy Agency (2015), Technical challenges in the application and licensing of digital instrumentation and control systems in nuclear power plants. Accessed from: https://www-pub.iaea.org/MTCD/ Publications/PDF/P1695_web.pdf" - }, - { - "type": "NarrativeText", - "element_id": "b6396ecd6f60e3dcca17c045c00846c1", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "viii Paul-Scherrer Institute. 
Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)" - }, - { - "type": "Title", - "element_id": "ed171375d0bf81eaa5512140c3a29b8f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ix" - }, - { - "type": "Title", - "element_id": "2d711642b726b04401627ca9fbac32f5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "x" - }, - { - "type": "UncategorizedText", - "element_id": "5897aff759a5cc8d94710101c73af296", + "type": "ListItem", + "element_id": "ffc47b19bb43cce8c23421b5c78b17b4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3867,7 +3615,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "and NRC SOARCA study 2015 International Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview Ibid." + "text": "i nternational Energy Agency (20 results Nuclear Association. ii nternational iii nternational Energy Agency (20 publications/nuclear/ 8), World Energy Outloo! Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=>WORLD&year=20 =chart&dataTable=ELECTRICITYANDHEAT - with visual modifications by World Nuclear Association. 9), Nuclear Power in a CI 2018. 
Data accessed from https://www.iea.org/weo/ — Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the ikely to stem from the implementation of announced policy intentions — with visual modification by World 6&category=Electricity&indicator=ElecGenByFuel&mode lean Energy System. Accessed from: https://www.iea.org/ iv Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 °C. Accessed from: https:/Awww.ipce.ch/sr15/ Vv nternational Energy Agency (20 publications/nuclear/ vi nternational vii International Publications/PDF/P1695_web.pdf and NRC SOARCA study 2015 ix nternational x bid. 9), Nuclear Power in a CI Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs o 2015 Edition. Accessed from: https:/Awww.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf Atomic Energy Agency (2015), Technical challenges in the application and instrumentation and control systems in nuclear power plants. Accessed from: https://www-pub.iaea.org/MTCD/ Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview lean Energy System. Accessed from: https://www.iea.org/ generating Electricity — icensing of digital Paul-Scherrer Institute. 
Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json index a0b176312b..715b8fc617 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -1549,7 +1549,7 @@ }, { "type": "NarrativeText", - "element_id": "8921c0f3c29bc04c22c9c40f4eef6613", + "element_id": "a9d31d88b0e2026dbed12c8b5536ab2b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1563,7 +1563,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3" + "text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution®" }, { "type": "NarrativeText", @@ -1585,7 +1585,7 @@ }, { "type": "NarrativeText", - "element_id": "e450813fe6430d87c4caa64e4792bc74", + "element_id": "1ff44442b3a554331aaf4ffb30b7eda6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1599,25 +1599,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. 3 Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). 
Data for nuclear accidents modified to reflect the" - }, - { - "type": "Title", - "element_id": "31138d5dc0c297144d27d5dbd15d5ef0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "2012 UNSCEAR report and the 2015 US NRC SOARCA study." + "text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. $ Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). Data for nuclear accidents modified to reflect the 2012 UNSCEAR report and the 2015 US NRC SOARCA study." }, { "type": "UncategorizedText", @@ -2178,386 +2160,8 @@ "text": "i" }, { - "type": "NarrativeText", - "element_id": "e72fdf383c0b4d8cba0284d4f7ff06d5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "World Health Organization (2020). Road traffic injuries. 
Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" - }, - { - "type": "Title", - "element_id": "5d7f49449ab22deac22d767b89549c55", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ii" - }, - { - "type": "Title", - "element_id": "f5557d4fcf727a981a3c315aca733eef", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iii" - }, - { - "type": "Title", - "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "v" - }, - { - "type": "Title", - "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": 
"utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vi" - }, - { - "type": "NarrativeText", - "element_id": "9d45931b60fa1041a13243a1ee1bb170", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP." - }, - { - "type": "NarrativeText", - "element_id": "794a96b3ab9a3e860f65549c3a106704", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/" - }, - { - "type": "NarrativeText", - "element_id": "94178a8c2e84bf4b8f2eed9c79d7cfd5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. 
Available at: https://www.cancerresearchuk.org/health-" - }, - { - "type": "NarrativeText", - "element_id": "4051afedda98549176dc28aaa9087e81", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific" - }, - { - "type": "NarrativeText", - "element_id": "d85940c91ae6b53fc4b41bd5137e7371", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-" - }, - { - "type": "NarrativeText", - "element_id": "9a236889bced20048d1619798291d194", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vii World Health Organization. (2016). 
Updated tables 2016 for ‘Preventing disease through health environments: a" - }, - { - "type": "NarrativeText", - "element_id": "26a84724035df76d7d8a6610a6fa4627", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/" - }, - { - "type": "Title", - "element_id": "6e98dee26ce2439cd4b8af82426e894e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "understanding/statistics" - }, - { - "type": "Title", - "element_id": "759772833f6756e511150b2a49233864", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "professional/cancer-statistics/risk" - }, - { - "type": "Title", - "element_id": "86c0a0cef7faa217f386f75ead17dbec", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 
306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "sheets/detail/climate-change-and-health" - }, - { - "type": "Title", - "element_id": "7267222b91f507e040c69dad9af7941f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "the-full-costs-of-electricity-provision?details=true" - }, - { - "type": "NarrativeText", - "element_id": "2ef1e8614bc32af635d2a0c894b2ed3c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." 
- }, - { - "type": "NarrativeText", - "element_id": "e4d7c811a799c3c8e706125556f8a370", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712" - }, - { - "type": "NarrativeText", - "element_id": "98e5f594de0e79990a0650489fdf295c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" - }, - { - "type": "NarrativeText", - "element_id": "d5658e2a49995a2f4ca4b45d95f2058b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "global assessment of the burden of disease from environmental risks’. 
Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" - }, - { - "type": "NarrativeText", - "element_id": "c328c06c32c00c43471cd3c9d257c68b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" - }, - { - "type": "NarrativeText", - "element_id": "6bbd046b939157389606adf4059fe1f3", + "type": "ListItem", + "element_id": "158d56841d65947a9a91a3ca34163a4c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2571,7 +2175,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" + "text": "Vi VIL xi xii World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https:/Awww.bbc.co.uk/news/ business-50953712 Slovic, P, 2010. The Psychology of risk. Sauide e Sociedade, 19(4), pp. 731-747. United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific Committee on the Effects of Atomic Radiation. 
Accessed from: https:/Avww.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018 Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8 World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021] National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/ understanding/statistics Cancer Research UK (n.d.). Cancer risk statistics. Available at: https:/Awww.cancerresearchuk.org/health- professional/cancer-statistics/risk OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https:/Avww.oecd-nea.org/jcms/pl_14998/ the-full-costs-of-electricity-provision?details=true World Health Organization (2018). Climate change and health. Available at: https:/Awww.who.int/news-room/fact- sheets/detail/climate-change-and-health BP 2020. 
BP Statistical Review of World Energy, London: BP" }, { "type": "NarrativeText", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3703d5d96a..2b9078795c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev0" # pragma: no cover +__version__ = "0.10.19-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f1b03b63bd..4cfa6b044a 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -329,7 +329,11 @@ def _partition_pdf_or_image_local( ocr_languages = prepare_languages_for_tesseract(languages) - model_name = model_name if model_name else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME") + model_name = ( + model_name + if model_name + else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "detectron2_onnx") + ) pdf_image_dpi = kwargs.pop("pdf_image_dpi", None) extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False) image_output_dir_path = kwargs.get("image_output_dir_path", None) From 4d6492391c1ad6f3eefd274303b0587f6c2dd24e Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Mon, 2 Oct 2023 11:31:15 -0400 Subject: [PATCH 19/31] roman/update language kwarg partition ingest (#1605) ### Description Change partition kwarg from ocr_languages -> languages Closes out https://github.com/Unstructured-IO/unstructured/issues/1588 --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- unstructured/ingest/processor.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 264cf9f44e..a4e8771278 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev1 +## 0.10.19-dev2 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2b9078795c..bfddceeb0e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = 
"0.10.19-dev1" # pragma: no cover +__version__ = "0.10.19-dev2" # pragma: no cover diff --git a/unstructured/ingest/processor.py b/unstructured/ingest/processor.py index a133b72732..a91fc671b5 100644 --- a/unstructured/ingest/processor.py +++ b/unstructured/ingest/processor.py @@ -107,10 +107,11 @@ def process_documents( verbose: bool, dest_doc_connector: t.Optional[BaseDestinationConnector] = None, ) -> None: + languages = partition_config.ocr_languages.split("+") if partition_config.ocr_languages else [] process_document_with_partition_args = partial( process_document, strategy=partition_config.strategy, - ocr_languages=partition_config.ocr_languages, + languages=languages, encoding=partition_config.encoding, pdf_infer_table_structure=partition_config.pdf_infer_table_structure, ) From 0abebb5fe6b415ad44ef1423997df0644d983c27 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 2 Oct 2023 12:08:26 -0400 Subject: [PATCH 20/31] fix: fix benchmark script when DOCKER_TEST=true (#1515) The home directory for our dockerfile changed and broke this script. To verify, try running the benchmark script: ``` export DOCKER_TEST=true ./scripts/performance/benchmark.sh ``` I'll pull in the latest changelog before merging. 
--- CHANGELOG.md | 2 +- scripts/performance/benchmark.sh | 4 ++-- unstructured/__version__.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4e8771278..1f1a1d4778 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev2 +## 0.10.19-dev3 ### Enhancements diff --git a/scripts/performance/benchmark.sh b/scripts/performance/benchmark.sh index 0cbf7f4337..878f22f4dd 100755 --- a/scripts/performance/benchmark.sh +++ b/scripts/performance/benchmark.sh @@ -44,9 +44,9 @@ if [[ "$DOCKER_TEST" == "true" ]]; then -e GIT_HASH="$GIT_HASH" \ -e SLOW_FILES="${SLOW_FILES[*]}" \ -e HI_RES_STRATEGY_FILES="${HI_RES_STRATEGY_FILES[*]}" \ - -v "${SCRIPT_DIR}":/home/scripts/performance \ + -v "${SCRIPT_DIR}":/home/notebook-user/scripts/performance \ unstructured:perf-test \ - bash /home/scripts/performance/benchmark-local.sh 2>&1 | tee >(while IFS= read -r line; do + bash /home/notebook-user/scripts/performance/benchmark-local.sh 2>&1 | tee >(while IFS= read -r line; do read_benchmark_logs_for_results done) else diff --git a/unstructured/__version__.py b/unstructured/__version__.py index bfddceeb0e..a84e0335aa 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev2" # pragma: no cover +__version__ = "0.10.19-dev3" # pragma: no cover From ed2bf7eb66a6d3e3dbd1f31590b321db6b383073 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Mon, 2 Oct 2023 10:43:41 -0700 Subject: [PATCH 21/31] build(test): ingest test fixture updates uses larger runners (#1612) --- .github/workflows/ingest-test-fixtures-update-pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 7ca7d242f3..63b97fd1f4 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ 
b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -37,7 +37,7 @@ jobs: make install-ci update-fixtures-and-pr: - runs-on: ubuntu-latest + runs-on: ubuntu-latest-m env: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup] From 11cdd8d71f61e8549056baf2c7bbe687579d58ff Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:47:24 -0400 Subject: [PATCH 22/31] roman/drop downloads in ingest tests (#1614) ### Description In an effort to mitigate resource consumption when running CI tests, cleanup download dir for ingest tests after each one. --- .github/workflows/ci.yml | 1 + .github/workflows/ingest-test-fixtures-update-pr.yml | 1 + CHANGELOG.md | 2 +- test_unstructured_ingest/test-ingest-airtable-diff.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-airtable-large.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-azure.sh | 5 ++++- test_unstructured_ingest/test-ingest-biomed-api.sh | 5 ++++- test_unstructured_ingest/test-ingest-biomed-path.sh | 5 ++++- test_unstructured_ingest/test-ingest-box.sh | 9 ++++++++- .../test-ingest-confluence-diff.sh | 9 ++++++++- .../test-ingest-confluence-large.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-delta-table.sh | 4 ++++ test_unstructured_ingest/test-ingest-discord.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-dropbox.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-elasticsearch.sh | 4 ++++ test_unstructured_ingest/test-ingest-gcs.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-github.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-gitlab.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-google-drive.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-jira.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-notion.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-onedrive.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-outlook.sh | 9 ++++++++- .../test-ingest-pdf-fast-reprocess.sh | 9 ++++++++- 
test_unstructured_ingest/test-ingest-s3.sh | 5 ++++- test_unstructured_ingest/test-ingest-salesforce.sh | 9 ++++++++- .../test-ingest-sharepoint-embed-cog-index.sh | 9 +++++++++ test_unstructured_ingest/test-ingest-sharepoint.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-slack.sh | 9 ++++++++- test_unstructured_ingest/test-ingest-wikipedia.sh | 10 +++++++++- test_unstructured_ingest/test-ingest.sh | 8 ++++---- unstructured/__version__.py | 2 +- unstructured/ingest/runner/box.py | 2 +- 33 files changed, 203 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c5ff1e2920..9d4f066fec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -293,6 +293,7 @@ jobs: AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} TABLE_OCR: "tesseract" ENTIRE_PAGE_OCR: "tesseract" + CI: "true" run: | source .venv/bin/activate sudo apt-get update diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 63b97fd1f4..724a893128 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -91,6 +91,7 @@ jobs: TABLE_OCR: "tesseract" ENTIRE_PAGE_OCR: "tesseract" OVERWRITE_FIXTURES: "true" + CI: "true" run: | source .venv/bin/activate sudo apt-get update diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f1a1d4778..009b31a377 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev3 +## 0.10.19-dev4 ### Enhancements diff --git a/test_unstructured_ingest/test-ingest-airtable-diff.sh b/test_unstructured_ingest/test-ingest-airtable-diff.sh index 11727e298a..8c69a31146 100755 --- a/test_unstructured_ingest/test-ingest-airtable-diff.sh +++ b/test_unstructured_ingest/test-ingest-airtable-diff.sh @@ -10,11 +10,18 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_FOLDER_NAME=airtable-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +CI=${CI:-"false"} max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT VARIED_DATA_BASE_ID="app5YQxSfp220fWtm" VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88" diff --git a/test_unstructured_ingest/test-ingest-airtable-large.sh b/test_unstructured_ingest/test-ingest-airtable-large.sh index a5a26be1cf..b87e728187 100755 --- a/test_unstructured_ingest/test-ingest-airtable-large.sh +++ b/test_unstructured_ingest/test-ingest-airtable-large.sh @@ -12,10 +12,17 @@ OUTPUT_FOLDER_NAME=airtable-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set." 
diff --git a/test_unstructured_ingest/test-ingest-azure.sh b/test_unstructured_ingest/test-ingest-azure.sh index 38e27294d9..9fdb9dd5e5 100755 --- a/test_unstructured_ingest/test-ingest-azure.sh +++ b/test_unstructured_ingest/test-ingest-azure.sh @@ -11,7 +11,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ azure \ diff --git a/test_unstructured_ingest/test-ingest-biomed-api.sh b/test_unstructured_ingest/test-ingest-biomed-api.sh index 0f09757d62..bf0de6998f 100755 --- a/test_unstructured_ingest/test-ingest-biomed-api.sh +++ b/test_unstructured_ingest/test-ingest-biomed-api.sh @@ -12,7 +12,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k diff --git a/test_unstructured_ingest/test-ingest-biomed-path.sh b/test_unstructured_ingest/test-ingest-biomed-path.sh index 49d2f2f72c..b726364ef3 100755 --- a/test_unstructured_ingest/test-ingest-biomed-path.sh +++ b/test_unstructured_ingest/test-ingest-biomed-path.sh @@ -12,7 +12,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k diff --git a/test_unstructured_ingest/test-ingest-box.sh b/test_unstructured_ingest/test-ingest-box.sh index 08e6803066..43a8ad38ff 100755 --- a/test_unstructured_ingest/test-ingest-box.sh +++ 
b/test_unstructured_ingest/test-ingest-box.sh @@ -11,10 +11,17 @@ OUTPUT_FOLDER_NAME=box OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." diff --git a/test_unstructured_ingest/test-ingest-confluence-diff.sh b/test_unstructured_ingest/test-ingest-confluence-diff.sh index d785ff3a18..c9c0c21483 100755 --- a/test_unstructured_ingest/test-ingest-confluence-diff.sh +++ b/test_unstructured_ingest/test-ingest-confluence-diff.sh @@ -10,10 +10,17 @@ OUTPUT_FOLDER_NAME=confluence-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." 
diff --git a/test_unstructured_ingest/test-ingest-confluence-large.sh b/test_unstructured_ingest/test-ingest-confluence-large.sh index 7a5114e340..c1196bdd3d 100755 --- a/test_unstructured_ingest/test-ingest-confluence-large.sh +++ b/test_unstructured_ingest/test-ingest-confluence-large.sh @@ -12,10 +12,17 @@ OUTPUT_FOLDER_NAME=confluence-large OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-delta-table.sh b/test_unstructured_ingest/test-ingest-delta-table.sh index d019017a7a..d4c79a8f0d 100755 --- a/test_unstructured_ingest/test-ingest-delta-table.sh +++ b/test_unstructured_ingest/test-ingest-delta-table.sh @@ -9,6 +9,7 @@ OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set." 
@@ -21,6 +22,9 @@ source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$DESTINATION_TABLE" cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi } trap cleanup EXIT diff --git a/test_unstructured_ingest/test-ingest-discord.sh b/test_unstructured_ingest/test-ingest-discord.sh index b55e37ab9f..7aedb2b352 100755 --- a/test_unstructured_ingest/test-ingest-discord.sh +++ b/test_unstructured_ingest/test-ingest-discord.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=discord OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$DISCORD_TOKEN" ]; then echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set." 
diff --git a/test_unstructured_ingest/test-ingest-dropbox.sh b/test_unstructured_ingest/test-ingest-dropbox.sh index e58f5c6389..b591f0cdd8 100755 --- a/test_unstructured_ingest/test-ingest-dropbox.sh +++ b/test_unstructured_ingest/test-ingest-dropbox.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=dropbox OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" diff --git a/test_unstructured_ingest/test-ingest-elasticsearch.sh b/test_unstructured_ingest/test-ingest-elasticsearch.sh index 33a755b222..530ddf1bed 100755 --- a/test_unstructured_ingest/test-ingest-elasticsearch.sh +++ b/test_unstructured_ingest/test-ingest-elasticsearch.sh @@ -9,6 +9,7 @@ OUTPUT_FOLDER_NAME=elasticsearch OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh @@ -21,6 +22,9 @@ function cleanup() { fi cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi } trap cleanup EXIT diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh index 5827105dfb..dd43710941 100755 --- a/test_unstructured_ingest/test-ingest-gcs.sh +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=gcs 
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." diff --git a/test_unstructured_ingest/test-ingest-github.sh b/test_unstructured_ingest/test-ingest-github.sh index a81be26732..4061bea956 100755 --- a/test_unstructured_ingest/test-ingest-github.sh +++ b/test_unstructured_ingest/test-ingest-github.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=github OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none} diff --git a/test_unstructured_ingest/test-ingest-gitlab.sh b/test_unstructured_ingest/test-ingest-gitlab.sh index d8e7ce5fe9..1a9031c7a7 100755 --- a/test_unstructured_ingest/test-ingest-gitlab.sh +++ b/test_unstructured_ingest/test-ingest-gitlab.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=gitlab OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir 
"$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT PYTHONPATH=. ./unstructured/ingest/main.py \ gitlab \ diff --git a/test_unstructured_ingest/test-ingest-google-drive.sh b/test_unstructured_ingest/test-ingest-google-drive.sh index 12d802fe48..218a5cfe0a 100755 --- a/test_unstructured_ingest/test-ingest-google-drive.sh +++ b/test_unstructured_ingest/test-ingest-google-drive.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=google-drive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." 
diff --git a/test_unstructured_ingest/test-ingest-jira.sh b/test_unstructured_ingest/test-ingest-jira.sh index 3982141cb8..173fc4f94b 100755 --- a/test_unstructured_ingest/test-ingest-jira.sh +++ b/test_unstructured_ingest/test-ingest-jira.sh @@ -9,10 +9,17 @@ OUTPUT_FOLDER_NAME=jira-diff OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-notion.sh b/test_unstructured_ingest/test-ingest-notion.sh index b7e9c399f4..2a83a47bb3 100755 --- a/test_unstructured_ingest/test-ingest-notion.sh +++ b/test_unstructured_ingest/test-ingest-notion.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=notion OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$NOTION_API_KEY" ]; then echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." 
diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 6e683351ed..290643815d 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=onedrive OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set." diff --git a/test_unstructured_ingest/test-ingest-outlook.sh b/test_unstructured_ingest/test-ingest-outlook.sh index fdc3e90bfc..384287e7ea 100755 --- a/test_unstructured_ingest/test-ingest-outlook.sh +++ b/test_unstructured_ingest/test-ingest-outlook.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=outlook OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set." 
diff --git a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh index a17c91b806..96acee7bd3 100755 --- a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh @@ -9,10 +9,17 @@ OUTPUT_FOLDER_NAME=pdf-fast-reprocess OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME INPUT_PATH=$SCRIPT_DIR/download max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$INPUT_PATH" + fi +} +trap cleanup EXIT echo "REPROCESS INPUT PATH" ls "$INPUT_PATH" diff --git a/test_unstructured_ingest/test-ingest-s3.sh b/test_unstructured_ingest/test-ingest-s3.sh index c48941ab12..214a70ab71 100755 --- a/test_unstructured_ingest/test-ingest-s3.sh +++ b/test_unstructured_ingest/test-ingest-s3.sh @@ -12,7 +12,10 @@ max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" +} +trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k diff --git a/test_unstructured_ingest/test-ingest-salesforce.sh b/test_unstructured_ingest/test-ingest-salesforce.sh index a9ee1a106c..04f686e1d9 100755 --- a/test_unstructured_ingest/test-ingest-salesforce.sh +++ b/test_unstructured_ingest/test-ingest-salesforce.sh @@ -11,10 +11,17 @@ OUTPUT_FOLDER_NAME=salesforce OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source 
"$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set." diff --git a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh index 5ea8b9b416..738848e008 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint-embed-cog-index.sh @@ -11,6 +11,7 @@ DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)" # The vector configs on the schema currently only exist on versions: # 2023-07-01-Preview, 2021-04-30-Preview, 2020-06-30-Preview API_VERSION=2023-07-01-Preview +CI=${CI:-"false"} if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ] ; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." 
@@ -27,6 +28,9 @@ if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then exit 0 fi +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh + function cleanup { response_code=$(curl -s -o /dev/null -w "%{http_code}" \ "https://utic-test-ingest-fixtures.search.windows.net/indexes/$DESTINATION_INDEX?api-version=$API_VERSION" \ @@ -41,6 +45,11 @@ function cleanup { else echo "Index $DESTINATION_INDEX does not exist, nothing to delete" fi + + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi } trap cleanup EXIT diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh index 46fd041a8d..8eefa87a60 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=Sharepoint OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." 
diff --git a/test_unstructured_ingest/test-ingest-slack.sh b/test_unstructured_ingest/test-ingest-slack.sh index e8974e1502..ff51d63692 100755 --- a/test_unstructured_ingest/test-ingest-slack.sh +++ b/test_unstructured_ingest/test-ingest-slack.sh @@ -8,10 +8,17 @@ OUTPUT_FOLDER_NAME=slack OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT if [ -z "$SLACK_TOKEN" ]; then echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set." diff --git a/test_unstructured_ingest/test-ingest-wikipedia.sh b/test_unstructured_ingest/test-ingest-wikipedia.sh index eb168aa731..1dc5e428b4 100755 --- a/test_unstructured_ingest/test-ingest-wikipedia.sh +++ b/test_unstructured_ingest/test-ingest-wikipedia.sh @@ -8,9 +8,17 @@ OUTPUT_FOLDER_NAME=wikipedia OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +CI=${CI:-"false"} + # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh -trap 'cleanup_dir "$OUTPUT_DIR"' EXIT +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi +} +trap cleanup EXIT PYTHONPATH=. 
./unstructured/ingest/main.py \ wikipedia \ diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 926821943e..56568b37f0 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -11,6 +11,10 @@ export OMP_THREAD_LIMIT=1 scripts=( 'test-ingest-s3.sh' 'test-ingest-azure.sh' +'test-ingest-biomed-api.sh' +'test-ingest-biomed-path.sh' +## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option +'test-ingest-pdf-fast-reprocess.sh' 'test-ingest-box.sh' 'test-ingest-discord.sh' 'test-ingest-dropbox.sh' @@ -18,8 +22,6 @@ scripts=( 'test-ingest-gitlab.sh' 'test-ingest-google-drive.sh' 'test-ingest-wikipedia.sh' -'test-ingest-biomed-api.sh' -'test-ingest-biomed-path.sh' 'test-ingest-local.sh' 'test-ingest-slack.sh' 'test-ingest-against-api.sh' @@ -39,8 +41,6 @@ scripts=( 'test-ingest-delta-table.sh' 'test-ingest-salesforce.sh' 'test-ingest-jira.sh' -## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option -'test-ingest-pdf-fast-reprocess.sh' 'test-ingest-sharepoint.sh' ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a84e0335aa..884933d470 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev3" # pragma: no cover +__version__ = "0.10.19-dev4" # pragma: no cover diff --git a/unstructured/ingest/runner/box.py b/unstructured/ingest/runner/box.py index 7ac9d44d7e..1856f075ca 100644 --- a/unstructured/ingest/runner/box.py +++ b/unstructured/ingest/runner/box.py @@ -23,7 +23,7 @@ def box( ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) read_config.download_dir = update_download_dir_remote_url( - connector_name="azure", + connector_name="box", read_config=read_config, remote_url=remote_url, logger=logger, From 89bd2faaf759d3cb6d6cb64578e9d089ee6acbe6 Mon Sep 17 00:00:00 2001 From: unifyh 
<18213435+unifyh@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:17:51 +0800 Subject: [PATCH 23/31] fix: Fix various cases of HTML text missing after partition (#1587) Fix 4 cases of text missing after partition: 1. Text immediately after `<body>` ```html <body>missing1<p>hello</p></body>
``` 2. Text inside container and immediately after `<br/>` ```html <div>hello<br/>missing2</div>
``` 3. Text immediately after a text opening tag, if said tag contains `<br/>` ```html <p>missing3<br/>hello</p>
``` 4. Text inside `<body>` if it is the only content (different cause from case 1) ```html <body>missing4</body> ``` Also fix problem causing `test_unstructured/documents/test_html.py::test_exclude_tag_types` to not work as intended. This will close GitHub Issue#1543 --- CHANGELOG.md | 5 +++ test_unstructured/documents/test_html.py | 40 ++++++++++++++++++++++-- unstructured/documents/html.py | 12 +++++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 009b31a377..8426b0fdd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ ### Fixes +* **Fix various cases of HTML text missing after partition** + Problem: Under certain circumstances, text immediately after some HTML tags will be misssing from partition result. + Fix: Updated code to deal with these cases. + Importance: This will ensure the correctness when partitioning HTML and Markdown documents. + ## 0.10.18 diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index d6d236f08f..02f6d6bc72 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -17,6 +17,7 @@ from unstructured.documents.html import ( HEADING_TAGS, LIST_ITEM_TAGS, + SECTION_TAGS, TABLE_TAGS, TEXT_TAGS, HTMLDocument, @@ -41,8 +42,15 @@ TAGS = TAGS.replace(">", "").split("<")[1:] -INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + ["div"] -EXCLUDED_TAGS = "tag", [tag for tag in TAGS if tag not in INCLUDED_TAGS] +VOID_TAGS = "

" +VOID_TAGS = VOID_TAGS.replace(">", "").split("<")[1:] + +INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS +EXCLUDED_TAGS = [ + tag + for tag in TAGS + if tag not in (INCLUDED_TAGS + TABLE_TAGS + VOID_TAGS + ["html", "head", "body"]) +] @pytest.fixture() @@ -685,3 +693,31 @@ def test_sample_doc_with_emoji(): # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners # and the byte string representation when running locally on mac assert doc.elements[0].text in ["Hello again ð\x9f\x98\x80", "Hello again 😀"] + + +def test_only_plain_text_in_body(): + raw_html = "Hello" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + + +def test_plain_text_before_anything_in_body(): + raw_html = "Hello

World

" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + assert doc.elements[1].text == "World" + + +def test_line_break_in_container(): + raw_html = "
Hello
World
" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + assert doc.elements[1].text == "World" + + +@pytest.mark.parametrize("tag", TEXT_TAGS) +def test_line_break_in_text_tag(tag): + raw_html = f"<{tag}>Hello
World" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + assert doc.elements[1].text == "World" diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 3bfdb1e680..1fbbcbcdfa 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -417,7 +417,7 @@ def _is_container_with_text(tag_elem: etree.Element) -> bool:
Please read my message!
""" - if tag_elem.tag not in SECTION_TAGS or len(tag_elem) == 0: + if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0: return False if tag_elem.text is None or tag_elem.text.strip() == "": @@ -451,6 +451,12 @@ def _has_break_tags(tag_elem: etree._Element) -> bool: # pyright: ignore[report def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]: unfurled = [] + + if tag_elem.text: + _tag_elem = etree.Element(tag_elem.tag) + _tag_elem.text = tag_elem.text + unfurled.append(_tag_elem) + children = tag_elem.getchildren() for child in children: if not _has_break_tags(child): @@ -474,13 +480,13 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool: if len(tag_elem) > max_predecessor_len + empty_elems_len: return False - if tag_elem.tag in TEXT_TAGS + HEADING_TAGS: + if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS: return True # NOTE(robinson) - This indicates that a div tag has no children. If that's the # case and the tag has text, its potential a text tag children = tag_elem.getchildren() - if tag_elem.tag in SECTION_TAGS and len(children) == 0: + if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0: return True if _has_adjacent_bulleted_spans(tag_elem, children): From 9d81971fcb5610456b578fa6b5e2e6844b3685d8 Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:01:41 -0400 Subject: [PATCH 24/31] update ingest python doc (#1446) ### Description Updating the python version of the example docs to show how to run the same code that the CLI runs, but using python. Rather than copying the same command that would be run via the terminal and using the subprocess library to run it, this updates it to use the supported code exposed in the inference directory. For now only the wikipedia one has been updated to get some opinions on this before updating all other connector docs. 
Would close out https://github.com/Unstructured-IO/unstructured/issues/1445 --- CHANGELOG.md | 9 +- docs/source/source_connectors/airtable.rst | 80 ++++++--------- docs/source/source_connectors/azure.rst | 91 ++++++----------- docs/source/source_connectors/biomed.rst | 82 +++++++--------- docs/source/source_connectors/box.rst | 86 +++++++--------- docs/source/source_connectors/confluence.rst | 86 +++++++--------- docs/source/source_connectors/delta_table.rst | 82 ++++++---------- docs/source/source_connectors/discord.rst | 92 ++++++++--------- docs/source/source_connectors/dropbox.rst | 86 +++++++--------- .../source_connectors/elasticsearch.rst | 86 +++++++--------- docs/source/source_connectors/github.rst | 80 ++++++--------- docs/source/source_connectors/gitlab.rst | 80 ++++++--------- .../google_cloud_storage.rst | 78 ++++++--------- .../source/source_connectors/google_drive.rst | 80 +++++++-------- docs/source/source_connectors/jira.rst | 86 +++++++--------- .../source_connectors/local_connector.rst | 80 ++++++--------- docs/source/source_connectors/notion.rst | 86 +++++++--------- docs/source/source_connectors/onedrive.rst | 98 ++++++++----------- docs/source/source_connectors/outlook.rst | 94 ++++++++---------- docs/source/source_connectors/reddit.rst | 96 ++++++++---------- docs/source/source_connectors/s3.rst | 78 ++++++--------- docs/source/source_connectors/salesforce.rst | 94 ++++++++---------- docs/source/source_connectors/sharepoint.rst | 96 +++++++++--------- docs/source/source_connectors/slack.rst | 86 +++++++--------- docs/source/source_connectors/wikipedia.rst | 79 +++++++-------- examples/ingest/sharepoint/ingest.sh | 1 + unstructured/__version__.py | 2 +- unstructured/ingest/runner/airtable.py | 4 +- unstructured/ingest/runner/azure.py | 4 +- unstructured/ingest/runner/biomed.py | 8 +- unstructured/ingest/runner/box.py | 6 +- unstructured/ingest/runner/confluence.py | 2 +- unstructured/ingest/runner/delta_table.py | 2 +- 
unstructured/ingest/runner/discord.py | 4 +- unstructured/ingest/runner/dropbox.py | 6 +- unstructured/ingest/runner/elasticsearch.py | 4 +- unstructured/ingest/runner/fsspec.py | 4 +- unstructured/ingest/runner/gcs.py | 6 +- unstructured/ingest/runner/github.py | 6 +- unstructured/ingest/runner/gitlab.py | 6 +- unstructured/ingest/runner/google_drive.py | 6 +- unstructured/ingest/runner/jira.py | 8 +- unstructured/ingest/runner/local.py | 6 +- unstructured/ingest/runner/notion.py | 4 +- unstructured/ingest/runner/onedrive.py | 8 +- unstructured/ingest/runner/outlook.py | 12 +-- unstructured/ingest/runner/reddit.py | 8 +- unstructured/ingest/runner/s3.py | 6 +- unstructured/ingest/runner/salesforce.py | 4 +- unstructured/ingest/runner/sharepoint.py | 4 +- unstructured/ingest/runner/slack.py | 6 +- unstructured/ingest/runner/wikipedia.py | 4 +- 52 files changed, 938 insertions(+), 1274 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8426b0fdd4..3d54f444db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,15 @@ -## 0.10.19-dev4 +## 0.10.19-dev5 ### Enhancements * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. +* **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. + +## 0.10.17-dev3 + +### Enhancements + +* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. 
This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. ### Features diff --git a/docs/source/source_connectors/airtable.rst b/docs/source/source_connectors/airtable.rst index 66939236d0..839ec9acff 100644 --- a/docs/source/source_connectors/airtable.rst +++ b/docs/source/source_connectors/airtable.rst @@ -29,29 +29,21 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "airtable", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN", - "--output-dir", "airtable-ingest-output" - "--num-processes", "2", - "--reprocess", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.airtable import airtable + + if __name__ == "__main__": + airtable( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="airtable-ingest-output", + num_processes=2, + ), + personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN"), + ) Run via the API --------------- @@ -78,31 +70,23 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "airtable", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN", - "--output-dir", "airtable-ingest-output" - "--num-processes", "2", - "--reprocess", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.airtable import airtable + + if __name__ == "__main__": + airtable( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="airtable-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN"), + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/azure.rst b/docs/source/source_connectors/azure.rst index e78ad11e70..479f4e1d58 100644 --- a/docs/source/source_connectors/azure.rst +++ b/docs/source/source_connectors/azure.rst @@ -28,28 +28,20 @@ Run Locally .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "azure", - "--remote-url", "abfs://container1/", - "--account-name", "azureunstructured1" - "--output-dir", "/Output/Path/To/Files", - "--num-processes", "2", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.azure import azure + + if __name__ == "__main__": + azure( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="azure-ingest-output", + num_processes=2, + ), + remote_url="abfs://container1/", + account_name="azureunstructured1", + ) Run via the API --------------- @@ -62,43 +54,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: shell - unstructured-ingest \ - azure \ - --remote-url abfs://container1/ \ - --account-name azureunstructured1 \ - --output-dir azure-ingest-output \ - --num-processes 2 \ - --partition-by-api \ - --api-key "" - - .. tab:: Python - - .. code:: python - - import subprocess - - command = [ - "unstructured-ingest", - "azure", - "--remote-url", "abfs://container1/", - "--account-name", "azureunstructured1" - "--output-dir", "/Output/Path/To/Files", - "--num-processes", "2", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.azure import azure + + if __name__ == "__main__": + azure( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="azure-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + remote_url="abfs://container1/", + account_name="azureunstructured1", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/biomed.rst b/docs/source/source_connectors/biomed.rst index 8cbd579c26..cec1deab6c 100644 --- a/docs/source/source_connectors/biomed.rst +++ b/docs/source/source_connectors/biomed.rst @@ -29,29 +29,21 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "biomed", - "--path", "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf", - "--output-dir", "/Output/Path/To/Files", - "--num-processes", "2", - "--verbose", - "--preserve-downloads", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.biomed import biomed + + if __name__ == "__main__": + biomed( + verbose=True, + read_config=ReadConfig( + preserve_downloads=True, + ), + partition_config=PartitionConfig( + output_dir="biomed-ingest-output-path", + num_processes=2, + ), + path="oa_pdf/07/07/sbaa031.073.PMC7234218.pdf", + ) Run via the API --------------- @@ -78,31 +70,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "biomed", - "--path", "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf", - "--output-dir", "/Output/Path/To/Files", - "--num-processes", "2", - "--verbose", - "--preserve-downloads", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.biomed import biomed + + if __name__ == "__main__": + biomed( + verbose=True, + read_config=ReadConfig( + preserve_downloads=True, + ), + partition_config=PartitionConfig( + output_dir="biomed-ingest-output-path", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + path="oa_pdf/07/07/sbaa031.073.PMC7234218.pdf", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/docs/source/source_connectors/box.rst b/docs/source/source_connectors/box.rst index bf42ea512c..c075566db9 100644 --- a/docs/source/source_connectors/box.rst +++ b/docs/source/source_connectors/box.rst @@ -30,30 +30,23 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "box", - "--box_app_config", "$BOX_APP_CONFIG_PATH" - "--remote-url", "box://utic-test-ingest-fixtures" - "--output-dir", "box-output" - "--num-processes", "2" - "--recursive", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.box import box + + if __name__ == "__main__": + box( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="box-output", + num_processes=2, + ), + box_app_config=os.getenv("BOX_APP_CONFIG_PATH"), + recursive=True, + remote_url="box://utic-test-ingest-fixtures", + ) Run via the API --------------- @@ -81,32 +74,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "box", - "--box_app_config", "$BOX_APP_CONFIG_PATH" - "--remote-url", "box://utic-test-ingest-fixtures" - "--output-dir", "box-output" - "--num-processes", "2" - "--recursive", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.box import box + + if __name__ == "__main__": + box( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="box-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + box_app_config=os.getenv("BOX_APP_CONFIG_PATH"), + recursive=True, + remote_url="box://utic-test-ingest-fixtures", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/confluence.rst b/docs/source/source_connectors/confluence.rst index b9606d7c6c..83c3eda7e6 100644 --- a/docs/source/source_connectors/confluence.rst +++ b/docs/source/source_connectors/confluence.rst @@ -30,30 +30,22 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "confluence", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--url", "https://unstructured-ingest-test.atlassian.net", - "--user-email", "12345678@unstructured.io", - "--api-token", "ABCDE1234ABDE1234ABCDE1234", - "--output-dir", "confluence-ingest-output", - "--num-processes", "2", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.confluence import confluence + + if __name__ == "__main__": + confluence( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="confluence-ingest-output", + num_processes=2, + metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"], + ), + url="https://unstructured-ingest-test.atlassian.net", + user_email="12345678@unstructured.io", + api_token="ABCDE1234ABDE1234ABCDE1234", + ) Run via the API --------------- @@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "confluence", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--url", "https://unstructured-ingest-test.atlassian.net", - "--user-email", "12345678@unstructured.io", - "--api-token", "ABCDE1234ABDE1234ABCDE1234", - "--output-dir", "confluence-ingest-output", - "--num-processes", "2", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.confluence import confluence + + if __name__ == "__main__": + confluence( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="confluence-ingest-output", + num_processes=2, + metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"], + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + url="https://unstructured-ingest-test.atlassian.net", + user_email="12345678@unstructured.io", + api_token="ABCDE1234ABDE1234ABCDE1234", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/delta_table.rst b/docs/source/source_connectors/delta_table.rst index 62b504206c..b8d18d94f4 100644 --- a/docs/source/source_connectors/delta_table.rst +++ b/docs/source/source_connectors/delta_table.rst @@ -29,30 +29,20 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "delta-table", - "--table-uri", "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/", - "--download-dir", "delta-table-ingest-download", - "--output-dir", "delta-table-example", - "--preserve-downloads", - "--storage_options", "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.delta_table import delta_table + + if __name__ == "__main__": + delta_table( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="delta-table-example", + num_processes=2, + ), + table_uri="s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/", + storage_options="AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" + ) Run via the API @@ -79,32 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "delta-table", - "--table-uri", "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/", - "--download-dir", "delta-table-ingest-download", - "--output-dir", "delta-table-example", - "--preserve-downloads", - "--storage_options", "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.delta_table import delta_table + + if __name__ == "__main__": + delta_table( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="delta-table-example", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + table_uri="s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/", + storage_options="AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/discord.rst b/docs/source/source_connectors/discord.rst index cb2c4829a4..9455b7eeb3 100644 --- a/docs/source/source_connectors/discord.rst +++ b/docs/source/source_connectors/discord.rst @@ -30,30 +30,26 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "discord", - "--channels", "12345678", - "--token", "$DISCORD_TOKEN", - "--download-dir", "discord-ingest-download", - "--output-dir", "discord-example", - "--preserve-downloads", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.discord import discord + + if __name__ == "__main__": + discord( + verbose=True, + read_config=ReadConfig( + download_dir="discord-ingest-download", + preserve_downloads=True, + ), + partition_config=PartitionConfig( + output_dir="discord-example", + num_processes=2, + ), + channels=["12345678"], + token=os.getenv("DISCORD_TOKEN"), + period=None, + ) Run via the API --------------- @@ -81,32 +77,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "discord", - "--channels", "12345678", - "--token", "$DISCORD_TOKEN", - "--download-dir", "discord-ingest-download", - "--output-dir", "discord-example", - "--preserve-downloads", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.discord import discord + + if __name__ == "__main__": + discord( + verbose=True, + read_config=ReadConfig( + download_dir="discord-ingest-download", + preserve_downloads=True, + ), + partition_config=PartitionConfig( + output_dir="discord-example", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + channels=["12345678"], + token=os.getenv("DISCORD_TOKEN"), + period=None, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/docs/source/source_connectors/dropbox.rst b/docs/source/source_connectors/dropbox.rst index 515b23912b..f8e3d9c867 100644 --- a/docs/source/source_connectors/dropbox.rst +++ b/docs/source/source_connectors/dropbox.rst @@ -30,30 +30,23 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "dropbox", - "--remote-url", "dropbox:// /", - "--output-dir", "dropbox-output", - "--token", "$DROPBOX_TOKEN", - "--num-processes", "2", - "--recursive", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.dropbox import dropbox + + if __name__ == "__main__": + dropbox( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="dropbox-output", + num_processes=2, + ), + remote_url="dropbox:// /", + token=os.getenv("DROPBOX_TOKEN"), + recursive=True, + ) Run via the API --------------- @@ -81,32 +74,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "dropbox", - "--remote-url", "dropbox:// /", - "--output-dir", "dropbox-output", - "--token", "$DROPBOX_TOKEN", - "--num-processes", "2", - "--recursive", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.dropbox import dropbox + + if __name__ == "__main__": + dropbox( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="dropbox-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + remote_url="dropbox:// /", + token=os.getenv("DROPBOX_TOKEN"), + recursive=True, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_. diff --git a/docs/source/source_connectors/elasticsearch.rst b/docs/source/source_connectors/elasticsearch.rst index fd4238ab16..f8f7391ffb 100644 --- a/docs/source/source_connectors/elasticsearch.rst +++ b/docs/source/source_connectors/elasticsearch.rst @@ -30,30 +30,22 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "elasticsearch", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--url", "http://localhost:9200", - "--index-name", "movies", - "--jq-query", "{ethnicity, director, plot}", - "--output-dir", "elasticsearch-ingest-output", - "--num-processes", "2" - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.elasticsearch import elasticsearch + + if __name__ == "__main__": + elasticsearch( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="elasticsearch-ingest-output", + num_processes=2, + metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"], + ), + url="http://localhost:9200", + index_name="movies", + jq_query="{ethnicity, director, plot}", + ) Run via the API --------------- @@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "elasticsearch", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--url", "http://localhost:9200", - "--index-name", "movies", - "--jq-query", "{ethnicity, director, plot}", - "--output-dir", "elasticsearch-ingest-output", - "--num-processes", "2", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.elasticsearch import elasticsearch + + if __name__ == "__main__": + elasticsearch( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="elasticsearch-ingest-output", + num_processes=2, + metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"], + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + url="http://localhost:9200", + index_name="movies", + jq_query="{ethnicity, director, plot}", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/github.rst b/docs/source/source_connectors/github.rst index 0b08cac62c..a8ba7c52ca 100644 --- a/docs/source/source_connectors/github.rst +++ b/docs/source/source_connectors/github.rst @@ -29,29 +29,20 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "github", - "--url", "Unstructured-IO/unstructured", - "--git-branch", "main", - "--output-dir", "github-ingest-output", - "--num-processes", "2", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.github import github + + if __name__ == "__main__": + github( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="github-ingest-output", + num_processes=2, + ), + url="Unstructured-IO/unstructured", + git_branch="main", + ) Run via the API --------------- @@ -78,31 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "github", - "--url", "Unstructured-IO/unstructured", - "--git-branch", "main", - "--output-dir", "github-ingest-output", - "--num-processes", "2", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.github import github + + if __name__ == "__main__": + github( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="github-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + url="Unstructured-IO/unstructured", + git_branch="main", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/docs/source/source_connectors/gitlab.rst b/docs/source/source_connectors/gitlab.rst index e0f722c205..646fedb687 100644 --- a/docs/source/source_connectors/gitlab.rst +++ b/docs/source/source_connectors/gitlab.rst @@ -29,29 +29,20 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "gitlab", - "--url", "Unstructured-IO/unstructured", - "--git-branch", "v0.0.7", - "--output-dir", "gitlab-ingest-output", - "--num-processes", "2", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.gitlab import gitlab + + if __name__ == "__main__": + gitlab( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="gitlab-ingest-output", + num_processes=2, + ), + url="https://gitlab.com/gitlab-com/content-sites/docsy-gitlab", + git_branch="v0.0.7", + ) Run via the API --------------- @@ -78,31 +69,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "gitlab", - "--url", "Unstructured-IO/unstructured", - "--git-branch", "v0.0.7", - "--output-dir", "gitlab-ingest-output", - "--num-processes", "2", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.gitlab import gitlab + + if __name__ == "__main__": + gitlab( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="gitlab-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + url="https://gitlab.com/gitlab-com/content-sites/docsy-gitlab", + git_branch="v0.0.7", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_. diff --git a/docs/source/source_connectors/google_cloud_storage.rst b/docs/source/source_connectors/google_cloud_storage.rst index 96af2c968c..54e009fe8d 100644 --- a/docs/source/source_connectors/google_cloud_storage.rst +++ b/docs/source/source_connectors/google_cloud_storage.rst @@ -29,29 +29,20 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "gcs", - "--remote-url", "gs://utic-test-ingest-fixtures-public/", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--recursive", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.gcs import gcs + + if __name__ == "__main__": + gcs( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="gcs-output", + num_processes=2, + ), + remote_url="gs://utic-test-ingest-fixtures-public/", + recursive=True, + ) Run via the API --------------- @@ -76,29 +67,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "gcs", - "--remote-url", "gs://utic-test-ingest-fixtures-public/", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--recursive", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.gcs import gcs + + if __name__ == "__main__": + gcs( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="gcs-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + remote_url="gs://utic-test-ingest-fixtures-public/", + recursive=True, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/docs/source/source_connectors/google_drive.rst b/docs/source/source_connectors/google_drive.rst index 9ec9724285..869f680798 100644 --- a/docs/source/source_connectors/google_drive.rst +++ b/docs/source/source_connectors/google_drive.rst @@ -30,28 +30,21 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "google-drive", - "--drive-id", "", - "--service-account-key",, "Path/To/Your/Service/Account/Key" - "--output-dir", "/Output/Path/To/Files", - "--num-processes", "2", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.google_drive import gdrive + + if __name__ == "__main__": + gdrive( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="google-drive-ingest-output", + num_processes=2, + ), + drive_id="POPULATE WITH FILE OR FOLDER ID", + service_account_key="POPULATE WITH DRIVE SERVICE ACCOUNT KEY", + recursive=True, + ) Run via the API --------------- @@ -79,30 +72,25 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "google-drive", - "--drive-id", "", - "--service-account-key",, "Path/To/Your/Service/Account/Key" - "--output-dir", "/Output/Path/To/Files", - "--num-processes", "2", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. 
Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.google_drive import gdrive + + if __name__ == "__main__": + gdrive( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="google-drive-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + drive_id="POPULATE WITH FILE OR FOLDER ID", + service_account_key="POPULATE WITH DRIVE SERVICE ACCOUNT KEY", + recursive=True, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_. diff --git a/docs/source/source_connectors/jira.rst b/docs/source/source_connectors/jira.rst index 37b2056e62..f93bb26f40 100644 --- a/docs/source/source_connectors/jira.rst +++ b/docs/source/source_connectors/jira.rst @@ -31,30 +31,22 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "jira", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--url", "https://unstructured-jira-connector-test.atlassian.net", - "--user-email", "12345678@unstructured.io", - "--api-token", "ABCDE1234ABDE1234ABCDE1234", - "--output-dir", "jira-ingest-output", - "--num-processes", "2", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.jira import jira + + if __name__ == "__main__": + jira( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="jira-ingest-output", + num_processes=2, + metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"], + ), + url="https://unstructured-jira-connector-test.atlassian.net", + user_email="12345678@unstructured.io", + api_token="ABCDE1234ABDE1234ABCDE1234", + ) Run via the API --------------- @@ -82,32 +74,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "jira", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--url", "https://unstructured-jira-connector-test.atlassian.net", - "--user-email", "12345678@unstructured.io", - "--api-token", "ABCDE1234ABDE1234ABCDE1234", - "--output-dir", "jira-ingest-output", - "--num-processes", "2", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.jira import jira + + if __name__ == "__main__": + jira( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="jira-ingest-output", + num_processes=2, + metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"], + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + url="https://unstructured-jira-connector-test.atlassian.net", + user_email="12345678@unstructured.io", + api_token="ABCDE1234ABDE1234ABCDE1234", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_. diff --git a/docs/source/source_connectors/local_connector.rst b/docs/source/source_connectors/local_connector.rst index daa6645e52..b93ab589bf 100644 --- a/docs/source/source_connectors/local_connector.rst +++ b/docs/source/source_connectors/local_connector.rst @@ -23,29 +23,20 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "local", - "--input-path", "example-docs", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--recursive", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.local import local + + if __name__ == "__main__": + local( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="local-ingest-output", + num_processes=2, + ), + input_path="example-docs", + recursive=True, + ) Run via the API --------------- @@ -72,31 +63,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "local", - "--input-path", "example-docs", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--recursive", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.local import local + + if __name__ == "__main__": + local( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="local-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + input_path="example-docs", + recursive=True, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/notion.rst b/docs/source/source_connectors/notion.rst index a79bd2d251..3036a01924 100644 --- a/docs/source/source_connectors/notion.rst +++ b/docs/source/source_connectors/notion.rst @@ -30,30 +30,22 @@ Run Locally .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "notion", - "--api-key", "", - "--output-dir", "notion-ingest-output", - "--page-ids", "", - "--database-ids", """", - "--num-processes", "2", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.notion import notion + + if __name__ == "__main__": + notion( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="notion-ingest-output", + num_processes=2, + ), + api_key="POPULATE API KEY", + page_ids=["LIST", "OF", "PAGE", "IDS"], + database_ids=["LIST", "OF", "DATABASE", "IDS"], + recursive=False, + ) Run via the API --------------- @@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "notion", - "--api-key", "", - "--output-dir", "notion-ingest-output", - "--page-ids", "", - "--database-ids", """", - "--num-processes", "2", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.notion import notion + + if __name__ == "__main__": + notion( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="notion-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + api_key="POPULATE API KEY", + page_ids=["LIST", "OF", "PAGE", "IDS"], + database_ids=["LIST", "OF", "DATABASE", "IDS"], + recursive=False, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/onedrive.rst b/docs/source/source_connectors/onedrive.rst index 90241d0800..592a49313d 100644 --- a/docs/source/source_connectors/onedrive.rst +++ b/docs/source/source_connectors/onedrive.rst @@ -33,33 +33,25 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "onedrive", - "--client-id", "", - "--client-cred", "", - "--authority-url", "", - "--tenant", "", - "--user-pname", "", - "--path", "", - "--output-dir", "onedrive-ingest-output", - "--num-processes", "2", - "--verbose" - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.onedrive import onedrive + + if __name__ == "__main__": + onedrive( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="onedrive-ingest-output", + num_processes=2, + ), + client_id="", + client_cred="", + authority_url="", + tenant="", + user_pname="", + path="", + recursive=False, + ) Run via the API --------------- @@ -90,35 +82,29 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "onedrive", - "--client-id", "", - "--client-cred", "", - "--authority-url", "", - "--tenant", "", - "--user-pname", "", - "--path", "", - "--output-dir", "onedrive-ingest-output", - "--num-processes", "2", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.onedrive import onedrive + + if __name__ == "__main__": + onedrive( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="onedrive-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + client_id="", + client_cred="", + authority_url="", + tenant="", + user_pname="", + path="", + recursive=False, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/docs/source/source_connectors/outlook.rst b/docs/source/source_connectors/outlook.rst index ce3c1f29f0..0e78738614 100644 --- a/docs/source/source_connectors/outlook.rst +++ b/docs/source/source_connectors/outlook.rst @@ -33,33 +33,26 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "outlook", - "--client-id", "$MS_CLIENT_ID", - "--client-cred", "$MS_CLIENT_CRED", - "--tenant", "", - "--user-email", "$MS_USER_EMAIL", - "--outlook-folders", "Inbox,Sent Items", - "--output-dir", "onedrive-ingest-output", - "--num-processes", "2", - "--recursive", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.outlook import outlook + + if __name__ == "__main__": + outlook( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="outlook-output", + num_processes=2, + ), + client_id=os.getenv("MS_CLIENT_ID"), + client_cred=os.getenv("MS_CLIENT_CRED"), + tenant=os.getenv("MS_TENANT_ID"), + user_email=os.getenv("MS_USER_EMAIL"), + outlook_folders=["Inbox", "Sent Items"], + recursive=True, + ) Run via the API --------------- @@ -86,31 +79,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "airtable", - "--metadata-exclude", "filename,file_directory,metadata.data_source.date_processed", - "--personal-access-token", "$AIRTABLE_PERSONAL_ACCESS_TOKEN", - "--output-dir", "airtable-ingest-output" - "--num-processes", "2", - "--reprocess", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.outlook import outlook + + if __name__ == "__main__": + outlook( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="outlook-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + client_id=os.getenv("MS_CLIENT_ID"), + client_cred=os.getenv("MS_CLIENT_CRED"), + tenant=os.getenv("MS_TENANT_ID"), + user_email=os.getenv("MS_USER_EMAIL"), + outlook_folders=["Inbox", "Sent Items"], + recursive=True, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/reddit.rst b/docs/source/source_connectors/reddit.rst index f31d0d55ba..4d7c82be82 100644 --- a/docs/source/source_connectors/reddit.rst +++ b/docs/source/source_connectors/reddit.rst @@ -33,33 +33,24 @@ Run Locally .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "reddit", - "--subreddit-name", "machinelearning", - "--client-id", "", - "--client-secret", "", - "--user-agent", "Unstructured Ingest Subreddit fetcher by \\u\\...", - "--search-query", "Unstructured", - "--num-posts", "10", - "--output-dir", "reddit-ingest-output", - "--num-processes", "2", - "--verbose" - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.reddit import reddit + + if __name__ == "__main__": + reddit( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="reddit-ingest-output", + num_processes=2, + ), + subreddit_name="machinelearning", + client_id="", + client_secret="", + user_agent=r"Unstructured Ingest Subreddit fetcher by \\u\...", + search_query="Unstructured", + num_posts=10, + ) Run via the API --------------- @@ -90,35 +81,28 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "reddit", - "--subreddit-name", "machinelearning", - "--client-id", "", - "--client-secret", "", - "--user-agent", "Unstructured Ingest Subreddit fetcher by \\u\\...", - "--search-query", "Unstructured", - "--num-posts", "10", - "--output-dir", "reddit-ingest-output", - "--num-processes", "2", - "--verbose" - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. 
Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.reddit import reddit + + if __name__ == "__main__": + reddit( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="reddit-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + subreddit_name="machinelearning", + client_id="", + client_secret="", + user_agent=r"Unstructured Ingest Subreddit fetcher by \\u\...", + search_query="Unstructured", + num_posts=10, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/s3.rst b/docs/source/source_connectors/s3.rst index d2f16ad77f..483937a842 100644 --- a/docs/source/source_connectors/s3.rst +++ b/docs/source/source_connectors/s3.rst @@ -28,28 +28,20 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "s3", - "--remote-url", "s3://utic-dev-tech-fixtures/small-pdf-set/", - "--anonymous", - "--output-dir", "s3-small-batch-output", - "--num-processes", "2" - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.s3 import s3 + + if __name__ == "__main__": + s3( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="s3-small-batch-output", + num_processes=2, + ), + remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/", + anonymous=True, + ) Run via the API --------------- @@ -75,30 +67,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "s3", - "--remote-url", "s3://utic-dev-tech-fixtures/small-pdf-set/", - "--anonymous", - "--output-dir", "s3-small-batch-output", - "--num-processes", "2", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.s3 import s3 + + if __name__ == "__main__": + s3( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="s3-small-batch-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/", + anonymous=True, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/docs/source/source_connectors/salesforce.rst b/docs/source/source_connectors/salesforce.rst index 04183ed7a9..fd52ad6d6a 100644 --- a/docs/source/source_connectors/salesforce.rst +++ b/docs/source/source_connectors/salesforce.rst @@ -32,32 +32,25 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "salesforce", - "--username" "$SALESFORCE_USERNAME" - "--consumer-key" "$SALESFORCE_CONSUMER_KEY" - "--private-key-path" "$SALESFORCE_PRIVATE_KEY_PATH" - "--categories" "EmailMessage,Account,Lead,Case,Campaign" - "--output-dir" "salesforce-output" - "--num-processes", "2" - "--recursive", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.salesforce import salesforce + + if __name__ == "__main__": + salesforce( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="salesforce-output", + num_processes=2, + ), + username=os.getenv("SALESFORCE_USERNAME"), + consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"), + private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"), + categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"], + recursive=True, + ) Run via the API --------------- @@ -87,34 +80,27 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "salesforce", - "--username" "$SALESFORCE_USERNAME" - "--consumer-key" "$SALESFORCE_CONSUMER_KEY" - "--private-key-path" "$SALESFORCE_PRIVATE_KEY_PATH" - "--categories" "EmailMessage,Account,Lead,Case,Campaign" - "--output-dir" "salesforce-output" - "--num-processes", "2" - "--recursive", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.salesforce import salesforce + + if __name__ == "__main__": + salesforce( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="salesforce-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + username=os.getenv("SALESFORCE_USERNAME"), + consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"), + private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"), + categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"], + recursive=True, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/docs/source/source_connectors/sharepoint.rst b/docs/source/source_connectors/sharepoint.rst index 67ffe626d9..bc0b144dd9 100644 --- a/docs/source/source_connectors/sharepoint.rst +++ b/docs/source/source_connectors/sharepoint.rst @@ -25,37 +25,32 @@ Run Locally --files-only "Flag to process only files within the site(s)" \ --output-dir sharepoint-ingest-output \ --num-processes 2 \ + --path "Shared Documents" \ --verbose .. tab:: Python .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "sharepoint", - "--client-id", "", - "--client-cred", "", - "--site", "", - "--files-only", "Flag to process only files within the site(s)", - "--output-dir", "sharepoint-ingest-output", - "--num-processes", "2", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.sharepoint import sharepoint + + if __name__ == "__main__": + sharepoint( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="sharepoint-ingest-output", + num_processes=2, + ), + client_id="", + client_cred="", + site="", + # Flag to process only files within the site(s) + files_only=True, + path="Shared Documents", + recursive=False, + ) Run via the API --------------- @@ -77,6 +72,7 @@ You can also use upstream connectors with the ``unstructured`` API. For this you --output-dir sharepoint-ingest-output \ --num-processes 2 \ --verbose \ + --path "Shared Documents" \ --partition-by-api \ --api-key "" @@ -84,33 +80,29 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "sharepoint", - "--client-id", "", - "--client-cred", "", - "--site", "", - "--files-only", "Flag to process only files within the site(s)", - "--output-dir", "sharepoint-ingest-output", - "--num-processes", "2", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.sharepoint import sharepoint + + if __name__ == "__main__": + sharepoint( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="sharepoint-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + client_id="", + client_cred="", + site="", + # Flag to process only files within the site(s) + files_only=True, + path="Shared Documents", + recursive=False, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/slack.rst b/docs/source/source_connectors/slack.rst index dcb4700e60..53da13a257 100644 --- a/docs/source/source_connectors/slack.rst +++ b/docs/source/source_connectors/slack.rst @@ -30,30 +30,22 @@ Run Locally .. 
code:: python - import subprocess - - command = [ - "unstructured-ingest", - "slack", - "--channels", "12345678", - "--token", "12345678", - "--download-dir", "slack-ingest-download", - "--output-dir", "slack-ingest-output", - "--start-date", "2023-04-01T01:00:00-08:00", - "--end-date", "2023-04-02" - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.slack import slack + + if __name__ == "__main__": + slack( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="slack-ingest-download", + num_processes=2, + ), + channels=["12345678"], + token="12345678", + start_date="2023-04-01T01:00:00-08:00", + end_date="2023-04-02", + ) Run via the API --------------- @@ -81,32 +73,26 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "slack", - "--channels", "12345678", - "--token", "12345678", - "--download-dir", "slack-ingest-download", - "--output-dir", "slack-ingest-output", - "--start-date", "2023-04-01T01:00:00-08:00", - "--end-date", "2023-04-02", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.slack import slack + + if __name__ == "__main__": + slack( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="slack-ingest-download", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + channels=["12345678"], + token="12345678", + start_date="2023-04-01T01:00:00-08:00", + end_date="2023-04-02", + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. diff --git a/docs/source/source_connectors/wikipedia.rst b/docs/source/source_connectors/wikipedia.rst index 7d81160994..cf6a6af061 100644 --- a/docs/source/source_connectors/wikipedia.rst +++ b/docs/source/source_connectors/wikipedia.rst @@ -28,28 +28,21 @@ Run Locally .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "wikipedia", - "--page-title", "Open Source Software", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--verbose", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. 
Error:') - print(error.decode()) + from unstructured.ingest.runner.wikipedia import wikipedia + from unstructured.ingest.interfaces import ReadConfig, PartitionConfig + + + if __name__ == "__main__": + wikipedia( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="wikipedia-ingest-output", + num_processes=2 + ), + page_title="Open Source Software", + auto_suggest=False, + ) Run via the API --------------- @@ -75,30 +68,24 @@ You can also use upstream connectors with the ``unstructured`` API. For this you .. code:: python - import subprocess - - command = [ - "unstructured-ingest", - "wikipedia", - "--page-title", "Open Source Software", - "--output-dir", "dropbox-output", - "--num-processes", "2", - "--verbose", - "--partition-by-api", - "--api-key", "", - ] - - # Run the command - process = subprocess.Popen(command, stdout=subprocess.PIPE) - output, error = process.communicate() - - # Print output - if process.returncode == 0: - print('Command executed successfully. Output:') - print(output.decode()) - else: - print('Command failed. Error:') - print(error.decode()) + import os + + from unstructured.ingest.interfaces import PartitionConfig, ReadConfig + from unstructured.ingest.runner.wikipedia import wikipedia + + if __name__ == "__main__": + wikipedia( + verbose=True, + read_config=ReadConfig(), + partition_config=PartitionConfig( + output_dir="wikipedia-ingest-output", + num_processes=2, + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + page_title="Open Source Software", + auto_suggest=False, + ) Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here `_. 
diff --git a/examples/ingest/sharepoint/ingest.sh b/examples/ingest/sharepoint/ingest.sh index 4a73ca65fb..53a1218207 100644 --- a/examples/ingest/sharepoint/ingest.sh +++ b/examples/ingest/sharepoint/ingest.sh @@ -25,4 +25,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --files-only "Flag to process only files within the site(s)" \ --output-dir sharepoint-ingest-output \ --num-processes 2 \ + --path "Shared Documents" \ --verbose diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 884933d470..ea97a53bc8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev4" # pragma: no cover +__version__ = "0.10.19-dev5" # pragma: no cover diff --git a/unstructured/ingest/runner/airtable.py b/unstructured/ingest/runner/airtable.py index 92f5bd735d..48fc109b6a 100644 --- a/unstructured/ingest/runner/airtable.py +++ b/unstructured/ingest/runner/airtable.py @@ -10,11 +10,11 @@ def airtable( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, personal_access_token: str, - list_of_paths: t.Optional[str], + verbose: bool = False, + list_of_paths: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/azure.py b/unstructured/ingest/runner/azure.py index 90b08e0654..58e2594b4d 100644 --- a/unstructured/ingest/runner/azure.py +++ b/unstructured/ingest/runner/azure.py @@ -9,14 +9,14 @@ def azure( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, account_name: t.Optional[str], account_key: t.Optional[str], connection_string: t.Optional[str], remote_url: str, - recursive: bool, + verbose: bool = False, + recursive: bool = False, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/biomed.py b/unstructured/ingest/runner/biomed.py index 62e6bb1671..fe23aa34ca 100644 --- 
a/unstructured/ingest/runner/biomed.py +++ b/unstructured/ingest/runner/biomed.py @@ -13,13 +13,13 @@ def biomed( verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, - path: t.Optional[str], - api_id: t.Optional[str], - api_from: t.Optional[str], - api_until: t.Optional[str], max_retries: int, max_request_time: int, decay: float, + path: t.Optional[str] = None, + api_id: t.Optional[str] = None, + api_from: t.Optional[str] = None, + api_until: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/box.py b/unstructured/ingest/runner/box.py index 1856f075ca..20f066dfa3 100644 --- a/unstructured/ingest/runner/box.py +++ b/unstructured/ingest/runner/box.py @@ -9,12 +9,12 @@ def box( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, remote_url: str, - recursive: bool, - box_app_config: t.Optional[str], + verbose: bool = False, + recursive: bool = False, + box_app_config: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/confluence.py b/unstructured/ingest/runner/confluence.py index 64db4233c2..5192d07dc2 100644 --- a/unstructured/ingest/runner/confluence.py +++ b/unstructured/ingest/runner/confluence.py @@ -10,7 +10,6 @@ def confluence( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, url: str, @@ -18,6 +17,7 @@ def confluence( api_token: str, max_num_of_spaces: int, max_num_of_docs_from_each_space: int, + verbose: bool = False, spaces: t.Optional[t.List[str]] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, diff --git a/unstructured/ingest/runner/delta_table.py b/unstructured/ingest/runner/delta_table.py index a547831dbd..f19a4d9c4a 100644 --- a/unstructured/ingest/runner/delta_table.py +++ b/unstructured/ingest/runner/delta_table.py @@ -11,12 +11,12 @@ def 
delta_table( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, table_uri: t.Union[str, Path], version: t.Optional[int] = None, storage_options: t.Optional[str] = None, + verbose: bool = False, without_files: bool = False, columns: t.Optional[t.List[str]] = None, writer_type: t.Optional[str] = None, diff --git a/unstructured/ingest/runner/discord.py b/unstructured/ingest/runner/discord.py index d5a44a5086..de1a7d4cbb 100644 --- a/unstructured/ingest/runner/discord.py +++ b/unstructured/ingest/runner/discord.py @@ -10,12 +10,12 @@ def discord( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, channels: t.List[str], token: str, - period: t.Optional[int], + verbose: bool = False, + period: t.Optional[int] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/dropbox.py b/unstructured/ingest/runner/dropbox.py index bacb3b8127..e30ab36af3 100644 --- a/unstructured/ingest/runner/dropbox.py +++ b/unstructured/ingest/runner/dropbox.py @@ -9,12 +9,12 @@ def dropbox( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, remote_url: str, - recursive: bool, - token: t.Optional[str], + verbose: bool = False, + recursive: bool = False, + token: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/elasticsearch.py b/unstructured/ingest/runner/elasticsearch.py index cd02a2f638..8c5a511576 100644 --- a/unstructured/ingest/runner/elasticsearch.py +++ b/unstructured/ingest/runner/elasticsearch.py @@ -10,12 +10,12 @@ def elasticsearch( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, url: str, index_name: str, - jq_query: t.Optional[str], + verbose: bool = False, + jq_query: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git 
a/unstructured/ingest/runner/fsspec.py b/unstructured/ingest/runner/fsspec.py index f0260af409..7822b30140 100644 --- a/unstructured/ingest/runner/fsspec.py +++ b/unstructured/ingest/runner/fsspec.py @@ -11,11 +11,11 @@ def fsspec( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, remote_url: str, - recursive: bool, + verbose: bool = False, + recursive: bool = False, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/gcs.py b/unstructured/ingest/runner/gcs.py index eab4fb4bc6..a442a28916 100644 --- a/unstructured/ingest/runner/gcs.py +++ b/unstructured/ingest/runner/gcs.py @@ -9,12 +9,12 @@ def gcs( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, remote_url: str, - recursive: bool, - token: t.Optional[str], + verbose: bool = False, + recursive: bool = False, + token: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/github.py b/unstructured/ingest/runner/github.py index 4bbf09e5aa..ff726da597 100644 --- a/unstructured/ingest/runner/github.py +++ b/unstructured/ingest/runner/github.py @@ -10,13 +10,13 @@ def github( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, url: str, git_branch: str, - git_access_token: t.Optional[str], - git_file_glob: t.Optional[str], + verbose: bool = False, + git_access_token: t.Optional[str] = None, + git_file_glob: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/gitlab.py b/unstructured/ingest/runner/gitlab.py index 4d15385a98..a4e6d9b947 100644 --- a/unstructured/ingest/runner/gitlab.py +++ b/unstructured/ingest/runner/gitlab.py @@ -10,13 +10,13 @@ def gitlab( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, url: str, git_branch: str, - 
git_access_token: t.Optional[str], - git_file_glob: t.Optional[str], + verbose: bool = False, + git_access_token: t.Optional[str] = None, + git_file_glob: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/google_drive.py b/unstructured/ingest/runner/google_drive.py index 2f6f437086..27ad5979bd 100644 --- a/unstructured/ingest/runner/google_drive.py +++ b/unstructured/ingest/runner/google_drive.py @@ -10,13 +10,13 @@ def gdrive( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, service_account_key: str, - recursive: bool, drive_id: str, - extension: t.Optional[str], + verbose: bool = False, + recursive: bool = False, + extension: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/jira.py b/unstructured/ingest/runner/jira.py index bcecda323b..e9875e51ee 100644 --- a/unstructured/ingest/runner/jira.py +++ b/unstructured/ingest/runner/jira.py @@ -10,15 +10,15 @@ def jira( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, url: str, user_email: str, api_token: str, - projects: t.Optional[t.List[str]], - boards: t.Optional[t.List[str]], - issues: t.Optional[t.List[str]], + verbose: bool = False, + projects: t.Optional[t.List[str]] = None, + boards: t.Optional[t.List[str]] = None, + issues: t.Optional[t.List[str]] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/local.py b/unstructured/ingest/runner/local.py index 6278079324..a52ee598ec 100644 --- a/unstructured/ingest/runner/local.py +++ b/unstructured/ingest/runner/local.py @@ -8,12 +8,12 @@ def local( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, input_path: str, - recursive: bool, - file_glob: t.Optional[str], + verbose: bool = False, + recursive: bool = 
False, + file_glob: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/notion.py b/unstructured/ingest/runner/notion.py index 9bd10e9b03..7aa22e9c4e 100644 --- a/unstructured/ingest/runner/notion.py +++ b/unstructured/ingest/runner/notion.py @@ -11,11 +11,11 @@ def notion( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, api_key: str, - recursive: bool, + verbose: bool = False, + recursive: bool = False, page_ids: t.Optional[t.List[str]] = None, database_ids: t.Optional[t.List[str]] = None, writer_type: t.Optional[str] = None, diff --git a/unstructured/ingest/runner/onedrive.py b/unstructured/ingest/runner/onedrive.py index 3cee6b9467..abf3d18938 100644 --- a/unstructured/ingest/runner/onedrive.py +++ b/unstructured/ingest/runner/onedrive.py @@ -10,16 +10,16 @@ def onedrive( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, tenant: str, user_pname: str, client_id: str, client_cred: str, - authority_url: t.Optional[str], - path: t.Optional[str], - recursive: bool, + verbose: bool = False, + authority_url: t.Optional[str] = None, + path: t.Optional[str] = None, + recursive: bool = False, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/outlook.py b/unstructured/ingest/runner/outlook.py index 3592634bd0..d0613ce340 100644 --- a/unstructured/ingest/runner/outlook.py +++ b/unstructured/ingest/runner/outlook.py @@ -10,15 +10,15 @@ def outlook( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, user_email: str, - client_id: t.Optional[str], - client_cred: t.Optional[str], - tenant: t.Optional[str], - authority_url: t.Optional[str], - recursive: bool, + verbose: bool = False, + recursive: bool = False, + client_id: t.Optional[str] = None, + client_cred: t.Optional[str] = None, + tenant: t.Optional[str] = None, + 
authority_url: t.Optional[str] = None, outlook_folders: t.Optional[t.List[str]] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, diff --git a/unstructured/ingest/runner/reddit.py b/unstructured/ingest/runner/reddit.py index 2003723789..fea56f1f12 100644 --- a/unstructured/ingest/runner/reddit.py +++ b/unstructured/ingest/runner/reddit.py @@ -10,15 +10,15 @@ def reddit( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, subreddit_name: str, - client_id: t.Optional[str], - client_secret: t.Optional[str], user_agent: str, - search_query: t.Optional[str], num_posts: int, + verbose: bool = False, + client_id: t.Optional[str] = None, + client_secret: t.Optional[str] = None, + search_query: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/s3.py b/unstructured/ingest/runner/s3.py index 45f27ce43d..292270e50a 100644 --- a/unstructured/ingest/runner/s3.py +++ b/unstructured/ingest/runner/s3.py @@ -9,12 +9,12 @@ def s3( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, remote_url: str, - recursive: bool, - anonymous: bool, + verbose: bool = False, + recursive: bool = False, + anonymous: bool = False, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/salesforce.py b/unstructured/ingest/runner/salesforce.py index ad0f050ed5..415d9be79b 100644 --- a/unstructured/ingest/runner/salesforce.py +++ b/unstructured/ingest/runner/salesforce.py @@ -10,14 +10,14 @@ def salesforce( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, - recursive: bool, username: str, consumer_key: str, private_key_path: str, categories: t.List[str], + verbose: bool = False, + recursive: bool = False, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git 
a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py index d5ab2ec940..1781e2cbd9 100644 --- a/unstructured/ingest/runner/sharepoint.py +++ b/unstructured/ingest/runner/sharepoint.py @@ -14,9 +14,9 @@ def run( site: str, client_id: str, client_cred: str, - files_only: bool, path: str, - recursive: bool, + files_only: bool = False, + recursive: bool = False, **kwargs, ): writer_kwargs = self.writer_kwargs if self.writer_kwargs else {} diff --git a/unstructured/ingest/runner/slack.py b/unstructured/ingest/runner/slack.py index d2c61e9faf..0b9919c216 100644 --- a/unstructured/ingest/runner/slack.py +++ b/unstructured/ingest/runner/slack.py @@ -10,13 +10,13 @@ def slack( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, channels: t.List[str], token: str, - start_date: t.Optional[str], - end_date: t.Optional[str], + verbose: bool = False, + start_date: t.Optional[str] = None, + end_date: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, diff --git a/unstructured/ingest/runner/wikipedia.py b/unstructured/ingest/runner/wikipedia.py index 44a031cd60..8914042cad 100644 --- a/unstructured/ingest/runner/wikipedia.py +++ b/unstructured/ingest/runner/wikipedia.py @@ -10,11 +10,11 @@ def wikipedia( - verbose: bool, read_config: ReadConfig, partition_config: PartitionConfig, page_title: str, - auto_suggest: bool, + verbose: bool = False, + auto_suggest: bool = False, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, From b2e997635fef2bd1855c7fc136b0393660f0e4d3 Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:39:33 -0400 Subject: [PATCH 25/31] roman/es ingest test fixes (#1610) ### Description update elasticsearch docker setup to use docker-compose Would close out https://github.com/Unstructured-IO/unstructured/issues/1609 --- .github/workflows/ci.yml | 4 ++ 
.../ingest-test-fixtures-update-pr.yml | 6 ++- CHANGELOG.md | 2 +- .../create-and-check-es.sh | 39 ++++--------------- .../create_and_fill_es.py | 2 + .../docker-compose.yaml | 15 +++++++ .../test-ingest-elasticsearch.sh | 6 +-- unstructured/__version__.py | 2 +- 8 files changed, 38 insertions(+), 38 deletions(-) mode change 100644 => 100755 scripts/elasticsearch-test-helpers/create_and_fill_es.py create mode 100644 scripts/elasticsearch-test-helpers/docker-compose.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9d4f066fec..e475ec9f02 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -255,6 +255,10 @@ jobs: source .venv/bin/activate mkdir "$NLTK_DATA" make install-ci + - name: Setup docker-compose + uses: KengoTODA/actions-setup-docker-compose@v1 + with: + version: '2.22.0' - name: Test Ingest (unit) run: | source .venv/bin/activate diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 724a893128..499a1f7593 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -9,7 +9,7 @@ env: jobs: setup: - runs-on: ubuntu-latest + runs-on: ubuntu-latest-m if: | github.event_name == 'workflow_dispatch' || (github.event_name == 'push' && contains(github.event.head_commit.message, 'ingest-test-fixtures-update')) @@ -56,6 +56,10 @@ jobs: source .venv/bin/activate mkdir "$NLTK_DATA" make install-ci + - name: Setup docker-compose + uses: KengoTODA/actions-setup-docker-compose@v1 + with: + version: '2.22.0' - name: Update test fixtures env: AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d54f444db..7324722691 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev5 +## 0.10.19-dev6 ### Enhancements diff --git a/scripts/elasticsearch-test-helpers/create-and-check-es.sh 
b/scripts/elasticsearch-test-helpers/create-and-check-es.sh index 44fca2f7d3..dc06c21a16 100755 --- a/scripts/elasticsearch-test-helpers/create-and-check-es.sh +++ b/scripts/elasticsearch-test-helpers/create-and-check-es.sh @@ -1,37 +1,14 @@ #!/usr/bin/env bash -SCRIPT_DIR=$(dirname "$(realpath "$0")") - -# Create the Elasticsearch cluster and get the container id -docker run -d --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" -e "discovery.type=single-node" --name es-test docker.elastic.co/elasticsearch/elasticsearch:8.7.0 +set -e -# Wait for Elasticsearch container to start -echo "Waiting for Elasticsearch container to start..." -sleep 1 - -url="http://localhost:9200/_cluster/health?wait_for_status=green&timeout=50s" -status_code=0 -retry_count=0 -max_retries=6 +SCRIPT_DIR=$(dirname "$(realpath "$0")") -# Check the cluster status repeatedly until it becomes live or maximum retries are reached -while [ "$status_code" -ne 200 ] && [ "$retry_count" -lt "$max_retries" ]; do - # Send a GET request to the cluster health API - response=$(curl -s -o /dev/null -w "%{http_code}" "$url") - status_code="$response" +# Create the Elasticsearch cluster +docker compose version +docker compose -f "$SCRIPT_DIR"/docker-compose.yaml up --wait +docker compose -f "$SCRIPT_DIR"/docker-compose.yaml ps - # Process the files only when the Elasticsearch cluster is live - if [ "$status_code" -eq 200 ]; then - echo "Cluster is live." - python "$SCRIPT_DIR/create_and_fill_es.py" - else - ((retry_count++)) - echo "Cluster is not available. Retrying in 5 seconds... (Attempt $retry_count)" - sleep 5 - fi -done -# If the cluster has not become live, exit after a certain number of tries -if [ "$status_code" -ne 200 ]; then - echo "Cluster took an unusually long time to create (>25 seconds). Expected time is around 10 seconds. Exiting." -fi +echo "Cluster is live." 
+"$SCRIPT_DIR"/create_and_fill_es.py diff --git a/scripts/elasticsearch-test-helpers/create_and_fill_es.py b/scripts/elasticsearch-test-helpers/create_and_fill_es.py old mode 100644 new mode 100755 index 796e2187a8..a761255741 --- a/scripts/elasticsearch-test-helpers/create_and_fill_es.py +++ b/scripts/elasticsearch-test-helpers/create_and_fill_es.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import pandas as pd from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk diff --git a/scripts/elasticsearch-test-helpers/docker-compose.yaml b/scripts/elasticsearch-test-helpers/docker-compose.yaml new file mode 100644 index 0000000000..47cb93ae1f --- /dev/null +++ b/scripts/elasticsearch-test-helpers/docker-compose.yaml @@ -0,0 +1,15 @@ +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0 + container_name: es-test + ports: + - 9200:9200 + - 9300:9300 + environment: + - xpack.security.enabled=false + - discovery.type=single-node + healthcheck: + test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"] + interval: 30s + timeout: 30s + retries: 3 diff --git a/test_unstructured_ingest/test-ingest-elasticsearch.sh b/test_unstructured_ingest/test-ingest-elasticsearch.sh index 530ddf1bed..7b181f90ba 100755 --- a/test_unstructured_ingest/test-ingest-elasticsearch.sh +++ b/test_unstructured_ingest/test-ingest-elasticsearch.sh @@ -16,10 +16,8 @@ source "$SCRIPT_DIR"/cleanup.sh function cleanup() { # Kill the container so the script can be repeatedly run using the same ports - if docker ps --filter "name=es-test"; then - echo "Stopping Elasticsearch Docker container" - docker stop es-test - fi + echo "Stopping Elasticsearch Docker container" + docker-compose -f scripts/elasticsearch-test-helpers/docker-compose.yaml down --remove-orphans -v cleanup_dir "$OUTPUT_DIR" if [ "$CI" == "true" ]; then diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ea97a53bc8..5f8fd628c9 100644 
--- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev5" # pragma: no cover +__version__ = "0.10.19-dev6" # pragma: no cover From d6efd52b4b1a2cb0df649a126ed507fb9f8d470f Mon Sep 17 00:00:00 2001 From: Klaijan Date: Tue, 3 Oct 2023 11:25:20 -0400 Subject: [PATCH 26/31] fix: isalnum referenced before assignment (#1586) **Executive Summary** Fix bug on the `get_word_bounding_box_from_element` function that prevent `partition_pdf` to run. **Technical Details** - The function originally first define `isalnum` on the first index. Now switched to conditional on flag value. --- CHANGELOG.md | 7 +- example-docs/interface-config-guide-p93.pdf | Bin 0 -> 104337 bytes .../partition/pdf-image/test_pdf.py | 14 +- unstructured/partition/pdf.py | 131 +++++++++++++++++- 4 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 example-docs/interface-config-guide-p93.pdf diff --git a/CHANGELOG.md b/CHANGELOG.md index 7324722691..a37fea2809 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. +### Features + +### Fixes + +* **Fixes partition_pdf is_alnum reference bug** Problem: The `partition_pdf` when attempt to get bounding box from element experienced a reference before assignment error when the first object is not text extractable. Fix: Switched to a flag when the condition is met. Importance: Crucial to be able to partition with pdf. + ## 0.10.17-dev3 ### Enhancements @@ -20,7 +26,6 @@ Fix: Updated code to deal with these cases. 
Importance: This will ensure the correctness when partitioning HTML and Markdown documents. - ## 0.10.18 ### Enhancements diff --git a/example-docs/interface-config-guide-p93.pdf b/example-docs/interface-config-guide-p93.pdf new file mode 100644 index 0000000000000000000000000000000000000000..db41a7cae4a9ae68e4d4445e61c754cb8b96c10f GIT binary patch literal 104337 zcmcF~1y~);k~R_`Kmx%fA-D!T2iM^45Zv9}-QC^YU4mP1cL?qY8r=0ikT3UsyLaW^ zeVzq|>FKWSuBxf2uI{&+Sc*@8ijEoxN8J76^v8a|S>||mFB}k$7SBT47>=DCk4C`A z)K5Z z2Zr(!uJgC?5IP?%`(CITOnt5TW%wx$&ATV9QCPiv83nB363e7-^y2cnNLUdlNH>9E z4-0PJ9~Ry&e^BGHi#c0nKCim_wSB0u>|xcnzB2Ciqsh|%ebw!L^5NmDN`1-wI&VAl z_f~hOn_HEDJ`FCrr^83aH}3bI?Z-pMNKXm(eTdhaf=g+WP*-QP%&PAlyYfQZdyl{| zjw#Yz=XYE?`PqLEn0vljUD`d>ch(_|QHfJH_U(ILlogi0i8iS^C28%`8?ofc6#?Vc z&y$&T3L|`dz5au1d7rD;3i@^i~mnwYI=+uj+PFKOn*7JS!{;bieZTmLwNS6_U6A zP@oJa63$#@YNfp^&>ipYMW@l;5$qm<;_aaA{&eJV4K|m4yh?Nkz*O7rqi*2rwadZX zV?Nh8UHFV^u;caXYVDkyALH~L3E(u#r0F!u4;AJUy1wEVDYZJ^U^0Aw&`=;kI|tbD!FiP7{(W~x9fY=Z~+ttL;PfGsR75(K^@@y@vRd)VPU6qy-&%SR(B6L zfNS3T*haEj41`2(A##&5u{?U@Ys#dd4H2P%R0D`pzIFJz6E5Nt-r;uF(vGh^QwP0M zWZ_|$QK=px3c+WDbcCOAQ@PZfiaNQSLUvWG9?N&{lkN}ir$Pp3|!EjeUv5A1S1j&D^z{}!k$D;O|1?%I4+-Yx%8aBz4hs4Qq94A zFr2D*&#rb=*5aleqU4t!aEEV}xHQVLdfF^2>nn5dy*sJ1){gqI4_! 
z#y@;qW17%YAx5+XW|n!c)nZ(*)Cqxm^}WfIrj^eE@gsey_`XZSwAo++2FmGl-1b(LI^A>hQ zO!Sq`6XcU0wV&Q9@)*>~RxTk{>$*3+9DF-s(KRzeBvj-8tb|9;(#>{5&62(`x)(Rq z7PttKOjE3*w;0C`)7u4v+BrrdbSvYJd^HC!Qb{e~Gel~6+~}(m*77FYPMNxw-JK#yalc=*2n@T^>sHn=X9jHsAa#vfNix{_MNt%av;Rn1=0OCT4K0B zJ3twXc?Q%XynfV20_8*rB=HUbQCW6-wgJMY{5+#K*7&R@{i~IXtC=gthOo;b$+#guKh~;mKRl=u>FXQ8pdoInvSGLL<*hO_z@ND<8v;#4phm|GP<%_z!#UdO zVlgks_fFnEC#=%3v?;2_PlrjZfh1x0&B>fNhEdmCiq)C?3avk}7{qR4Kl%@x)0Bl6 zn>y-Jm3KU~$3|a~fT{5dVZu{oI$Xu;C?zZClw~{4MoDvX8plE@#uOh-+jgs>ORmDq ztOlAxu8rbwE*L4|ORMibQDW&`DJWL|Ksch=D5x4*gKW|)#+`3^!>QU$jBxzceT&6V zzUfWsEkhJv;S?W}Kk=A!$?R9Ud}b~P5o~dt@4b=|N7PudsTtJHE4|a}6*5&b0t!WQ ziLCdo6QYdH6V`P8HO@%J$ns=LBjMScU$@P|PWPc=Lvs+?wG+z_8cdN`GFv_$5ib&` zVQLamO=tNJA@6m1st3;2(2bHqQH~VRhh4uRoB!}NJzL)X98;gcbwQvK917KEwgk8V z)I_6bIU}RaBy*k-u%vUYm79UBzy*v%>xsN<+s-;qP+L&M#llF>ug=9Ra%FXS4G@gC*zwRa)0ief3LBUi|cr_eF-Hduf1gtx2kaezS!s1ts@zeln8$ zY3uC=f<`lzKZ3={U2?D#r~GgOD4GbWQSH4Nd1n$(t);)&v828JV$ynGoF`n(sY0A5 zDQ}imQu>-ktbt)Y4xUIl+P;_rtstY*4N2OCBJI;6ZT>1@HB*dg7gVZkjaLVJ ze2tMf`VM7qI}J|7(DBH88@c_G9hJ(6lHc{(-={&)5MaAEV{8q?>l`k0arNOe@AWRW z(AiatjqoZ}T{Q~S`xPtiz#)|KxO(!zK`FX((1zGR^dE2t5HigWseZiHWmpILaU0AB z*_eK_m?25Hohznv1@c;nD~s&;Dp3xt!dB_13Y|_3DZLI(ng6!WBSrqk&Gx;qOs(+V z`%#(PQA4C$)N>@kn~3_*UO|*$II=BG7J z84Pg5Fth8G>?g=fd&$>ikBi(F&}$mS$TFB5n(OEI4d8qe@vBvs+q8(YndCRjVE4ox zaH`hhk%mcoK6%JV?>rBLBO(l&Wm2Z84Zc<_D9Ni-!8lL>M_<1KEv(l=tXO8Oxmm7&t4AG8^R`+oQgS90qB8`gv91_W`6EZsXChBs5^XS?PYq9LM&^$S!$Lmeu_+g_+ozs#Dd3rCTAnb`7tz0QDS9h}0uBAwv2fu-)X@D3IZFM9z*|x9`d%jN)eCYJ2Efc& zKOs7VErJ*xdzAht?HF|sYddf{Y>2tgEL9ryV={eOnFwLI0=*eI5)|6UsXVxhx?R_4 ze2K(8!zp@+3{8(>75e1fA8f<@-P}1wwQPtHN%YAd$T0Kqt5~yjf;e?EgJz;YK7#on zbh*-pqS;E=vb8f>+P*aW71|klOJAm>oLuJPmv8Jj8D9CkH(lAR(bA3|nHY*E-;-{R z=XGvGlC({nFQLDiQ?uxm`bicdf%C>sM9Z(xEPPCBS?^2hKttxb)JEXQ5M05*nPPXJ zyAS`5>7A=3%Xf0e??oRPHb0XZhmIn^6Yn>>pq`%QI>*YpeyFwOo{m45NZF% zotV=i7%sz`K_}#LVD&vn`K_O6Q?VQQ=5deMbufW9o32KV6H?gfw)#hH;o8P{c1kRb&`%o>!5KorWh z#}SPEMa@;Er@1fRxd7UzB&%|lw)kXvPrCn3u}f+)h)xK`6lppL+r@1uiMFK8A~m8@ 
zc!GnZoZ-amLV77>ZRT5&l|XO`3bnIa-RP?^#P4pan0XMhO4R;Q0%_U6EK*LL&+E{B zh1i30zhJ@rC*4*J$;_pt&NvgU-vCj(3cw4&f6N+f*!8?Zz?Xpxwo<0f47PGVvh_w8 zs#;gRa9a$d!8_X2L;`82?`{cp;SR)$VPRwgY z-YRI(E4R(h1b~yGHVd%{q-Jnx*VXuJLS$ToweKAk?}aS6z*teSrE{`qCpZe{r-Nn1 zY&G6}6~`4cg-x_gTR;yXtS&md>-~WCV~u%4>PxCNsZ!yF z3d;{^bZqI(K$-;gwXQdac^G*->U#?dU(H<7`m+x6DUL<#v{lShC<5k(1}D|xAF7nY zptvb(qcZHgqNb0g^-QS~s(#r~QKAhhXw}IXb(Q3HC(Kh9CM4G~A?$4#j;|BqW>S)s+kbP$+&^Q2NK!LpZz`Pd{{i z`~f*zvN%Um`0pJ_8;*VkPG!d`a<0+7KeO6u4$UTG0o+BjW_2XVTnqa}+Ex1ato4hH zv-tCJ2zp|4Tc*?cy4lsaKU;P#U`LRerm-J!wO4asqXRYuOhxGEI2dF4%Y^B49ygPc z2XUZ!+EWzUVCKKn7ATWp zuRaJztd0^(87y}AWiHyneGPhOu^vn|*5RgykNf_kmOXqGl=Nr&9TU=0+pO8?j1U~@ zsrOZu6K_&Xx!!|kqUsQ&Y*j0#=VoXt0o+bkwI3mUM^EVX1+#^$Vgr8B<>Qc2a(}yt zP0=O;^Nr! z4WEk&yIGM=2_{DyJXx?45=inim3ZwWRi;RPN4vtTs93Kxq)^f!=lT`;Rd@U$j0)i= zC69vCSS$>BU?qyvbrob-Az{#4H&~8bjq!v`^-_L*p_bubvdo?s>zKD3ZEYH4!!ora z9(DRfBSaE1V3^H^(pDa$Nput$fD;n;KvP|GTr3v|;$Pt~WArTN4jOe@$WHz|fpA(3 zuswTD&bJ`9=0_`4H_BSB z3>zVU5CKmPiNyNA4}&plkp7E)tIJ(9!4AFJcbbvA=mv=~e3^Y8j8i$B)9xJV@LdgY zzh}Y*33ny+@iQ2aPaRWGmFkt$#s=Ypp%*(WwN5O?9?Y^t)jX%#LjE2O6qEnzIMGl=JcDpNE% z8MJ;EUEF&J7X<8Oz+Y~4+^NKDgh}V3AVSx+S8yh>pX$O9ae{DW4yHQtGd7=}P}?oP zt@38c94fW6p8_MJnDzRm^zHrph!&xFYVENNleVU~o$_>`)fCv=XvmNJy1^wiEjAY-V%Js5)%yVm8~~0ar?b*9D=k=f`UVvwb1+WcKaKW zL}(LLZjcJ~@7qnprjc|c^LY;R5odFlcfIF09$YQ~a>FYJ)tROn5c;YGt?OV*xTrJq z4mFYo6GMsThO^Y$Q!N9VDMWMdrHZoTn^QqrCFRgH&9{A*8U2zU8R=YVhIZ&!GErnd z;At;5t`>ZrHM?>DRmq^!yXUUc8Q`Wg>Z~wtlp=|(%Nk_yaB0(bLsVKZATzv4uE!hT z=?k@gHi>+gSqv87-R{2I_Q*J*mBXD!z>G48hTt+!9WsonEHY9q0iU*QB^^jE-;*?_ z#AEtVP|-+qm(FGOm?;fYD<^26s6nNByrLVdY$T~9RECw1NROilW zvX!mSH+q=P+;5=A1D9v9npCe>Bvr&dp4$$A6*BW0sir7W{d9PO6qeJecTg4$x4F3mh@S4bT;_Sv{VxWd z8~n3YFJ*0^BdceNrvl=FPXLcbPS4R6PYsWT*TU4oTGmoaM-T6LNuSq-4v+Dr8Bh|` z(_bxu*7!kOfhgc;1nKY?eiH}+@R`S^&^9gGl}(@Zs-x!!IQTIXMLk&a0JO zKLGHY@tj0{5Ny&VhCoeDQ-3Fv)>cZI2-E8|0f9IYA1bXTJ!()kK`@lG)Mx$?Jnqjy z-^jAFPset6qc7U^A84LZ16m_ z0A96!A^L(G00qXkDY&6@AV-3Gw6&@D1zEIX4AxR#*%}+Hk_LCG;r{I@9T&8r^r~e% zSf6Vbv4#!AJ28!Do%Fw`Q 
z)xkYuA9{*#pg-F(fvLH9SV{MR?@&X2z9k2yrJR&~6rs%m2M>h6jsy?>jAjVVq4{Rm z3#}4zv;*?m6YL!vBP2;6l&~i};2UElJgT?06f}Z2wiIIWXLv&xgU?LNh}1qNYv>K| zy&XhrURcM-l->@SP~LcyeEx-zZ*pH1@is-$b$goP8T7-S2JmFPCWGGiygSZl$RzhB zj+a3SavI{{RhkbqFHC=gn4u-SY}Ozg zaaEz1It$jQuK}*`AAOvTMIB+mUn52F#UgR|dNC7l3SlCLSw`UqA;|>M6Z{gQ0R%4b zI`%`Hg8OkxOWVeN9EAJf564F*ON%Q8W>0m|h1nYU(5U2cglC_biOBuD&(ofyG7azWDAn- zp6cH3Cf|TT_RHn9Ci07!6uuK86-p376t?Kst%jZt{~#bictltqSrM5Ly&59Lf6l+e zuSnoT@RdM~AcvrW?<9LK`!YL)--hoK|5)$}lG?kj5TihrVE*7=?@G|vNEz|#h$PUM zkgt#=k!g@D(Ax1i1;5GAlZ~RH=XqL(TKfUS`3sBX1Sw*gQWp4HMbfgg43p^{l-bJ? zFA2>O>&UpHXro?7_eU4THQ;@V;Ui)oS|&yz79}1N3Kw;cm5HoQ&=&m_XPkf?ryL(2 zQx~HziXrSE6d~*pn>EBdMAUzfup9405E4BTbrCZX%R%5GZ_gc0VnEy%`ze}@@Ji@o z{P(Dc2|GE06fZfThb2|D#Eo+?Qz^`K(i9Ml52EL-VaC^c_$ zNy%J=37DOb!P7zW!QH{PgI$B4D3U3>C~hclC?w@zi{H!`&cMyM&+s!4Ggy2zGd44h zm_FHibD?;_yXUhvgq@824cmpjiQy-G9>X$&4gDfLu1VBa)2}kM*k2D#bZW5{U>5?u z9_@9o=Cf9@er0vA)lvg4GcR{F5t@vfP|jM;(w8TV6YkFM>+O8q72g%zbJ^YAci3+S zNC961=PYOqk9)?aX(uWNEGGlCx3pp{rcKTZs>11}8fLE_l~0uuzB-w>%(gL3FleQG zpmWj`VYpyi)>*K)ogZEbHJLSMv?$mAsHJYe`7M*q6^l4@AKefhCWJlY3>}wpgmQ{< zOz}}G&mc~7#+adA!Uo4hZK}!01Z`hvAKE1S?J`N{&%(E3U*;Jn>n1Lf(u#40%ID2z z)fYKe9%COq!SKSA^A>bf=j3hL_lZgb#+1lLO;rMWSjHQBZ;C>KMMFhXhO$eFOv?9( znQBuC7TxP%9iSZYT|(}%0~-76Bfur2x1q2 zBbvv+Gf3N#+sND6ueI)7AB-Q;@7~{7oZ&8gUar*R-S>9@M*zq26Sx*Q8z>pY85I^KDt#zzCmkXt_!Ix9Xgp4dH^W$+enO2wPFc=S z&bD@`4np^E-$L(~Ug~a^-kI*!aEFkJ2n~s2VdkjHWX#mLC|}WFA=bVuOJrt96|{PN zGaTVracZ%jd5ZZT#FVz2ty5&Prp1dmjOZUFoypWbJv5&ey-5jO2)#<;PBN(Ju3=W< zooAhQsWGrowGQ9TwoYn%Sd6XzwZQmw+1;fJ!BB8I{E>a8#k(abt(;Brkl;XM&W>%B zww?Xy*Wuc7D-;|Q8+?Ad%|m&aRBogSb5))?UDj+r*u}=!i%}tkxj2-sYXzRR$#n%@}zG=bBq)iMP76+{xuyyblwF zIK@695<(vChP;#fgcX7iYM`qAz5~9QhYi5lYcX?fwwW_+*bl!+ zOK(PZa1ophI}ck3S8v<4Jz8+Px~uPB;%>_PREc=*Xk$3)x};0v4!qRs-wwioyUN~?rw9g z!>1wsBfXW3`;KecYr5;r1CgJk2L;7r3mGo$j_yt^$+b-s(-1sVc>nI-Ih1dk=Qzf@ zTL>PF&pYt1-MpT3G}iVx5H#N6LGW9eLU?+rg0+}KN~uA53eZ2fP_V|n%ZmE;a%c>cR@k;*8uuC5d9rL(g^eN@@U!U>EgY_iZXcr 
zj2)kY(LX`11RmXg5IYK5YdQTRcmx8O8UFj=F;4+TRaRkcAFFhr(Qb9tjfdLp}!zXZ*TRXXI$UT+!Av_@tRJc6WPV;=&CV4PBo=}ARxrKOJp?%^ zH8plVo^TtVP+Q-l`U)Q=dE+6^A%p#a$AL6dCj{p(D^wYuRL2y*%UUA{DNXau={ zw_CGkaCt=3kfjMRvG3kEq76&qs>5sRB2mY_*3;(vq}uyQ%kS*;)REFb!Qo3PC6{dK z9paamkVmfe7F8A_rkfhieD%l}OehP5%66U(c>F4wHC|VBLjp~UjtBmYSj-OY*V3lB zezM45f)MEaHqk7eY2ZxygPqafFko$;q6H%zk$D=Vi^#~)Dq+`lQt!#E%h1-u22v9h zJ2K&)yfxmXBFuqXrU>;D+vr1H2T^i)rEUhk`-b|Omj~Pj$G5&AGKD_$6MQhTFw30q z5*Zp)jJ-E?u`?qL0HG2`lTFKiK>VPmXT&ER<5QhfB zNg?#tfL*zMAidjyvkrrg?o{L)^lfacwB{S-ftcfNj9wzf#hxd0IFvsa#+}`P!}b|z z7qG^}$a2TT08b8XLXFn0q(!{_h8NeRh`u(VyyPW(NG73GHVJK=Iz7o$N{svEHEL7r zz7SVj9hA=qvK*XFt{L1aEU&k?nD0+PK9t8-Dp?>#L?T0YIF<0p{Ejg)F0>>HCzKxrr8OTH#}+ z6;J9thCQ@B-o3Zo-d8l!62>%{F4K3%OE-pN1s@_)k$qGLV2Ky6G0PK$t0N(!2IND3 za?|&V!J;|A1=z6%T7dIOBD4_N{DOFcnHEy@2H)X5hXjD9hy(so{*sd}y&^L54gN)p z>5Yr)BMqtU^i6+kDIiM3U?$-$Y~J8~R3i#?m=l)IVA?#){KJ9T!NUQ@K|nAW*=b)} z&tur~zzR5_(d90C*BwVRp;4wt1rR~mcjIv&MrA(xWYmKfY z(SW2{=v)=ZZ}}RLzz{Pg&Jw_`izC~H>j*NSEC@YRALHUt2KQhT`OW&1%r6D#kXyNTO1*&KI0*dJrA(JowXr7ei1D0 zwn8DaLrv#?%p^jS>*8k5GWCAMwwf$Ui$H#1vbQ4;`+Sk2SlxARsJP<_vCDY&C-iG^ ze(}65M8e?_oS?JCe(_a1k=o}0^E=upKg^4rOVN0ho3{lgQO6xHE9=?O8 z%8(LVvm^Tvo)rgt?6OmYpVdMg<1r z=t1&JwMBjnOauDoX4)sIi+zo24pgj+Y+E zAZVYDiSBbtfw7c$k%XKEMFMMBGJl$2S?;XVevx3|cBb)_l*XhXJ>;4n#xZeNym*SF zio~J_+h;cZW)T;@_qkKMy|$!HTpaPEIT(|lc2jolc1d@??{J)df5YGp)s=?SD`X&q z`6Cp&SG^aum!VguH!?UrxGcCrq%`<*9-m&`Aa&fAxFHG-3f|&{;>+U4V&UTXVo^D! 
zV#eZh+183%9&zyzkr6>Eb@9?VMYEg*?yqOY%f=SdcGEaBJ~IY|Lo*76M_;(|+yx)} z{QdpY1R@2R1hRVJ!l1)|*oX}3K)a-JF>2Wk)$UR~M!E{R45MPh1!HS^1EYB3S;LRB zEv54ndD(geXr(<<%7yYJGud{cCQ>R2i{flw*d#$K$?Cu07g||uaoEzNaJ(e^MkUgK ztP!?WY9`B?YHf!GP0hAryO%o?=LZ)J&hySQ&Z|3WlkdmBPh{j~WNVaalz6DN4}mdN z0@tv&uuJKgbuFjX%CeI1`=ksThAStIlh;4xSP5_o@Z?bBKnkP@ScEC{X7#f4mV`xM zbJ609eGuOe|CwY8WHFfJlCVf!WqxcZwhHK7X2?rqO(sbkWm+~`&|)>zH%B)oVOZ3) z?=4+j>R|6@Uvy2ofj#4dR|^4yuj*k9i9wfCyro3=l<+mmE}lWt$QVnDynac`X~4RF zaD`&Ncxlgb2GT^{#A07;-)`1y7LpO2QPynG^m2abt9;c(9ooEkrCGh3o`FSzasBjR zp;|?26vy|rY4?l=H<-5pp92~Z>=2lGr+Qr?#<1G$?$HE;a{_Yut7cECQF;cZgb!+G zc8h^C^LQ*+NlVkmIoCNBebV4p>s`!Q z&7;hJn3tJ*G_WTzXrGIX-I|DA2#z1s4$ zd?4b!?B3~-?Y)Y~D9O zMyW@^6O0KI2@(sKh2OF)&?%!Yv&I1!toIg*u5?Uwi0A}$q$!!~%?^B+?;0AdR{N$l zHaAQ*J;W_V$wUn$r^Qp#8kkP>CS!Bsgs-A;qJIpSktdS_Ww2$`^Zawmg?}Y8YAtvh zVN8ilb{mcCo}b5_-=2Tl&6@fyd6cwHmt);Kl5`0SF;Gv`1AKCrJ(*9gajHQykOIm8 zFZEkCdrtQpeQX+RKympR%o)>h2Aub+{a0>09_!fan_^GH3`wm@k4h#rw|rjN*wXFH ziY-d_>}&)sE|(>#bE(TLxYn{Zh+A~0fKz26*60rd^R8$28}}tLq(*tQCgn#<7d9}u z($%*RrJDNOHd!C&sNu+pshl}z^j6lje`yO_2?ps5R7dyaY2|5*k^Vf} zu8&iSry}21?@+4L!LPF%?X2UU4WU54phVDUF$mg=P9^4ws~P$p`*fqUXHbTwj~4Rj zqua;Fn>G>sywcNInfAQJcD%NC9#%&4-FhkUtrf^}4~iq@m0XohsLM9}oLDF_Ntvyhl`p%qc{sShFeaHXUCnIbZp$*! zIq&@`&Y1GW%(-5vwz0-upgaCLzlhxDT3TCan8TWKy3juzS*+Jz+H0a}Sau=38a->~cL+G7@(*#F zfh>bzKorNFeK5IiN3eCW$2mgSJ-lW)*>;Jzj!1u$IUZ_G57uCCFLN?IX~_ECdZnnL zxEe<~m2u7<$BDC7aW8&XPc`---R;U{{0Ikf3feh+DBa^)=LUWgqxmWI++b02X{{^4 zi6%f@zLn9vbvx`T<+8rW0vB<_MbM-Eq~gTmYg>e~#}n^L0OAugEaJ8HzfR$v0fc{< z!u`e%{xOAn1_l1i;Qon|*!*Y6$1`SO2LdBNQ$0~5T^kUh@f%!F`y28>_h;((jA{IC zlUK`D%hbZ)Hwa_%7y9xyu;T9r8-CY#fmNOX2%Bfvgzh&MLZhgpjK{!8&By{mM`#(T zSs3Z?%qWN{%8@T<=>#(KUMjngFof}PRsuaJpNlP1O777znL;UkcAou zVEVf$1Aaq#G$1VlUkvNN&@w$8$aX;%Ne^J627qjto{5 zZT_p{eX00|y?+P({@qi)7}g)U|2r}Ma-M%34*MU7@m%o_d;f(PFK_M--T#>wz`q>N z|5%KdihtPqFU0t}>-aqgz2NTuw4LWn5aQ2o4mxaSWDXk5gmpoC1&nN+sDxfdDjg8? 
zuVrrgTnjWZ0$;o%$e^CrPJg$={rto8iQC#p%T!zrG&owwn;X5H+ynmhYyUL~FlKS=P)hJ6$ zZmqv-WKI&>RjSj0Izk4*R7277mEB>uk!%<9g-O$B{a4Yw*wIZnqnz8S#X*npCeda& zwj|dUje1)av98j~JCUMg#3bwG*=FbKxv_$Ce3R8#&azG0FNp2x;ZouCdN*m*MhkLk z3}Jgyct6@|hrVA%Pb0|UBDT8X*5Bm}JL8_5wGk2H47?eMZ)%kMp7(2VL1J6(m&pyo z-Pox~G5y7p>m`Fhn*Y^$(?HB;S*AiwJg@u$U3@P_8Mw&LDkK6znP(p^UcrWjPXdW( z*`Q{0#5v>rcEUJ}NY zG$HvU@IzQ`f2OqhsFyDj_1K>Ws-BEFjCH%B1JSH9!Uh~Qyus37*C`F_W zL7S=wUq>}^CZ@OoKVIX~$kYgDA&3PPfgakzYpr}rCT05Ljtm|nSvj1$?Ry1=;*&=f z)doH>FKWj?edeoV;-I6nCCC8|QKB5%NO{MUhMpj zrT&NC4X-fYv;9Ap0YLutKm8m)bh1u1wt8m5=K2;dhg@a!42(by*9nh|8?>uO4-{=l zKF4#Y1OByWc+Y>J_CQ_yZtYKJ&-^}DRl{SVhhw6DNk8)g(ia0L!hSZ8XKQ#-@V}bH z-(JDXE0NGL1MM^V_l^;CsM+9oPu2^`?*W>M5yTJ`5WoP6v_bOF(E?dOzEv1BuQ1Z# zHa9T+y&1~}%J71&3;jdt1Fxio{T@O zF|K|Z8(vKCeZ1qUZa#8bZE*sg>9?J=8t{RjN)Bl|c3V!$!8cs^QJB#%?vG1!oq-vAtYGQZ{4?v4E z0e--yqy8{}=z4EPS#|82T~D?Adq&PP;U1fY6`|K|l{y#Mez|-77{1q>Jr{%}=;o?9 z!wL>ar@`U(L7ys_=Ej?#s$zKF8 zLQFR-uu;NWNYzQMA*RG|Ndd8o13lOMLQm*nTg}2$PH!GAFp-Hqo;$ssYc@w|8HD*k?dmv#!@nZi~(AN*iH(>QdH z7^$FPMI`sGXn!BQ7*hy}^dQQO>LaankMMgXfsZ6GyN0atWHH;@1=|skczbh!GPndT z{hY4O{fGn2)8rY$TuEHQ1iAjH5m&n;?b)r`9;;ywBRfo3t@NSv8>7ajgr~q$Xz@v0 z_!7MWb`i!clT(W*Ff3u^@r8nuEx~gS-~E$DZdd-~Xs(2Y37V_Q6%sCDk8EjWQG>}+ zS}{c`R|QqUgF-7J&X57?^D7^+2e;&`yH^FrP1nBXZo8lI8lDVS!jObxEMkr#ab+=9 z=()d7rR*%8({%D_c^#Xrk8RZF8sx3cp>LGY5SH|(ZQX5o6h?&SU`&Ya0-BP{^RC7N zc93qNIjF_n{e1n&Z>&deL-{0NvhjL2epZxesD8kKtnY6768G@Q?&)bu@P`E<1~{%c z-~JKdYEKFdUPX+<@r~zw(%>}r@->kmmRZ*K4I7Tu7*2_dtg%V+=~L(j;KQc}mXb)cdyIuFI~kY70?f05=S)giX;Da^zKdaIO!~o{0GK zzUQ&uI>oxS-xuD8-n-rC-Q;f6F6)wBes6Qnu4p=XaGz{1dwP82cv!ySiXsTrLT!~2 zNgluTyDsf;Tty6B*VY7 zYlD;8YX(jy;TVxTYjP^R+@eW{U5ef2$-c?HQ`eh>(}3XoQWBJFfp|BtVQ2_ z!+I%v3%OC`(QIJ9FnL&;JHMkGGgT_FM8VXKX*Ca~5qjY(A}P{Iq6gp9NcHaK2RLF2 zxzuy#b(_miZ#ndlnfc;V1^dt>Mj3aKCV3tl%p|eJ2iU zq2Y{}ZKuBfoJyS*OOroD3t3KBANBR-{Q!wm`T2WiC?uo!+}xZ@^OPQG8LUwz&*EVi z@x_37w|BOh-pn}UTAHJN9UpsD&kw7-%&P{>tDB0P`$q{{>~gU@%X 
z_tc}S{^+umb&s|+c;Bkayz_1LUO)Ew%}wNzczRa-rH1HiB=T_T)vn(u>bjkih%<~CHc7_X8D!`%%PIaO`9t_H$;i#Gn^lPBlFwSW1PL+l{d6rq3<}rT zs}iQK#@^X)7X+Fdx8X*HoJwws774`vVCxq z25(jw5`BO}hq~viIJ9a`MG*hGh^mAPC1OlMT2M=By?dD&E0ON7VrieI8tCiCi@ab1 zOC-($l%)Qat`ewXutgtfv+J=%Kb+e8S)!OUS!n7wxqC~Oz(H3AFu}X6Mvrlv786|- zv;>PFxqXa)&Rg7vBm#j_5&D> zr*+m=@-n?G!Vk}P$?Wd+pAV>8w5gL*=~*y@H>6Ps^GRq_Q&a3FmIh+9_4LZROss7; zvbTP)M6)w(4;)kK5>${Bhz-v>G6el#X?fd7Sk;_gZ{rdnL2WU{j6sS)lgh^8_IQ72jM0i$^{m=fM zl)+2BoL;YcY2>ujom3!`QoKnxu?mJn@DW1fXb6)Xe<5k)*3LXiExDaP;I8|D3o#DP&=M4m=fe!oXH>30CkN ziMH58ug&$v&N_p_)$4&RYz{A!rifvlRcccljgUi{PdbpUf)U)*ef;J_{5B)|uNv9w z0x-WO&8Ciz9-b>Z1S`*_QO%}#^)1G9Y?*iRZDgl%a2#Lka6fJ7ZQ9UZdo;Fow{V%+ zTQBgR{UjlAsKcnVUq}U(58svYPFOT3cv?RO+4LlO(nG`2MXf@~jAk%JLSfjXv5@;! zbVfN_MT{VnZjsZ|5f{{F7<^RuT3y|ldgH+{9m6${zESY453VBWeFi!Gw5k1JDX@IZ zFug`Ll@W$WFp<$f-Jrm}mm@QIe((&QRFgdkSWihuC#Xydl)b*(a_OtmFrhTa9I`Y( z&_|Lq4@09K>&N`WJ{`>EpVnoP@2PKZFCh#%_E6_QeK26r$euJFlLq6P;FrMmT@3mw z5UGIF<2YQ=eDLzi`5b`N-SI8ihoqVLMn@(j*}|E?U-=&}*-pY|mxjem01i$hO7W@9 zuX3tg*sbD6j51ye;cXvEOqyZ*V%({lKl}O7b0K<;(cti-7aYpsnAx(Lmx~P|Q3r)% zafgf&!BDg&hAuY$_XDH@c8>#3?tXN4g$>wyXK6%Y_K{&0yrK(k)75;Sl?K(_qQX;D3DYPK8}!eaB*tdwqrd5Bri5Ph7G z(g;x|QNSLZSZ#Y;AA2zjZ~`rKn5AN$q1TOY96g-tV+(IxT}wq)87YoOxftm!m2tB{ z#f-L+skgFJZu5F@TX2gx;ek#hL;y81X6%E2C$mKZ5%4>?twhS(H@YZN?=iYQ7=AS; zo%3Ny9;Om9r_`}X7$RYc!>3CU^;FIYUDh>j`C0??l0Us!f|^Ew5^~xL1(hnGR?1N5aNeH@-@qOMjIlre3L%2#y{M z{INX>A02~12y^GP8{VrKM~q%ytt!lDgt!W2S7l-zbj(7j@Asj&9f|I@7#&GJ)TW)Okku= zWxsYXqN%wpix%QNa!Ts_MM)%FfP}={%h?D@X~w8KQAuLaLOjNLnbA`QT$wqeNYDfd z5`}_TFXXn&O0>5o+Cf9;MZlL>A}8aVSh_H0Bo{KeF$?wMI}5}gr-hxf@ho9z6U*5^ z&z3G+l4LX)X_#NU_gNbyw2sW4n!De3ds3EwrD3y$nc2=uMaw|* zXIT%~QaOB4z}ies2#v!{_VV=Nd7Lj`pjhY0sq@jj22d}?p2jQ!EjEXTKYu=ZG*(DK zI(rN$=ueXYVCL;k(QUKl4$_f1^K-sw7hf7f#||gvUk6U?_c9?Ab}R9H*tX6i#Qr=X z>d#@vG^J7ZP&g!-U*)F(Efs6LL$MklXeUvJLmQ`v|%7TZG($ z^m{%f;CFD_*!G1f@p2xpO$P+8@ z`;w5S;`kjV==&7I-Q4EC2eqv-DSxeiPOi4+&HhQFPWUGI4S3@ zm<58^jMHj*1C?mR-9zu)eFtmBskXIE(@&k4p`TuxX_~H|AkFM$8@-Lx>uVfYN7uH+ 
zx>cu0Nsm~+r-Mo)+?=UzF0@f0qGC&q@%hFs6!&iLKQV2L=YV+AKW~@5d|~^2{l5OM zdj9YN{e=GP5%6E&j-#wiIW=LS(oB;_Ao$p762reDXIl%79hJF z4xU-D0M!CC3$WP>kJ~N4W-mNtw*Z^n<>KYoEkL#a#R60d&`iJ>oAu3gE^|KYeAf93 zr+lG!q3c_c=%a$2I2F+*YYx#yoKBbPRZ$9vqJ#kwoG!^G9uf}03MtT?hDIa_S;(uJ z)FUk2FUy)yT-u=Z>ChdywCxbpAs@in@7@^2ZA`bM#Qc%vv47dNpYTH8n)@)!Z9;b?Sjnbqht z(v8mYIL^ZxS%yq2<)!QsMr-TZHd?^OaY+`$y9SPl@}!zTm0;#{@~ZuWu#hMoa!J&n7ylA@xxk4-LzN0Lh7 zE;l8%a8Le|ZjVQ3;Q-4SuFYhQ)E=oN%_o{q;5GUV>xVNV>taR&D_v>v@Z+@TrsB@x z`~6S*f8u=G`G=TY^@rWnQPHmz`TYJ@-JXEkA8>nIJ;IZQpH*$Rce(|)+hY{bLNoSz zB>JX-SMH%2&W|wi*v@Tx!r1=T<18L zKT5>>Q92HPt)Mc%853J#h7%mo<#)S*A4OV^Si85QsK^tS0Cji7T^=8&L>^9wnlq@1 z{eUm1qn6R7vB6;ds4=4((#dkgR_QM*Dig=lk0M0@o02R~xBRf^p^lrnU)X-h_Nr&^ z65i?CKl7G*dZ~TGou_`@N4MymcN~8F;XN~(gTmjR@4tF}|LI@agH1eFd_^)R4;KdbRj{f6Gz|zj=&h841OGEfGfKd zhJI(3bP6RYEhvf60uFp;L>vx4UZS%UqC1`J$kGOi94ynAAt_shZH{f8U$|w}<_HUu z{TCKp5SbX8t4{rWG$?+R}#Z#(|r zi4MHn8Tb_%f?uH__!U~Oa{N*$QFT>NCul%Q3lUYg%s3+M&#F60z1IBnS`^!B64;pf4Rq2f4(kdXUgyIlc}a6rN+fIS_YP-U7k48?RGFe(`I73l(Ssxs}@ z8OnKepqx!W{Oq&WZ@!>ScoUrW?B^GB_2=L1@9+P~ z3lHw6-|2tg^OJBKt@v#DZJ64RgGVxdqJx~^oW4krOYC-=jfheSlz&YvaS*$WbxuI{ zHQ45e7bLWVOVFY&Nfj)1Y4xY7Iawze?yQ=i36!2-+yz~Gnj!-P<9{ zKnFnG9mGyCjhP%FFge1Uz39+-FJ@`s_qbH?^>@Y0*S{5Skq^Gm-~4>P>jfrTRpkF6 zRLc*M5WSN1&4dKRhyDSI=NJOphrqTWurdVJ^1WTjbOYvBuTcSDON0{YbZJy1L0!#w zG$j}p9Ui@$lv9__k^y{H-})enu)ZzN`ch7sx7jDFliO`;Y+GzQY!b00Y&&f^Td(a6 z8-&gLXPm7AU~^h%0q~D*x0?fBaHfrvFZc)m_!5~Fmggk+4k&;PauB~h&gp>dps<33 z>6qP13s`RsN88#{$Mh3Td_?qPrx-D2W8rP|HG1po^^N7sADa=sj3Bv#O)7dBnqQFF^J zU3>TXGgT#zJ*uCxcM?=%esV5s_hafc}Dw+yP9w-maohg9xF; z0F_r-QqpL6+YY)Uo>UZyG=%J4r#B!1b3Ad`7I0`z5TF7h15aTE&k6*6$efPYo|8&l1CXHOlfK^)wz}6gI$y0M}nY#A38p zE0E&T``B2hGlC3jLJeaYa>0{9VNGynFc-`RrJxY75G7!dtbm0vffRZhkLhUMNpuQH zkT=LtC=uq6o0HB6vGx((Dn{%W%;`Ha89~+x2$O7H3Vcq^D4IHJC>nt76;J>W7xoHc zK1dOpfEZpT-812S;wHnbxNRx7;*3$3-4mlE1HtV^0$NHTtT{BIiL$3htn2QOZD=zj zwM=6(ll~dM-r9Xi%!cQmrA5_ND(O(f}F;MN=dv(Xc61wHnqjEL|iGaR+l*JMG#~;#(M!67qj;* zj?;ZKe!0D(o(l0SBL{`&n7qg5CS@bzW}~iCO7HW>B}WMOc5;Bc{8xt8l?@q? 
zkN?Q$FaT3n2k;3eo&b6cwX(V)=c2A?DXzLw=>~S*UjiAdl$Hh$<-ayr;=XLKUbhq# z-NG5&L?^?a)hld9MRh}|t=TryCfY3QvO%C=WGIuhB^b8Y!VI5n=qDUe3J6D>laCST-T$BrQnM^^PYwBCu#MD7h6Glh~BPDe0x|mT+ z5QMS@VIDh(Tvq^kf!EOOB^^|z)^K&`M6c5Yzo=`HpL0~np~W~y#UOAd;A>oPSsSFd z`r5X&KrI?lKyZjxCy?5_0b3cD=V)2F66{qGFFRQC*@3_Ie@+8`{4D|_zCNz)xpmR@ zzITO-oa3^$Z+@0$Lr-+mQYs<}p{oD={(tL>wA*T3NShDra0;T{NJYgF}>~7x?|Aj znomHTvot!Q|6;5Ay2G$8w5@}AVL#ERqKb;6M;x=KL*ZRNYLqpXi^gCwhit5aZJ5Zn zK7P*f=1VU-=iGD8x-3v4r5{^+-uS00r#82*>w}5Kr05@^4}Z4z;%UDkF%DMla5Zew>4W+k;iKVvSW4If?m*B7icXcFOLMuM?lZ``dk9%~50LeE z)?E?iR6NWnx`Wel2dCwZ0b1_hRMSz;dJ?IpOj_>Xv>ey}Hfgzo({fy&HlQIYEe_5u z95nBsjv3I0dW|Tf>*14OVNG~vI2Y~>OJPx{FA7?uJ!s8Ni1em`BFKmPbhp=QQgLgD zzdVDSYyUycy+g>k#K)J_?)y*nFe3y3i%(28IU#Uy9)GGyN8%YbaF5O0CY_n9I4W2) zKB#zA&92!rQPI<0#T}y_%~zoH)i*Nn82lk-s4guC91&$GSao|q@Y(DHh7`zs5hwSE zE8qm5VG0QT<5#`c{@5&C>#kmT-ukDd^aHylubDpTy1wFTSMecE3Bx zd7(-S4@-sHC7u z4lhglV(F-w9!ArI42$}tQ6tj`-xG*y_@>zHF(H-;x`vljrf5nI%51g(nUxl~WVl*V z5)`@SlmW$W!>@2^0|y8-)}AmwF=KeD!tHX%W%1&Y7{W^sU82xbMLE)-w8Vx-QKHat zca%m?AQq4_SWGwp3^qY$!A-x0?w|_op*h1jyd>fGpWDJJMpKR&E_TJ5f;YH8Jwx$S zhY$$WLY945HBE-mVT|!InhF53pH((2I;zZ?rDfK|ImQmBSWkKJ11tun0}B?44UX5OewS5!hQ zZ(yW57zFX=b21PLN}(X4e;5<3NYCHzTJZDhRzEj;*8H>jJ1$y&#r6OA))W7}O+M&( z;o00{jpOJ$bGKY`+gIQDS^qyDqQBFxyz`QC*H4;!MKZK7GxmuktAD(B`D-`2@3`y6 zOJ~;CudF(2@6}hmzJ9}B84rMHFCIrctR^gCv-(A1x5lPBE1<6@z~lr;o#a0;MI69( z3q}q|Bm+4H4Oezc=0Xlk_aVkPBP832W>=`95m~j%RFEr}HYL|)-aDedcLZt<XCZrf(>RKG(WRPWKACy#4CB>Qc zZ6+}^RW^LV1{n;ip)j3QybpVIf;OEpJd9gt%$NkB0DZyIeC&et3QrYe+35iG)xMWO z-{5y7b0kA*o12lwY?EU(o86vLwScN>L=*%tumGiK2n2rBZilx*v1zJEs9fu$&T_k9 z80r?-b9!iOuOV-d1sRZ`CIo|)J3jdh6G~1*BYkasZPDKjT!5&r}PK?9X~pj zDh+2oJCG? 
zFi>bgXW)}-5N0jq<~5|VmYO!-u~~Ht6Fg?BJJg+OPVH4cP*18hqL!*_Fr<$bl8&l* zRV#(r4uPkF2owFGd~czI>a2J@C5o&_nqo`IM0!-(DdnVI>8PaiN+%_OND1i;TuBi5 zOz{>#LH3cjP$r?Dg%&Bm6cXI~gbTR<`niG!;A;c+6x^F(|4)T>9jqA4B?a=zH4uhc zOb1yvl!986)I7)bOPZ`OZyK;CdW6d_{?YYCB2?CG-QC^N-(G+Ht0E~4e4%7k{oj|B|qYF?xarBV&<8$8s*EbmM)5{cxCJ8WU-lhj>gvor3* zbT)Qm48{iQEso3}oH_E=E1d1>hucQ6ADgFBRVOWtPxMa=&Gyd@wfoyc-xj_tKJ0o@ ze=_Q{yCT{OVY#?MzRJ1AwZ-*4=U#Q6w%6$lI&X9SMG)QP3p}emH+V!3ypqou={i{P zXOVU^YX{j0)5A%k5|76LRn2N+98KpYTMJP=7D+ceSx<$Vt0C_482asuY$X4gIeCzc zR5&uFpe4|w4R?m4Glfs302F2o1x_NxNMtF*bftwty2tT6h6FUqNEtmpUSvt!MHUM$ zGFiA$mMyAy-A0)-czpio(C3w?WG~#`!q7w?(O9h6XJ2xji1A6fP*Y4v5b%7dr~dE%!~p zKLHwMDYgyVdOX0egW%_4)9&KWfADVq|E&A#?JxYXbXVktdE1_Q@|G2M(OX0NU#G=X zd!7n6?s_b?vg0RjzWwq|7y;;`R=0e!3V=dZ)pFe;tJ{LRv!1~mJ}VVD>=ukcK@O{V z18rvVg4sMM!W>Xw_Zp~bkTf%;kdtXt1$SN{htEpoRSo2@4d!5eh(Y35fT6KNTP&2J zSt4YD_xZmn_;u6&EID}I` z-UK`1MsK$pvOjM08nq1a!W4yT%?q!)6Z$w+JmAYNCwfk3al$2vl@4AIOypz~msR3r z?xZ97+eh@bmb<~sspn$fO50-l6~0Z%HMR}*1Ij_qKHtBTuhc4s zw~AD`D&19{N?&bY92x8ThW$4CgW?0ur|C1oGY$|pvQIha{<-wF@{W33I_~+{cS`wE zjXTi2N{f>pb$)d7qlX`T1pz9ixjmAPc9A2KIkug{l()mxTrZj5uv zfqIdqw}%WEL!K^>z{@1>{xr{Qfts7Pe` zskXLoU)x%InYh;${%AnPW0D^G;MEJJLS~6Vmu#IrRP-xIfQ1vVa;sZEY;)^P?8jjC zo0cn`Z944c&Fyw2oC)DYxaTN-?))3wq|TFoj%K=!nai$qTCQOr{{ax2H+I|V;KxIH z+3ab#`itgSg|VAQciR%?O&@NS484ic_jwYmLdeu_?6K9cYW9$E0&`ZQdhV=dHS)6A zJ~TxMm!MJO9=`Un8l(4>Vx6p9M&muI{$A>i{$LZ*3cGYh{4> z_uqmitb0rl=5nw<#Dw9bSSiwJ{f7=d+bq>Td*IR0XYbq9-+k!WVZVc5{Ox1jSA{G4 z9{lAIVd+=z3Y+(S{W^@F8T z=l5}D_l@%bq}&(8FG$RG%#W{%uTwX=H+gQ=ws{_KJ?rW59Cv^0(cMmG!s`iO{&+o3 z)rS>t(V(VawTsIshgCHgibhI8KY|2nN%SF*)*?2uLm^UD&RnS^9LC(Rm%zF(=tL~B zxF~o)N&Ym5JecR+Y)RVvh+_Hn0hne207s1o>l1~O8l^2!v8G~6g;-G@wtTT+i?4+T ze6ivGr!UqtT@Y|WsCPa2|KZLA0AE<1*yLHy47K80o>;w&dBHQ0V__!W3`|V!kA{@zalgA2f1T=)JTZA79Aat2*@SY5Ylp75BPnV9$K4W!9MW(!gv8|g)JBg zhWtr!ga9*t(u-tnv`AHC*QMedd5(6evqM}WFVQ-jS|~2rym5yk0Bjo!%n3jy zu>|daWy>=3Y@i|rfmvvYabq_;!JinQ{7@6nx&f5|bF7;o2y+zkx8MMG7_$^6NFxPK 
zt%_7b9b9)d9ZqS{cF2;0m{9ydjK)|V2XjXOX7W83zA(Q`?Uu=sNHHeB0#BT77RUla z1CCsKA$TRg$OA;>eq7>YVmJ$S!O=j~rKA}{0F8&2%N{ELyj;X@jEpjuR3U!D#6i=J z4Tlu)xowDvz{F}qSpjR;wh@MrhB{lGtzIr)u1XdGjqP@Lxg_pzNWeiBxKWVAfo*<$ zER1Cc1!BfA;tw#<+YA(%^yKY7{R0hN`?otj=s$5_&(_=abltjj4;DpM-gR~VpZbpc z?Iv16U9bJ}wO{}Am#;$lz8QMsQApomx}V|H0TM+=s&Hxx34f50vXe%{7ih@%X@%V% zbW(rN0qczyF$JVPc!tj-I0Pkv7D^cLY%moLG3P{-dp<(k=MnNT4Kl=h9`H;9;_5Z7 z3*L?p_jcgA;JFSV_y|Og*_R;|C@uLA?G4e;j3^uSAais?Pez3`(Vfv;G#{0iPHB#m z(;{3LF*amf%(KoO;_hn}-jh(@P>-q-5>1_-0j5h!<&9FI^K&QqY15O%;!|+V9oOoN z$QgP*%%N;>|96)LcXq(DbpjRuGf+YP6e2z031hiHc&76Eg65}X>qg|gAWfi$eBqhS|-)s*4F7zT^Yn}2)R6Ek&3 zx5InoMHk(5R`(;_=dGGKdcAOOU)P-@r(QJs?rlONw>Ghuu!!7?HK*0WJcb9RxVhg^ ziU_-u7ycv`YlHYY#v>Vp0dUf&Vm%S|a!Z{TR!#`qVQ)>wQB~q`m%3-VMYlUZWgL+Rf3Nk-y8fdo)HGO;F?D?O!l047Gt@xKPN=kI09bJ8keN|x z5lEI!8;N5`J_lH}#pxYG@;T@@!+U1Hds7xEN?DkmGI75vn_3Ua1(cJ8 zpLq=k#XB-)jF}l=#=;&82wR{?17VM#OoceJLvV2&B}oW`WeJ6en~(_M0sxU}jqRmW za=5ENoA!2Kqz3{U17MLLR~Y1i$-g$khMAEq9@2P@FuI|V5iXqJ8V^F=VbN!4%EK>; zI5AuELU8lK+1Je)L&qFiweM?cd*$vE*Ix7Y$De;!_~la@Htu{(-oVEHcThQ9$S~}F5kbKN zIy;L}FctX$rwqJnNE~VoEAmi>A_=M@C@v^X9td4Ri>DUS3DZ!Sf*RZJ@lg+i3CL6L z8?z#f^E~%U_uC(GKkVt1dzD_>FFmTq2sR>CRxNTx_0e>^<3@Uy!(QvVL~6CQI_A0` zpbu&fI`#`a&YwG8b-$*+EB;pfwd)W1hnmmQ{~S)@^LoNAa3?0oA?gT_M?oA7(KJC} z9@8c-loe>o>=ZMWDhTGbt149#6)UF^3I?&$<)R*sOGn%k=$J!v>YCyaJevLrc|{fU z6j1|M>mdrRR}k=#a*6?`Q`A%yG5HFFXasy~Gkw%|zUz8tx#n4@sMi|^?2hd>lv&Ca zMO1o(iH18NUN4l-MAx71-ONo^ZKq7jIt=Ujhx(}#AGHlgc1+!3hV{0BWxcHsY1`=W zY_)Sm$~+#na}BG>-c(RZy4~U8M#$~&8;)YcZHExu9pcw}${Hb!_ZnJJBP}m$R3YD4 zJh_#-iP1l8TzNs1F7G}JDjQr*SD94DWjadhAqW%A#T6b>!SN+buU(pL%``Z}KHG{M6hP+lMl`x#^3Rx3$K$wmo zWx`l)im4hx28NQ3^pNz_5V&FpoE!p|4}r^uzzGXBHfN>sK>2v}0(DYFwtPwXX7w)h zmWuEBzhCpR=u$(`aHwut&D$Y4CbS5GK8kAL`S$thd~LpCzH`27g?)v(LR;Zj;auVB zPIp&&nDee;*qDlWTB~DmdU4f;hMzG=DL%qc3SV(W#PEk3 zMzP07F;WrX1cS-tfO!mcB*8??NP<1ZLIZH*!AN8LQ0!Rt$hoIFGJoLNXW3yRy|$-j*^%akY=?9VIl6+A>NQ#e&2t2yav& z78?c}HVi@w2UrUku%9(DVdAe5%qA*4+}`iuF9JgY@2&vmzF2KyFOz!q( 
zJ>?$|+KlZb1o)u~(OYe6r(3~&j00tKNX+YIaOsK9*0n5#%gTY26U>l}bMuL{OzlFj z$2#t(W+zNpd_nXWEk-<`Q8~OMDF7%5M> zoc3W_j8;{unv#)Xq*O0vQ6(8hm&^kma{d*9F_829H6pBb_cD z!2eI{2OOX-BeYYIq%vl!WYlKcx*CW?AI#lCv2WS@6k)1$DrH>?XncM6s!sme2iWh z*HYQ(sgOs@lV!QNG*>E=mX^os>aLU zo-2b_Mte)&alRXRH}Yryr=d?HeiocNmU##}#Zlgf+3%neg?!a7Q$L_4dt+2 zv;;8Gl)H>bR!j*(FGXC)PMV`9sZ@%E;4=}&#~g($-Uh(eMlsWX5UGv1LoxkvL`^J( zsT;__v=I(`ZR#BqA(6Nauv%%PWa^m#kbE*=*&BkY_=G`#2ZPN&(^15=4<0njeCJRs zFn!3{SU1&lcabT?IV6QeGAZ-5sJou&Pj(q-Mw!2 z<-67z{r~vkiz|hOmV2&#{;8|3ejcH-|GInT-LJ0i|GfY0@6h{SYPsXcFW-3O2)N_T zV3A{)f1BROx#Jn$?Sd=gzxuNO!)HB^bjo9DWb<{7aREZs;!J@&|@*k9@cS(=H%Nm(rAhUUC1!H=d1A_5T#C9$(n zR6n}mQ9^s+t3<~RMobSxQGRLSLjkq8o0)JoGqH0^gymyKX?c#g01D1#GDA8;2{q{= z;_H~dnHk2ob=Ja#!a3Fjjgk*>of!jtbGLl(%PGvNdT#$k;wO+>m@YVsQ-XGf1C-Q} z3S8)z98lEaNO6rL9jHk*I>rPpa7+ngZF3#V9A9exDsqoV)>NL8Jg4%)${jU3Yiwi6 z#tdt&nc|pIHhI{bvN^+++ZL5A8rEL3rRLqr<7J;EKdI8ki)7mKsheLOwPXxygZBr9uifNq`6Q)KAQ?OO}fp9zNr}j_hA^-Wr16xUl9E zR4D}53{!AsLmNCaMaoMpX|L2WMUS%sRkG;(g8;@H5%!a@sfqDl`nQ7zr&7&URjb=$&jdUfs(C$IeVoiASV zy(NFx`J+!B`rdV$pLyY$jnB-DUX&WOc;49D9kl7a2PxhD;FhmfeDV6m?~B#H?tSUC zpS!|Wo`Ogcs`da<1MazB5{mY9sDObBTd9MjvQ*^7V)4Scb-G6)FLG2mG zi~1q&!N4cl$AN#j`t*MX^6?TMR&%<~Z7Db#gMLRmCV3`#ZUKfy23~@<62g}0DMS#K z$K%xD=|cooB;fa_d|Ch(9w%IRDTf9srulscXmBX37f4)>3$^i=;zGPfXx{5Vj~Ia- zVUFQw_8C55f$t@s;On91?(@)cGC2mf-W;>fjD)k!Inyc5a^{@^2H@PTT10uEjP6)s zGu(RU!oIaw1P=WY;EAw)>R1Fz)7PGehV>I10z<=MB1_q{L&=2F8P2#1+9n)!O-*+A z?A-8&g$Mm%hpgD$`<)xXwboX~msHoeVZRo$^S-q+vw$f89RrNh|)s3647FK@s^Ihk*&StX& ztCo~B=y_6H&1T7jdx_sdSCFouhWr6EM!I)Mo??qjB0W9?jtzmMLtvKQBF8Mh1+)AX z%<^0C(DiSwA#N=95ZE&WX6?70W$m|M)_x0S?YCgR#S-}Y4lcg(4N3P7fn5bvlm#Sm zY7d+6?~UmQV@yfMq+{xzLLVmN-^!;GLdc#-s^M4ywvl8>Tq$A{8cPC_z(DPd6y1^9 znG#Z=P}H5;;iX=QaU3t#1Ywzj>6>vJZ-ARHnK38CD85%k6AfDk97+Rue9WsIF`{wF7<-UK2bcg_>f#CR~4Jcv(#`z9cKgBaV>TZ_2?j zZfeK%*PP?UP}M#n)F-VIj>6K8oxr9p~ z+X4=<7i97amv8}`Vu~Y@ck@ibj&Emq+ckAZfC%s4miCr>T?xY3&yN9L4!q|34urI zS`@Y-syHAs!yNChVJ!+`oUzbgU@c0983i$)+1h`u%7wrS&sc9TWU1L>qrSJ|>IX`% 
zfAzc1btUJYv*ugfa~EHD<9I23-;4#9&po(nUtgv0osI?L?|ZWE0b$R^jk6xUr|%s$ zuT^-zr$B<%go&K{$*_BHmZew=0we&1g$6^Hk`4qhVLA{nLCfSkbg})A7-(h+<>d|u zhF0_cH5gihc49JDlK;qJuC$`SN(BqhAiNA_WbSk*YoPPf>1*l~@}K$l=t zH7*e7-Pkcz5Lk*gATbH99zU~Oyr35@F!+u~(Y?qRzXyin= zncF`ZM*>5_iB8Koz=Z;XTxbvRXg&GA?@@iw4gbE!9zX>6XUrTAri9#8$UO$zdL3YU z9hgBIZLm!Y2}K@2WM;5namfgeX5j-s)k0h{Wu{JAPlXA6KmQHAZbWJMaJv1KzL({L zU;S>&nvEN!VNehRasey_aqyEW?95uu^@AGKrB+8=(duEY>gq<bm3>l> zmN2RS?~c;KL1WRwb(+f$rie^wIzaVTU|uQYp|16DX83BC8<&6j3PXYaICsP=#|0c7(r!d*&5xDpf{~j zQ{V!Y@v!a00C(WN*UX0>GW*9d*bbo;F>4`nHS#|^ycXB^TINH(F4PR`hk;~25i=R5 z(>)BU%J;+36|dC9Q5+T>;ZP{fwpZBGfnjE=%$R8p>yi-@0(%MH1t!JBH@>%|9F?0RwP`twGweD?}kKY7~?n~HPc zE8n<%+jFyYHB|m$JaqZt)$>QKTE6V@bn(qCQ@(%ej2mYJ+^%RvO1pCS*{y5CYwwt5 zEWBXE#*<&&diFT_UR7MLnqGTe`=v9_{s!Y8Rq%Z+c3IMeJ!h%$`vj?N8Z%M{UesvtKW25D;JndFw4B z33IM6UeyGX4b<1ujfg>6IV4<4unRwK za#_Px?%Ih>Z`_8v-Mz(q(k)|#pE4@gbM6g9yPncrgwDif?bvdffM-G!iFou4Wz2EK zWDmBp6o_>n1CC{|-4Jw$K4yw;(m%xR1Ruhd>$6sd=x(-a4zm{*Ov8xo*o(z$AhuTT z*;!xW>gD~c7g<+w8!|;SHVau|v3>9F@vu%PoR060HLCVt>}({R+!JDV5JMGg6ax4~ zgM+RuloRWMUB?M|WN!ysdBPNVNG43nz;0KWQKRZ0mW+mkT85pusBKJrSy7enYZ^^&)zZdz?|6$s6eo^tE72%?AXfnC^P^AZ?_G2dWz8I<3{ z7SV(ZTu;d%i2R(8cVp{rujdRKvS)}5d2lg6Mb?wj^*6j!hkv}?yTuDDG80ivTe6p1 zh#+)>6EVVs6{k#I=eA{}{=>8odHHgCT$>i6K^gs!)jF7wHw^B)VUfqe799Jsz6u*? z8YhJ;MiGE$VSUd7EUdlrLfJDlCh~9zB3LQDkcb5Fy2_L2Ow(q-DgnT7<2z z2!5y?h?#+yveRguOMGNf>%6HrHU0w+_=T?6AR;&g{a? 
z^*&gCfJ}wxlhK8>0HAtwNHmf-Zpq;retXr5w{C8~zqYF{@%&X+KlRMD8y~yvyW78d z;!!GgUNk{)e>p|)z4q#lUwQX6Y>7v)4?mG?GGbALN``L*TZD?B$gtm!nU3+m3S|y_ zW~hu4xe$K=fdL8>%TQ9n?v}*exKqNrr-W^O*ru zTJrwDR5DyyZGGrXbUxekh#nGdgo#?G;|!;FZ!_P4?*@p^2ZahmtlZ6aVml3QJ{WKl z{P^fL2(f)YA{Z2z`;GhHSYReItT6FgaixacKxDqdrkcE|`iptXPnc-_~mc_L$<}U-Iut~PB z#|oj;6Ayp>%tP1S^awrR|Kitgo<8sCmmi;B^1=%fniln5fB3_tE5G?jr~mbLK6zp8 zb1y!*ZQ)2@VmAK~_ABf~DAs9?i98(1a74n~fEVpaW z$u0wpKZJh^Fbr|OqoKwZN#TRSlO!T6Lug+~Tt@R;zBBM*sOe|GECSKZw|+rlbwWsh z=)nX0|Ghg~n(v;KrcBZl@Yd$;Z0Q}t!-!hi6M6H$?aEej=B{i&uR?SZ7_f3u72*!B2z^%n+wj`cma ztysQ2&|UV)4>rEKa_QomcJv>A`^R~DbNHdHxtlgW7Wl5P@w!Df-EvD}@6WE-vv|QH zBT9aFS8xBnK4KK`AP!scA8SN1mUG;&&;%{NX#pgZ8}QE+q@BpKQ+n({7H`_t##U{E zY;{os+cL6+f|wEYSSEPz$dMzgv=89a`3hL9Q8z2C3sx4B+YCgp4(v6_LSQ)~D(n#n zMp|%cmStgn_BO1PZV>Jk9LN-VLIsZ*e@YEW_zZUgdBM`!}IG6yTup#T)Z zehGGP!8NCiNJU&JH~MgSz7Ty7b;^dzV>;Pb*jk-T6Znrgf*f%iq)qfzY-h^08-;y! zK>EjpxwC0AqdcHB+=RpZ^=zdO+NLUo3{~E#vQcImG)5WWM<&$c2F!P)=J(69;CFXTF z%Fl?7)Eg48KkSl4F#$i{-LTE&h*Zm0PJRF{r0h!|aRr zC0G_@{;c1S27<0amVO#C(I2{XC?#Rd_`O);$+~*v zxCVAMGV#)e9fU>lC02@F2{39x7bV zwmuu(07hwTl2cT~wkO^5=%fDV%~xMIKQ?aE#gkrtU3_@^+LaAcF7bUwo6>&y_OF*R zIw*m;Xp>&U&f-B(zu3%j|yY6Dul6nJ*-yAWidb!xChK zk5DC4;ugzF6qV9+WvT+Az~JH`>`ypAzbf*D#Y5<+iUeqhkf?0uyYwF8L;@r74^AWi zybD1CF@sE9;t^oNx-FHove+7mVvDyF4-7?dI-yapBd+s62IZ_ki=+$Nl)z`rJ=xO> z3xcpdhyK?SM6d~!XjD{`!KMJwbZ-Zx zZlw%^zrRCKu?w!7Eg9O=;pT`iAXTtNbKXoxr@Uw21GYRFA;2UAVHGGLF(0`?u-( z?kjq}Rkvo#^owqpx$V)z{aWBTvFpP!2xkpi>x*uq~0XYQkUCAc-Zawo`ip z%O|lA(*fhR{R~tX@B+Y7V7E|VZx#zXZHPYpjH3Vm5J5%`nQ0D3W~1e6)D+5)O=;VL zC}P23;eo@#18jxJ27Z6mktj$E5FpT~|J(MS1!{{O1v#}HvZRd1PLeif!bKCVSuVCn zR5}Z6s%`CnsDQ0LnI?=Kr%d8&#zTNlA!fRymK6Im_eJ}Sq*cOr^ptdVrKIc#Y(Z6Df_e3?E#bI1fc+!zDiCI zS{Wv={m(5+el7OWyTZOqZP&MnJM>@3uPD9xN!?+WTd^I@EPa_Hr~lLWPuD-)ScD_F zB)5qF{Uu`)4rTzfA@V_%?D(gCpux-(h8PX7Tz zwwLhG^fgck?9RU$7*pXObwF-$U<0)TSz;3}o`wG%_<p;LF0=nL~&9b+}^( zWPq(F-l`uy>^_VD1oI5UGd(^-mhpB=9?@<)2;~^g0j3|fQiPjYgy_GWW4rB2k2unB 
zb^vXu1k`nM4^KVcFQ=B7Ym)HqB*ik(UnZt26&p5$`?WClz3=yZ`>}WE-yfQS08b{9 z(HHwC3G?Xv2flIV9neDV2dn!la2Ym0Qcd4CX$TO^7{=zIBuzxd}+$_V@nR&;Mg!&Y3%B&YW4Ed1mHWQOD?PNYimzT6Ar5Hl=Au z4MN@kO-+3&8j0>4$CxUnS|Lx&LUH~y>_v366-8RYPM2oWQs-=pl3swiu)2`7V!_rB zv^=uc6+I0>jn$JURw|E&fiM)cU<{e$(<~UDrpStut&?Z4^dgP263T#Smtz4Zr;~^-u=bd zPY$fvxpI=+5SO^Rwx&Dx>$^?%(si>NE+9(FL_pYn!j{ZO&^2t-d)A~!jQUB-^lwjE zM*XB^`Zp&nur~}m4F=y}iyCePEqi(Iu&Pb|C!p-a3uPu_;zQ3#MIC8_3$7YQ(JT%E zwDl2)2kPGXK}qAYA**H)2%?LKOGgSD>LkkQGd^MrvhQ%36L*0gpbg$=4aS!Ef$Q zq=)6#q}LMvEdAN{XMb{vAz#WfOfx9U4LA933B49lvJ&$`lM~BAHU65E>R@%o{7{p5 zY24z3Whu*2n}SQ!CF42`s}t6RE(+a{azp6%{IU#?dK{5g{B^x;XkPMk=X)c+YGt*3#=JzGt6{Jo)$_tA74PHv8cXBOmNo|L_GL-S}qYk1X(Z z$GPXU-?=>h%Cd{jImFI=>rK|S|LMq2-`X8{@A|^!KW1e^%=*K~8xdrV{5pGD5}inL zF_#B1m+fp3LE|a*^iFqgGSuV&FYXX5!NpCU%jTHpT4dN1u|(xGaE<`Eh}qI$2gHyD z4S^Xr5swI(GBn%rA66JVfx=&mRRN1Il$t&{$I8CMi9%%xxP^)}#|8s=bOEdKcf+_V zB*z7z#>ot?7}88eZnLJ@Ay{&Nk$)P6?G6Vd(?oL^!X5MMGce-G*y9_FXG=*An#EwP z)Y3Vf4u^0y16IoW?h_jI8`Bs(6hU_qs%+M3 z(i|8vTF7w1Xb;DaL1ghdA{%as4`I)&5#eY%tsSn&ilo?H9}UCGJKW+dR}-0y2zHcd1-U|Gec1kyyB{{#`C!9nCJ?SN$!l7aZG9k39I z+U&3{PoP6pjKzo}I`uhD=+Pk%2X?_!q`gNzx__q;dh5k|a5R>12m~a9dx)NgW1M)d zQAL2dgdP5=amvb{w?)2w_1BTFHvW9hqu+adk756bo$o|W+;;LEA1c?Q{O>?mJ*Zi<~qqRs@VhK zZau{jC5vY@DTXY=G^H56&<#E_>>McyR}5Z>Icywyvv9OxgU6(kx(&h%VS^AdqA78% zt)NvxG!$z}mesn|#)7sPwmO@P3Gr39gounbAv$Rj18>8o6Di;VW5E%!p*EcVC{CMM zTaq{a(bGjGh13GM{t}{)(u`UdA2>#~aJ5Bt>6mQ3AM zTzPAa^3iX8^W{aiJ8N!I&i(rCXKLH9d)bF6bp@nQ$g9~xF`ra8C^4EW+*qN=6%5uI zc?GCF5?5*NnFZGa=PeNCX5h$0L?1HF1O~`%oC&E$cmpV1&rcM~<9$ z01OfM3WeOu7l0g_EcD!l30x|@;GWRIWR&zZmiD3O=`mY;yPD`Qp&gF32rONYdf0FmN()?tzwxF9Q+*%U~wh z7-bKA6j{p-z8ksc5@6yd*00muT}m~;vZtm@!)Rk6NfLv(TJo7ge0sc1xwohV(}o3 z5eU?@WT0jVRILZMMwTWS{`OTozW-xT*hpj5!nwfz@;Bwzu)PF2)mA;T-<}AHkl0gb zk>g>QXPUX#Y!_G1jZ~8vaBt=&SvFe~&MhXhB4foS#vGjc0YS(*A`tOt`tBxKHA2!V zx@IF1U9$}UOBi;>GODfFhUGX!sJ412tZszLs=;QC?T6vxNW&&@afDqCWG=B#6UO^* z2nbPPs93A<&KP1SXcd?g%dNCl>vRP8$RP43+1nQcgt635AW-m5%dS*}g*y$>X*`cU zVUL?^R&fF3V1GJkB3WPr1nx42XA8L9GrPNp} 
z=VZSBxf2mcEnli!^3`l*;OhbOBaSAfO@J0Z|D2%ZNjS=iZp1(!1>T8rWMd{kF~F`d z7LBn@7?VRMUXfIpi0^8G8A*+L8nezELvO&5+StbS8iePfg85_!RA`}z4~|OJj}=(6 zpL`Xc(b&WWJsvJ@%9TQKxt@@}OqvP@NmKnb(ww*&&m8|!X-V8t&l11;HuG&R9_RP_ zXD)V&YZQqUyIFEle!^tKWP6pN%052A}m4Ta0USsyTt+VT_{y}JPCU~s{_ki=0XgY0*^ zu!|SAy}94xi35c&8QUl~Y$lv7!Iu`U2?$}yz|P*I5{CH2~PLhwKhSCrE5}h8Y5pHE1wk$QVyB({3BIUxcWH zzpy+G)iMtR{mJGcn14`LVs}L@dhWf<y=b!NnaPm^aQp*ze;QXNdkX`~=T$e5f+`KyqU zM~X%YLgXurV`)krNM)%m%ucbfbLrUVv6j=30n4*-saUOOrbN|`)U#StdEzSuyR>B!kBFsHV&Lj-;=+-+4BdM>rN#KDA z{D}Q^Qs&E6=C+2dE3E@oMHQmDs*uIQw)z8dGlxU05YNSLjlPb^$6~m}#+w9=5c!Ze zMu_z>Z!s?(70uPh8_PC904zC>dEG;2wB(>0a1cAvr~wCI6eVWB;WI!W5m!#?1{_c- zehUW&EGY;B4s7f}8aqj7wh0C|T;LI-Qv7ME4RqMSe{zE-WOR|2OgBi*I460hf?9$N z4sVPp!tq^H4^3EaAn?((&_K=?cDAzP^H?LV77!eI#q=i= z-)TB*;(s@@ADZtm^Ir31X1>_m4wn#+{;Vzv7r*)G71Penos+m?5$gTj$k}`)cF^wV$=90gVv^fqg2{)ytaw=3?!>vOI2>o% z?mSPD-7*9)Ok} z9VV3cC#Fxl{5{iqUjCBlB`-f{I_Tw(n;!S_?WXNszQwe~%P%%v?B!pZzmDf!=B{|Y z%)Bg~+s*cP9`7}qV878O$*wP*=xwKiGkb+Ys&L>GP#-SzcAG9SZGlZs<}Hn{a60T2 zFlZb0B~NztGYm%sjtv#Eyag*-lK=b%Vnc$l7oMgSjA=!E?rh&VOGaZDT_esXq@QW$E2$@kZHB1AlO2&U0)YDNPr^Av#CJ8t zIPhdDJ(hLU-h>2>zd(@KcVhb=rCyjIQlRflNyk%<$EX@GYvYdS!G2|E!{l5mmff=9m__6wrL@50j%QF3tcgp^!!p0&`abg&L%hwWV> z2o|zzG@C3iMF+x&)s6+)qFQY*1P1|Oy~Ab$6JKtqgLo>bVeSso$x)NrqhiI{K=xY&6ZVf$ zvNHGYaX5dTPO6!jKpu{w%Ylfr8*GA;6dok=gjhRNe89^mvjGOd%@+m)h=&iAA6O0W zVI!9^x?kI6wp;AVeyn8j$>W-2yN0>x{BE(}pkzh>L*m_?g8YxuRrK#0yE8O{#Y*Y6@Zu0WWOqY52M$<+wZ#TEc^F`)G(RmJM2C@|Yu$N{wq^`eoIx*E*6xmJ`$QGy; zvLY3B%=T=D?=0-0XtF!{c?|y&`dusLI`LTq?iCXqD&HC&Cdmpc7qoEb%f{$bkInM) z&&RA6Qyo_1u&2GttOk*X(F1e zsRHBXv6kA(4TZ`Kg%dOV9+ytCjgBP73=_*eGI9#4 z2A;-tm_{t{gw1*iDOaFm8Yd#=#z9H}S&(&dkitVY`*t7B+D+ihGsi>1l1 zPy!U5ZJlDAYMo-|fQB(x{02`|+=0T54ukiK`=MdD`$GD=;aNa$maMGlW=m4ibR~}n z5JX^xPcBfK6-Cv^$~eWV8mSrnx(r-l5T^&d!76iDCRO(G@~{gU0CyV(p+|qzpcv+v zDa{5m;3|Z$2C-$>%dZ&zmy(5$S+%MEhK8o|7c~PL7{9;k#_8>3oIa*XM_evAi#SG{ z2ijp&U^hsnB@i$yL^vjGCqXzS3Np5EAROc9@5ibT7+n?i!6nP@N}F{Gn*yRfh->KD 
zPs2V$jAUxk>~^{*TPS9ALyTXLzli#V=@DHaXbFiEOGp8h5DMc0Wgs)YyC)IRL_tD^ zjv7{&_%bCNUq(Z+I}6bmX^jdJej4EY7F;?KD+{1VBfKACD1_AGDI~T+Xj%#30zn^6 zXDl7`;ah&rf5%KCxAV(RN+X|r3IWJm{>I2-CvM{({57HgzuW~_Tn1QlL1tXz?ReB6 zn`PFdyW-qwF3C*piAOo2r@ANBWsid-v<^W0nMhjVpfbv{$xKl zLf+Ntz?4f!j8F6?%Eo{Uabu4Y7r!|roh4f1(lwe+9{#|13+$BKf@ux4iBm;sJ&DRAj=auKY{ zl#cOaAvhg&>V4uo@EzjJY-q=QY8xWfDuY@mx=?x0IF0 zrRGvgsiWLE#Z%(7dc1^}JaoYlDv%MBL&be8k_vYrgCbBEUEtl&#<4dw<=FC^A66cy5mF4DhZOfdEo;9pp0bRq^>1_A(D;Ju{UFl}eX73fsHKuE<+myZLCp^DU zo;Sat{LcKQ^L5W(ln>1xIzR9f0HH$N$qvO(cOqTErNtFQfG;r7xo%hsR-6;!-F_?d zRPh3=I8)3mtT>yk*xLzy6zGp2@nK&BS(oKtyO9lNVZqt$_PU@&#h@P<2XXV56=$y% zL;!fGwphJhHD<%vp^i1+%)P>Zv$-&VC4g$G+HnrTQu{Vuw`V98Lm<2=`(@ftBlm~>j4;Hv!<~a%WoZ+v-A1H(Om3JXe20<>mVfFcN zPXHJXn(|nPhEmD^3Q!YDDf7bivy9+yCi*J8o(kR9ei{jwuO-IS}6V63qoHp?o;yh0{u%N~E>q7I1vDjW+ zr@&fxNyPpaW?hsqX%@?VWn_fse;U~mOrMkx*~(AwpGB_fpHaVrT|QEK;!AEDKczk` z0&)f6S7ijV)dBAf6U0s@tvzmCJfH8LAAgSfoOs1%hb4ZeFwZrbo3qI*-M0il?Tb%0pQ;LvVV&UJJXRU}O6^7M4!qeVxZ zJol#BuA7=ajyxZ^ie2>NUCS3t`a$Gs!+xiyeb4#>k&%%{WOm&p=YBr{rwJzA2HR>b zjDTC#Gz86gHf_z}h>S5}L^d682qglv+35fx0H+ItX+*GWe;m%Cm|f!*y6iHPR=F8Y z$p~_AG$ulJR2Bqe%{^3z&5f-J*`5~1S?w5s6tYc;6f$iB^jB;b^0S*ag&m}WfZel3N7sv$ z7`5eiA=s7zGm7IlB#%ZnM1(T%7YJV>aTiP@1TM~y3$@_uW`w_N|vL*xl}$! z+2r_+^Gb)=#tmjzDstBG`Er#hY_4_8bXsrYx68MfZZYqWA21m`98+hafkXa*n?X33 zXfPvcwl}yM7;J@evza`Dz@3@XP2;rEGXTSd`}q#==_WxF8I-$8yX_!ZL-vXpz%D>B&n{AZUzSQ? 
zV(*ekDM%Fi1#4GmAmA4mTtX0z3?aV1oDPW~V~qKi>^>$m6QcI^aRY54rhu^YDOf!G z8W8(Bmr9t|Mz>3gb z6*iqGs}ud@nCQOzZ_$>1Y9RB{;;{zfhSrbL?w5Dib~yG;eAV-6 z;_s7xpY~eu>r8 zfW6LDyX{HSNP2@rVvpf0TYHS8IF9QYlY)*W4olF~2YFUVm^BucWaY77Zpf464DqbA zP-4iMWeg=}73WO}N+nsiz#CdRtW01(?6V-2WI}9mPEn9e4V4BZufv_rQe|8cZ2oj9 zV8BHKtg-7brNB}$TDun5@0Z??DrAH)?U(-FnRnu;cF*iy4GJT}tQ z^o>(UMHBjXnVu<=d8QZC;HV`h7;cIxacTye_-8+3YhQRSa{DvSL~j4(i)`)Bwntuh z=mA!6-+ip${s$w!ZM-P$@U??)x9!_}`JlQvt-r2!+w~ilU1fOXaAe!Fha+k5qIYIw!Qzt4-U6pvOaZQ-A``6{mu?rdB*%r zB3jPeNGJR_odsS(3j3LWmo6L6=Xw5Vd_KRxZk&-YBWZrp*0j6R43oW+12fX9yj1}> z;|;WWTLUZ82GU+NzUKMB_>ujues?Y(XU~T;#uR%EpKV{pJNcXT-~0cR_)*daffJah ziX%Q5HhG*z_xP-ACmoUeS)QGltx)O<}hTkL@EF#CX^SY}qP8TsFpFrZ=7pF~a z7+j9a{DwUpV04iw>RUIQ1NRET#0~^9!6g<3mrX)0*(Cho z(*4TC&z^tpNctn+>AnB)P5t*pI=Ok;LNQ+ef>Ftj#~J4 zN-!J}`dc3nbVQBZ1YIl$bSkqz4u*j_95AF4FuCV1L5)Ok{DLjgsP+}b(yD8T_?A}W9nK@o7!jY zPr2NDMat{uR}lnBiOD>-JsZQ+(JB3*bR-2mt_RlTUz(PiUdS6kz6X)YTH$Js|Fadu^^8EQKELt`3H0dy7;?v@Nzhvsg{;RwW&W$s;F@s5CI<0~MH zn2=+Mn7btgJrF2-kPRFHCC~~qz!J{m5&>1C7ER=@VZ5+rT~p=aRlM@aHM>SOzx;zg zMBcyi>JJ}%XQZU=hJ`&px%Z+AA5s=M*G{aRc-F^nx2}wQ{@ZJgUCidQ@3IG<-tqGj z?<{|)dGB4f!QqlDEsFXR^GO-fME;9Wh}&hC0%;TIXa-wNeDQ<{9%!K(a?(7GG}=;v zqyv!KLl9@l$29*`5Xi-Rr^!hX@zCWbnwi-DiU@%wB2i$KGZV0ngeb7; zJ7fIB?ieftIuINa0>s*`_z6)F;qi?sf(TWF)ZGQXd0(VOzfWoS?1?bY1!DLtVI}$? 
zU8$v`?bAPoZ_$31_UExc^QCr5BFjyzNvsKdVE<^M!7`D3Px>DFuF_{d-_~RAcUBTjV(8l4e)n$mg?T9~EM z!`DdpW9Co(7)+iY!`^XC`Ms{N3Y#q0sJh%RM822ZxT^>-#9bhKKhw`FjUR ze~zyi&eSty>X|aNOv6o?y^s@&`Y1~XC-8)=lTPtbCgu({6VaIyrL>7*TV64GglS!3Afc23IvQEnp(TM zJPy+3C!={|CX`_5vk;kaBul>JU&nyrg2-SOyUmpmW`a&KFE_gnAXz53SNf@iLJ@}$Jrp)F6d z#*QC+e?{w+KR9u+c5B`J1D8Dnwg!`Wq+?3CvIorFX0gf#r41JTrCHu!Fc~cytcvwZ z1KTiz*MUWtWM8sOW0yl|39PdBL9{MBj#Kh+z}pkp9UH7r0LFH7XBUKx$nH^5aiOrM z6$b?)u_=YVA%ip0Ef29Rk@JtSn|9EBM`Q!lWIcPCcgQcICKIV9kQZf(!cL3vSLDBe zOa1mGVAg|ycMKNK@WCNDY!R~g*lKz8Zp40S{kD$IZQD9Kw((zg-h6W>;wMi+4&fun zAv`aIq;sE##HVyFOfyPSW;mJkdpyunk}TnP-{ki!pIG=rOW3m2B3sx50Lhg6DbTV% z!Cp|h`8mKt<3%62*c6E;8|Y||Bt(wMh>-gOsj!wRwiC&Iu>+K$VW}v+?1dX|L<7!~ zhxul4im*vP)LMwm%&_JJzN%K0(a8E}y%namYdmu0|0{1bTo zfITD2yG&Lf$3Pa8Sdu*|#31YzLC}Zf7I3bm^BHu=+!X?(HL%967J>>)F@Zf|vGX9-OM<&X{%=hW1k&m=1jnXnCWs6o+h`vaW3^Fr(%q96! z6&`9hTWh5Hh;f>ssZ*d&cB4&U@wMb5nmCL2%vBS_Cp`cBXXIm)T?YA;@8Xn_C*3K^ zuASpzIUX0(4C!L>+nkfL;E;K8Vs@gGo0n$~W~pBIlZ%6fO;CL-;EBsJB$WrULUOs` zvApc^&|`UdD`1s~OP%=}J4BtpeAk%1#P zO{Kw(Dw}7?@(rzuxpGCYbX7sPF^Q`!wY-hjUBebmXGrL;i*;O6kG^A4A^N9B^v``7{NPwOjGoB_&4=zlOdk%Te?T|+I9!H^FUKe4 zGUiaT(&L<24sW`uIvtOBlFL)Fvg6A=S;p-0oX40_ZhQ=GH3u+wodrL-p14$kOHRyj z@&ELGA`?Q?Pv|8dP7dktIjV-!Pwyp`;N%=OXrZ^0F-4d<14p5Ra9%dly(tLViTEbf&{d(ShN#LyN}mj9T}RkxP7|YcIW)>e>JQ7@~o2GZyXNInsTQ*vE#1D zZCBT$kM72~NHkDT(Eg&? 
zOO^sovO~Z3`8EZ!hhR}$itT~y5Jb!d?BzraJAgfI_;p|bgr^{qZGCDQK6ZvH@K8Pv8 zU>ORcVweh{Y|vYDjXWqu%3s|xxp?;SMd69%Lqn7M7t~)oSqXjp*2VQ{x#>ysMk0o% zB4_>c#>S=B9%s*D>J9?%w+MV=k#3<8)pk`wT!I<&$HO*rh0$s$S1OFOLBaY-$do+SW0{joVIw4=32`PLDH2{tWwcKwg!Y$`_a@)>66?b1Q5`vO z;`7MO&=BKN54#6ztP$&nN$L<#nQbrv3tOZtnPth`l%*)ftcjd$=PzNMJ!p`UEiB2b zt+wxL3k&$fB%DGLX{1_#|FqPQJzh|Mj7DhR(z**V9;~~9tJZ2rHu{||L~ViQtqe$YyVnxWom$Tvayh4jwD*dS6H(Y)wF!`Kx1uS`2JMP?^X$}jRuBQ3*h2CEp3 zz=oPEtvmUg5mX<<9I&KPDMmBd7{@+~V}%99=`%uA>5b`+3L( zoLr2v9B^Wn#TEEiwkrm+Da9W!eF&k5`-W@^4x+)Z+h#9E)q;V-z>I(#sP!M@Eg+`_ zVH%kr}RM4K%eSCz*7Q8fZ^C5~rB6ARj&~U)+P%tkp zm`hAR{KxF1bM5&7$V9N<6Si<5kmL6UsTD4#B;`0Ezz4Z4gF!6kppn9~`V3I)-0v6R z?LrL8M0lM7FHMe%8gYzc;DKk7g>@Wy37E9KI*GtSit~M?_Rg<|r>?&6tlJ8^_;h35 zSxgONRaeifOt>|5>*cpzKW+T>#te2@nbH2`aXFFaovmIJzjpda+=xvoZ)isbjJ_q|$i?p1^a*xc+lKe3Gu0HICbXElvRV1ZTeirU@ zFE!18P|#7@n@1M4(yf{7KVrxZDTZn)C@szoHC`D%|DNK z6)d0owDQG|3+CkA@co~HWiM?$`GRpf;ELpS15&nB%t{4pS+^!_P39fumRq z_xqFlfdp4l@}zvvlYBcg?Ajrsx;<=@larD_2=e>0awt8B^a*_XP?jy_Nqz@x;7ZaY ze#h{*N2VC*cLIL9P^3jgH}p>ya|e5UG7GHHfHmJ z4@$-Aq#zp~LaZ@uA>Vg@cwW)HGN11@fx zCn&|6aEX$Gu_qMkBrZadA_s8QK9oh$cH0kQy6W6vfGn?+qz zUIwVKG3S=C&oA`dCnZ6%g*GCU%4>_u3DIhGI$rz-1 zB!t_5Z(+gnZY)@vWFC*BM@+#I+Lcd0$&%^%m10C7oI-L+|3baeK+StFvgAg#LsPKC zw?F3JV+Z7Uz*Z_j({7ld#eW>?Pcfe{lx&028 z$?bD`jB!4v7yc%li9EsQOyUWCM>07%YZoN5Rxf$i+IJY~ljNpAQzh_u|P&=aJAA28GKt6*{b-IhKw|-E;CWLg0 z1?O!9I1)@SP@8ZcVeE84;OCEe~UgQ&2xa5{4 zY}%4rmpuC7e3lsb)skD9BEMMD&&uaVo=sv8ZDZ@Uu}33|C~S*t+ZJhJ4@H{z47N_d zCVK&AQaWdAej*)o&KAe)DBvx}aZP~A=ovc>XKW~O(h(aSc`eA+e(V-a(i#CL`HGwm zKgtI+=e2vBb^K`Fi{gBtsnL&~?j*GNeYHg*Jd*1?=?55NkTy73qERO?G{x zbDJ@n>M)fL$=71u7^MrSH-s)=I03UAGJGMP3-{|B0`LZAtN`J=J?!I@EA6w0J8I;5+2yY7$_G&8^D5C zU&ZEFvk>>mKGt$0Yu)`hs+4}>Jv0~f`^GM7i#z}m&<3LnbH%jF!j>7kEFLz}E{kis zEcURf@MN=6!D!^7yDW%x6Vs43S+!^0WzjC48_={c?U7H}WqGHt1jZ6J$}gUX07fp| z!R{E2JQI0(cnnWWL}mFZthM0#8tyO@qX7Z!F8@Ya&7mgAZAK`0<4-v%O*nZ-`pzu8 z1qnDwYOI~5N?}PodBX4tgr|#5XR)C$2uq+$NlM6Y09jNi0nLH<$5($Swwvx=;i~wj 
zIe;Y_fA{_=8?*>XdTsSnmwxs5$QsB8oQJV>GPfDhi68bKC?j+T7AAVnjyM(kKo359VkgpZxU5T`Z zP=_wm|GNl_P>vDzIpQ~-p{yNLHr~4ncMHN@sON2X_ZGvSCFp`Q!3{_!&OEA z(UGnq4zo_;7p3EL4_^vex-3=kqG((hlLu!DzA2h+ME)D1ag)@>VfjFmX$GMcTV)Zq zU@wZOh&!A{`I;!K!4gf})@U4bO3$6qIOvp~d!un#8s~X18dt{TF-UgLi_vr=@{{Ht z{h6dmo_C{hn4FKhH5#`_)n3?S7I_>d+1rTPFhxdt?a50-oO;Ne+$!P_cSL%xh>M;~ zzC^@{%;`?vBI1~hJS};9G_Lg_l?huyq7SvW)`wbL>q9NB^`REm`cR9Tq*WVa9{?wqzV-@w(>TjdE<(G(bD5S^Q<)nE29Fguu+?l^f#Jz}n@>@haeoX!Z z#NGK?+Y`s6Cn4_4|Gvl{5P5DC@zgPSf@9*DBL6)iPo9Vq_l+szMO?HIW`d~ zeN4JtZ_9%ctak&dCP^iTHA)>)JMOh`Jg@6x^t-cj(afc#_(l0>S{U}F8 zx!owUQ))%L74eOD))V`VD(SVKC>0||2!Yx%5p5|zya8o(;hhe|=OVsAl+h}!i{_n& z5<8GePy12FUi{u5YD)dkDcaHXFYB`!xqI;LOypdJ?@%r2dsN!kHfrTi-P^^t=qvqr zvlYKp@y*p}>t?)5y`tjT2GoUmPDQ#t9yRDARazjvLgi53P8099i?`aP^{55)G`-t~ zCt8j4ysAj=#aF5SV5=QhYON~i?V$JiP>W8y*^6GN6gjCy4S2E!^`gG$`37v1kI6}W zJ~pokpJTP6&OP`VwVc|T3w?cRUvI1qQ}MkrsdP+9wP^2X4OG-q>&F(v`tWXxXwQ04 zBN|UC`F{ex--Ps0SgaD?*^Do5K>G<=SK}XzGmX=Dga&+fm3Xd73&oQTq)Gvs35|Zcs8hSBUz^BBeXXK0qtcQ%Sz)0ecFM=^pW`c18)95jaO zQ9_@1MsKYW_0XaAbO@vp=nZ;Xn>(kb&yn) zNGpZr5#f%pEuaxqG3ThCT5&ydpV6G8x>D{|^bz3*9haTn_f&qDXnhXy=i;r=8F}W? 
zwK|+$v;VYjM@y$M)ZmlGwii92!J9tA&(s2~O=qeX;S?PPs3lrkXq?t##5L?sIkDz}Psx@8Zgy(3aG|aydeNB)?{k;ia(`K9o!<3V-`q+<2XlBuDM7of&s#a2`CeGQt|B8_^lF9tO($*ZU zeKeOSr62QH!{ej7IBGe~Xu=)@n^AcA*JF!Zi%{B_eG5iggFvm1Xx?gjA?gneE6~dH zFJR@HqIhS8NsA4B@q zden#J=4!xz_M6~MhY-SB+Fr01HK*s=3ajDdRRXJNutY1nzH8L?Xp~=r@#6^!whNJW zAO#np636@D}wB z*^s5AY)WejEdY*pi5JEj%7p=Vx zof<6aa7JI%zDHv_wk|qUkM{L9=ZIE{4q=N$J-%5(RrETIoK`RDJFO4&H%~7abMrwJ z9LW;3v7=qB?cUJccfrPXbyj!J#_pb$zRvCq6V!yg1ytnRvoAWp;5N8B3^2I64estX zNW-9wyX)ZY?$)@wyTjn_?(UcGJLiAbdGEfrZueeErIM;$mHn%|l1{p6q=LRa$6i)k zzjAqMePzA2q}>X9cmJTF^4M$xG;d|OF(q+9`@1>2wNw7=SZ{Wzw8073mL0sVSn?Rs zZ~n)c0@7SqdSL*`(Rws!{(Rn1B4c5#S}0>>V_sU{{PgftwRIq|wxv}(W2w2cthT;D zI>HOZwdG2&zOl8lsGgv4&NEHhZF%k;(dvxPoTxspP*HbkX}Ke((KB^OqL8by7h_s&3t zT%CjCLr@2-xxI5|OUE8~J(1OU-g<#?eWkY5nHliNfqSf(0rW~)8 zXrj}qw%c698Q%%d%d^62DZI$!@!LW$G&Yn&Bc`UTK2>&5==*{@424tUUE@8*FLXEd z9MPc}w3jcq4b2dAc*?cokG~Is<}17q=6|2HayaN_?>p?9?{I$Sk$Su^FGJ+HGnIE| zxWkKhIE20|n64=dAsjmDQ-5AwK)Zc#*e21 z4*t5bT$1v)m*7knPyS3%rT`I3q;}Tl7w41!K}H$(bT1k{QmS8BYimmjB#cOFi=gh@ zOc>agJY#@|*01@X9;e>Rp6MosfOE=aEPRzI8nX<}%M>h8m%*yT z z@@uP03qle$2u&CwTPfz10_VqlpqUO?B8)#u2ocF50X*=3@7Am`%8u6Qxs8nkqJIH{ zldF7oRo_@$-KiCcS(&`y!NS|my5^~J(C9)Z2C5;j%Szd|rm3t##oe`34DH1q+`!09 zJz}z*9K9!@RG4;0|#v470=$(OlN*%`eqFe~vc2{+F%lcn{aHOj~0W61*a!2`QWvNcqTrI@i4P zYTdHFp~YZcCtioE)s2Dn}RtozvQPd9gkdInii(^bB z7Lz$TOUtfF@Vc5Dru*aARM9y|bAa$T>2dTDq9avi3mQ-%i!T2GQ_Z44K|M@sQ0%K_ zaW7`xbaji0sye-(CU$0Ek#Xi`msliH)k*(VpadBfBQ%nzh3(&G3H-)e)399e4EEjA znB`rb-HdDHYoP$oYpkV zI{*`h;Id)4JaD?V!Hn}%H@?ZFA0L2;^w9B6z*IP#4~oCoMFud1b%WWg129ATm2K-o zSq67(Qt=sz={Jv2=ZBAe-lRmnA#I@W6sMdkk|HXFY)bx^`aadHEnOI2K`i^`Tc8FS zJ4qdE3+x1xW%L?EHxSjH7;Kmj1~vMdG(4;UysypgXh5838}R+ivaN0AJJyr{KT=_i zpIt{fboYIBI)||g;(3C7knsEFsQgItjN(`N1EjqosE6StyW^?4UoMsDOsA4mcB+v{ zsxY2dp=F^K96dWQ&?x9;f$2d-);bJA*n%$jTf{x1FbZMa37MrZYcMM)k)+srDqH{w z#1@*`J@Pi`+-dqHN>4uDo6t!KgHd=c$}C9$8|;m^5x%wdJCrP@2L^L*uJXaV8KjtOTfpFCYogaG*F{ zuA|3-@NGpC)VyN7!O>EL-kAgn`L+gwGHgXxXb!Ffq@u|1MWUaGt}c*Yk!L7gsqVNc 
zSiUuUGF@P|(7uuhgnD0WZM44XQrUKpVUUgqe!4{Pc_JubzP}??SiiGN#q}j@vwY+9 z##86`;7h%8*U^2Y>s#LjX7KjukJwi- z!%DLI7FM4sj4kT`p|65C71m!9I(;r_0u~ICj~%hOTnPYe^eIoH!jEHm?=uR&O{$n< zK2imsb(e_+dc3L6JGVQ(Jzzc6Zol)Mo4LG$fAlM8DU{Z2H3vAaHnrL^^w~pryK&B< z@oM4>BIiQ;H(`D{UHtDzb`K$6+X`C=A;WPI3l$)71?NZgy?Gt!an|eNAq1O%eNk-* z7%3tP8g+7X{}U|SxEh(|0J~;`X6OL>w#nUhh}v?g^tn7<4#3v^X=QfV z!B;^1_kx>Sn^Ky8(xAU2sNz@n60#Zw(C~u5=f|rPnl|4azl_YyxeBJ8G6-fO0O$*`il)~;f z)-S3tj`~t&m(y4x^U_zFkvW_F4l8J{dL!wZE$CO-y6!f53|@h{-9b||n>z982{k{k z&qY(*{aBPH;MION7f$iG`=KcYdeEf?kL(xzi89IG+N#)$?+=r{uZgtrJ;Iz-QqC|d zEg4ezCB*?LTTKu)9P!A zO@gOG(bTs!!)rE0GkI`>H6EZJEL=YGs(QR>8peFjOf$BP%WP63H0fdHwKLW|^ZMj2 zONj?1eSLtOdOWGw5?76v8pAzQxjdRbP~g5KDb>Quc$fl7AwRH9-$|(U4STft^?Bc9 zIIGDl$UEo&)Ks&iYT5)s51}b@cyn4c_(pMmFe#tGzaJ~ z2HLwxB9Pkg-ud3lagP7K^Kib+AXKniJUQfWA8Oiht-=TO?fXc&%+wHT2S<+EO0F|@&iE?Y5mYY=@_qnOM z%ev$6#M1pOuDdvCxu^NLYM^X|KyTx5N!9ReIoGFDhw9I$gQ-TTLv4yr#unGMcI=nC z=K`MbnK2V=SIh?k>soM2b*Py%)9)=DnH0ZVv|2v4ySyxQog^Aw_l4XC#)P23eIFi2 zt`crxl*0(QVopQ(ul0H9_^Ptb{_xw8anTzU#&0ZatS~&4@s!nlv_A?f=?fqRIgJ<9 zvAmN=-~DPTC~cJY6FT9=F3+^RuPNY>{hnU|BJC{Qb4wjpLyYjEvjs;TrOu-rWLDCG zEYPH#4&MlXQlun@M_Yt=?PBy77bnn1;xR`(ndPK>(JdvBJe z$+%W&{ze&?&5QDI)f6?@4)Z|T%yyu=JDt~LN-1~H(u12KcLqoKpUoBuEwi5Wz#jey2x z)~3V^lEwgQ2Qvp(ItgM15rDnXry>JuhkvB(iM3g{KJ_W-!ZG|bum}8?CZT^m!ZacP za|1O;Wdmz_S)euWvooNowb`dm05L1$KQhGp{Qt4(U&eook^xwoI{YSP;o|r-&CJpP zVE4JUG;jcj0REN!6#H+FT)=eWCe8f?zXf0L_YYQ+L@+D-q!VPVbR>S7MkZT&<`?x@+jlX#^yqJ(2;N?+iB{%j zsy!PyH6>Byj>rLXUHiz5NXc!S@&L8oTyNZkgBLRylW^R7}eczRrunlhE386#WHX^vXXhb+jc%d=bKP3ZR~lSdVe_n9fU8 za`e~E1rth5M|b3f{`9Y@j=pf(R8!WXnp9I~B3x9AWg=(Ug0rE^!oKWiszSa_A*DHj z(x`KSzVo5i3fXI6q*PPfigf6Q#&@ZW0oq?PgyPv1HpRk5vISd<5z&8hpaur9{cfXT z1$A3eM}3m{RuMsgOG<=@7E^S4(-}}1_`zZ{Ju>WrL$|Gq{H@(SKmZ3NXe_YK*H2!NCEL>Xco-p?YuF=BPS!JA62XZ@c@ z`xhGj8P)$!s0mAo`~#VP_AwLx-x2hAxU#Fg1Heks+64G7fRzBIW}jeiC8iSk1o!6y zYzBEdV}RX1u%-GBY(JUGjy5)y0IN?Je*%x6pO``Y^ZxN)3^ItQ6EiU~GQ#~E1^=Mi z&dkOEXh+P-{Qt)2KLY=FqfN~8`DJHhCT1gMV*i(5;v{Bf{Fh>8A!g?IFN%wpnd@JQ 
zg^BpH{UgA_@=yDhVk2f@gZqd2?C7)oOR*EP{+F!o{|=0QtoT1q;s3DnU%TZDtUlxT z|0nJ~W5?9~)7F2F{P`2;0;f&K#>PoZ$HKuz%)!L*=|3}v?q`@g7+9Ja30a$30zNxs z5c((39f-M@IsX&&4#afKEbPP#!Ui@H05j9y4#b?yY;X+9p97-ykA%j5bJ>}=KE<6( z)y<4QRooWBwW0pJUF-@+tWLm zXZpvc|J&5C*5>M=B04Xi<9M>ZVi9{j(cz4m>LZAX+7o&76+D2V@0*$^EIEm;RV57N1{RXKLTnRv7(zTU{Crl+p|wf!I*fWpdlI$jM#Jinp)&U3zD zEF(ozmZ9$0TDM7i&ZF=!);2-`z(S_{h-2y>*AUZkHM0#l!hOMh%j$XY-798nl&ZZ7 z=bq1ND>bkIy4-Ps2hHNe!N{evy|n% z6mTGpEe%s%UIJ`t@LQW$JZnAjw9+*@n_56hy?iwtkeLQ05G;(+zY|vge$S4mNslHw zyj3O;h~Y^B7e=CP>R%~VFN_Hub%s0k)UI%e0$SN`Gw3O^8&a(1+&RJ~HS=6jG7LXU zgg3pUBcmLS%2$8&!rTG1^9PYHUpioCNsFc>Pwcp2xcdpf|B=?DsWccR*D3fZ=q1e` zvfQ|6{%U0`Uxm9mi=YPn5w)n-5UFb%C2_DMv)b7g2~HFNiMw;ja>`767AqzJ`0b27 z)lfy}LBZ2|3>YZ?{S;y+b_`MM&IB>AUsXPWvxkxkU11szybP+=s8>Z~qAA ztm}`M7EO6m)1W&4Y3+yhNJuAVKt0f`?d!8-&&p7kC?)|d#rLJoF?Wb$mVeMe))mZ< zy0UNdPeAPBU-4z4`7Eh<4uqd2z$JK45ZTiUuc^J&E8KqYlRuu{;BYV1IcrnGTXzK;!JcY)DGo@z%j%relMH?Ob_)J`Ic^!UA%v| z>j3k-J^F|fj0DQaVECPs@Dq(u`#4qCS>j&!oohgKN3UY0A&6-64#MkgiXFf>Vm+Xw zIK!A#w-kN{efsXMc*#$2hr?I3JW?-f$5l*~9Ma!8W{lLAG9r9$TU(pQaI7U}6t&!oks4vs@MrRdKv73HdU8x0@)g<`u+mXm zHy8KkV5%+Q^omQ8u07H@{2VymqN^{~*PL5J7#3dM_%O^bCL;+Mqo* z_Zr6T@)O#VxhhW`E4a-{5q8K%NXtkJrSM+V!kRYhBqK}Ej-k8w3rdj%jO@v3|Gj-A z{9~(2H(OswNZ|K?pa4KVGCn%)P~7U+(J!1MdSy6yDHYl4+emq2UY@sKs{LCl^<~R1 zt8e4_9To{hIf-aIWkM{C;MWJ4KkQkmnHr)C!# z6gq4rc1_W;4vNO~wkY(HtC*3A^EP1#v;8J)DYA(+wM+3vY{NRACFVnSTjQ?B%q!Mx zuOtRhD48f7AGt0m5Px*-D!d!mC3^;*13fZhng)&cEE72GQ8d=CJAP1t)hMl4eOLD5T_q zrGEh3Hhl~4ka?@#U!9XWdA*-%IA7lNCp<2@Q##u^RX<_FaE5k^OC2B4`Us@%sUw19e)4@o9%E>5WO{@{*drtgr-1bl zmYLq&L#%_1c0*kYi;tpX?}xg&O8fM>N}z;hbV@YW zuU|pw9(Ly<9Y3|_OEb1SGvD)YYn~ch?bCRa`iJ*l)$E)F^&exeslqJ`z-asN%+?1_ z#X(Uu!liZcm}r(ds^6=w)Rc*#_!rih(!Z}{dPNQdez!GjlkvC?pCX@lY;7yeOI0_^ zx_q2hb9E9ox33+F5sQVXEm+oAD!8fUz&nwRtQ!=i)DE!FwMj0ZrUy|=u-^R+gXdm7 znW7txk*SYT)U61pZWL_G)}4`b&ERqV$}HBhHwHPwE|*#_WUkx$HP~mCFOFqL@YUqsScB@% zB<$T%lA;_w{}BT6!QzBVuh3l8NY%B&rLV+yl3!l#snhYqEnZhXBPpHRL& zMaHC)J~h~6L30wjUcsr3>dcPLFd$P&7TV0Ef#{tOL@+D;IaBC;6M=jEwvnp3RK-F# 
zc7$_C7NWUCwr;t2HYNkpoH{j|tf!`wmsz}SQEITjOqel0v zz2DqW(b`=kI^-;jk*yhZz8SgiXG=~kxJv*&&TrMTyww_9H=4fIH<%C(V;ovuyK}=} zlI=e6MftIQyfnD7sbo3k0G6Q`BPsC4DcYY>os^kx8#wzwG@Q6 zT1Z>SxBqqjc@UaG;oM$h^pUgpl8Asmm71=jk=zmU2z-R0zuW*LB0)?I4HRL63}Q{; zfNr=F7i}6-F$vjOafuWs}^6k9~$ zwQ9}hS{%#N%26l_Vg%|cvv?zrcXC#Tp=D<+yOaA;EN# zFq19S1+BcD6sUM?Km%Q+1^m$SbQ1{-6Kth$^KqeHy-r_CS>0*o-?E*&yNFA{&%XBq zyFly71y?#Hf%QCnQ(~@;S+$IY}?(BI%1bBYum#^H;PP-5k5kYe;xCkVe%n*I9_ngutjQ?mL%m zPd+CB7tNPHQ9BjB3BofqiAZyVb0$~)9*@H`NJZ>WM z^XyiE_qq3ePVy3m$3rmhNWkrftH%0KB4O{u%i3HiE!d60<|wNq<6_)mY8iy;FDcS` zq?+8FaccZvnm_SLDZ4wP3>sbCYq6tpf1t!DlJYL+l}bhSJ{b#24}DLmml!ihXkw%1 zohEVX;KGU97FRjyMMWKJu?#rla+yLBqQ zmFcD|apT1B^`_VR1&f1rwO6plaVi`SlZ6Uhao3UZhdP-SXW@wZUTQHE2P>bc5wI~YW zB_!5^M;wUtu5OQ)kz2i2uy+}mnFFF?HbH`avXk;R$z#wFSSqoo`Vfv%uB9IAo13sm7;Qb3g$$_QP)=#*tMoQIBa8#Z9ayZNCd_iYj675`K5sPgT;?7m z@Y0u{@*)~;_jph^Lsl7K)B-;=P^(8rui20&6)e5uEue2A#B@b)P z%a&87I4AQ9z6ow9)IL|-D;1>@22UN&g(dIRaER`%70kp&07*s!mBEUQDRpbTUwZ^G+vUZVo^#L2DqZ7 zmNp*NluAqmj4+#O)_^n{BrQ?1$YpEfYS#MCx1A-OH7{zcj8-1m&C(XX1C<*}R%BMj zR%lm1E0&Yh2aonM_S;-fTzg!1T(kE32egx#S+xYepdZ1Gd^m}T6p)IcWuV8PX`w;T zmU$l-%pEy>y4SiRy2rYOtz*ubzc;-hi=qAGDo*qdvH#dtO%i;saYhb*EJA}uqEBWu zzI;?YSt!@;G+1jv)Yoshy<5Oykivq(Ku5q$7Z6%5ebeMAY|nX1DZ29Y_$I$nK^HXf zme7z~xz)Kl`iOh!TYF^L-X@|KfG_2@Q1D~Ohw8fa_2IkfF-5I$9K;sHDNP@^7#ksrhZxTY}%WLPCM{jmzcqLp5+Er_S zt%)XL%>#^sgG2iKIGaO1SIbQ$#uN7ZU^Pw%CXy^vj`WN$*%4?ytuyU%&!UpxM5nyZ z$%A73c*y1F%7`@56#hh4k}}E0FlhP&u}a#v;qsBQR+L$@cV~OY&Txkm)X?O@6<@vs zy%+7615K8iZWmWH-rTGXn!_ZrGK|Ze>*9^WjZIz|h0&krX5Gw^->s8Iryx5KR#TZ| z`g&}k@sS=gR4G1&9*PUPfn-a8P%1!AMy@H@E|*-P+-SXx#2P5q-FTna^}IP~*KwHi zOd4xm@p^m5V{!hxJub{&Wp{mT%I_*Je>h*wivOFK!sR@0{jT)ubpQ}0kd%FMLS_}! z1fyR8O><)+1{Juh5=AkBf@`6?3hTFYuer$oUb^X^OSvSMF!EhhWXze6y6Ftx9^v? 
zeudgVU4f4yqPD!W!c{$87JDh>6$oU{TSsYO`?W$K|MDWQRqsQT)t~V7s=Jr{l4l(% zc}5uSY%Dy86@pdN(6PA*A51o{40lr*>6!12_J_0Wy?!Y$A>k)Sf6zj%ND?2{tJWBq zi>RHT=L1-&14nmH@{e|td@miZU#2>ieBf^e%9quX_TK<{OJ(*>=Sw$3iF79o{4N{e z4c8RjBkox=7NQ>47Ra4%s*{9+%+QuvA7$e_7bT_o%Vi7F)(_B=44&YJ!u!-g!~~dN z*5Q}mQ?|*pL05E0Cl|)~BOF6?25IHCQQy;r`Y$7r0(P*ZSSlp62tROrh* zNb$a7S8{&BJS&NIs^jIL@iT|ztB@D6KGBBe(+rw?+imcMK-ekGr5B_NSuo66*&qap zQ`m2~8rDE-B=ayY*4SnpBhu4S+_`aFhbZWlr|=ew(^}Z!_+0d~dF*Fz*@GfaB0Be- zu$w_?_M*B5sS^G@20Hq5r25hlA5TD)UfVt00I^9a4ixRRGJFK%}HGbe7-H9w}O4lr-E>!kI!Y8~sXa%$n z(%I(lZTr#M=1hl>(o>P}Bi6t=<8K!a%JE<2Q%ZR}=J5~Z4pyQKC6On9rJRA$p{&w1 z#Ez%~JzVBeBakhCAeYM(fvng2Zj7`Bv*l*{<$5c$B!*_wrDguR zP}F<+75nSmOn^HYs@}%^K(l<5mB7FO?fV`B)<&%dotN={z6Id6>^Uwb#^chJkJ zct5{ru}!rl8to8Py>nYha6P&@$_U4!>P*dwX`B0d!GirVp6gbS*xuMIs38F3)lg%8M!Q|W~(R`5NM7D0WNH_>9o3hwCRHfwB!bXQ)i z=2s%eH+oDC_*z<;Sn=3jr2fN@+maN}F7!m6f@Lqz*J`5MDP=3Z(n((F7X;`RAxO2* zDS1{o5+^j)oXan{OC9;@dHV3n2OjHaXgC7q_Q$);EQfC(08)Yy>yj>Vhm4b;yoY5^XBCO9`2h5J}SutIgg-NT|K`i9m811l4ja9QFg zR10@yUxM>xsBY(Gla1;7YfR<&%hHef_$e@^jJaF!*uBW+)oVBc_^h{95(Z^^=tYNV z2k-;`LQxdl@#b7WRSlKR*T%{n0p2OCc9oZ@-fP+v@#@2@ovxBlijG{Z*NK|A?yCt< zqY5}>oj+nanP-;UIy0y4_e`o+ffM8W&$G>%hYp9%62#Gmp$(32k1WlaZ232^d|aa; zF4vMn`58Rz&FpsPbZRd5YH1kTX1y%y|Ip;dSjWyo<%vsCNWl5w3&lailkzZDu%Sv^Hkbx z?UX0O?$Ue->*he^*yc*lKdcHkxkyZaaYUW|B8XCx2>Y$Hy17?kHZHKpCz#|`_fzMpb3PQ z0~uE8fve|@GcOnG)t*s<5Gza>lZi=x2%ew++%NB)mDXh#AdfGaPd845^NHZ%i8|{I z`zD-DN^qh3*fk*CoFMBGg;=_&>=G?yR$-!CioXzwhXK&6((+d{z1isC239(OhA#L5 z)mDbPLr_SuFY>FlJol7HH?)NAFOeZTPO^NT37Dpj$}VAhuQ_C zL>XM!&`%9m7cN&5`x&apliml}_nzYYN04{G1mODIMt( zsw9JwP^;d2XF^`!@vxJ-+E%trzCW3Vu`s0H1wu8}GzB@36`Q|pfBj~CsE?&?G@iW%h-$rk0chq_dbZ>m2VZ-j(oI2)E`f> z7`@{H_2N@87#XOpW23PvQ&?8+`vhB7YASZi6s>OT8k?yRAYwa7B{viaw;xv{^y@I2 zmD$LFPqFBPG78b8K&2H*Qp|WL=Q%_Td}H*Y|B z^wPVKY{CypZ%!rYbS#K^ou5OPdcd|(_Dzd$ZR=Lct<<#c);lCdF*Wq3W#sFV;oA!b zg+xth6iMumM(0ESk z>OyFxrxcIo_^b{n{MA2%pU*~x_LMf3-C`ljxpBu{PcV-fvZr`~R_^o$Of?{99)gjw(V zdkXvQ1=izM)Z{hUSyhtr7wa0Lijd7zPNYzgA$q(`KR8_*mL4huMhZj+08;cL0r%LM 
zPesde{g$b~bWo#rxBlBLzITE#_H=yP*Se}&)%6aOMQ$`r?jc`D_v1yLVB%yFcy`O~ z=!ZaADz3TGnz}E-gy8|Ck?cC^2%+>Sph%C?o!$1>535;Mhhpol$YwMNM*@@;bXVIh&Eblo^|OP=BTxCiwW)t-en7 zLG*|P?#<#f=H@D_O6xX{nqNa4CYitTlD3U+M&Egedrmze;<5^n5t#`qIq{2+WepGM zRnO@x!f0Xf6qP?AYDRC~$&AujZ_nuIu}mF{x@1{+*)Csza6qsMlZR=4t%GR|yjmQY+{!J0daC(W~zBQSO2iV7a{hVO_W8 zX6|6&{X6+--~A3{*NU?*Wd=ye2C2)sxQKH)3sJ!rp;_y|hF46!s>5A}dr2`jeY&!(nfETB!HPEUypBWjCC zh9e6+1JKzT0Ehm6k*F=Pm=WGZ;F=ZQS*7ri#kaX3Q{?RUBooROBxyXe^mScc+xgIv zCOr2zH@s5HqMYBK@}N2Dc!}Flm1>n~4x|aQ=9<;GZW^5lml?(n+b+vq)IgAL5)A`} z^z41SB>R|0$TnH!KhOhaHvGsi%z_f9Fup}b|B^&wGaH4^lC+4DZjU0m@Oa=1a4J)= zD~P()I{220V*383z>>Bli$WOc$}2XugA$vP!iAemy}wju$hG|^EM&Mhfs=jJc=`-( z8~RzE)%W{7kl#U|iJ5HrkG)Umfv(;#R$CFpH&WG^fk>ka`}~bVmpUUa_gjtkV&`Hm ze0RuFL%6~9U`YOQ9R7-En`J2e@_-xmpCe2=YWRjxTB^&c3bs&U1pSBPvOIsIx6) zHL^koCSo(SO2eJ0j?emQ;uM)^r}kTO6Ksi(n=G$J2AIFyjqfP7OH~E=6GeeLgqDz6 zW_R!I4zAqx%_rKA5Ac|WO=XbbQg}1^c}c>e%*Apy0#|d4stuLKuhn?Ox5x-ItQyhk z@j8IfFx}YvEvcey$WCE>|Fz6-kWy>=l#h(M-oF;JjvSE;IqM-dbBT!;jqe`oSqM5w zSzfX0mvK`Jj{$55UqLPg9tUY)?K9fQCBAF{!lb~*z#gzZs;=^yFqZvkmLtO=W_t^k zx5gN(QSTLKJ$=4lUN@)Z$veC&w#f^A`v=h8z9rNXsO*@S_-7oc<2yao#FD;WQ!m$p zD`}m$5SQ+dn*_Fp^RjlA7#82mJ!bO(wa1RFd$jxTU$ZbYzf&LrPDZVTSARuJvv2{* zb_|mH9B9a(HyAmrEl)}3oa*B=0c0Q?jMRz|lwu=fNhJ?Wk2w!vN{sMA`vDf&Y@H4r zO#{9sI3q7-mSiT!&0x-b5iP|-6bQPQ88|=2|LSsCT?cd|8Z8kgvBo^Gff8+m`AyPv zIKgQe&`a1ys$;Y#SU5-(w5*64qs8&+Af=nqBfI9wSh9o;=(r|$ z(z94-U*EoTH=ah-k0g*9A8^VAh5rUeC^ zhkN~Ro@U7z4%@x^BDcSs0_8PX3xfRR{XTi`SRSc;WSCdaPXW!CUi1zuGu-YJ(t;mm zW;^odO3z(~J@>Y!u1w8{jqLZO&Takv$&05!4Z7F){A(L?eH=n4AS0J18(#%Y0rLSwJr+Uux1Z{e~vy#)ou; zG~Re!`c}_GN8nXf^{ER?mEcTMfboHUa6PYHq2 zq`smK=&-Dh%*64@j-od$3y&=QfEf?zk;AbgPKNT+Hg#gWy-{QzD5v`nXLN_ce)E`e zOfao3Pbg{?g}$%uMMcorGt_*WM~Q@sFO@S8@f^=sM_^KjUG=0FBQoz!0fD!N8R#8@ z+GaFmQEc#d>i*>{1u3C)tMke&kUCAGM}Tvqt3|hEmVz8oWx`Sf7UX(_D#SXhYEpjX zRQkeU<%E^F71_Xjv&5qY&3o5K)6L;(v>_+=HIKlGq3(m4;XtT|G+R>&H^rf7pK{McJ*3pw#d z13Lc*f4&HRwW0uN<{xfvGC)}Ef4@*_EugaQ(b%AbYk%CjWJR_NY(3lBG5lMBlyv{V 
z@Pd1CgW{w>NcsVu`EBQDixZP)i0u!%{ZC<;a0oHG$=%Ep2q?r|l#q&i;?}nr9tf;~ zuY=~3=0w4(seq`HIbeYJ4{Y9z{)-RO#5w%v{PBgL);S-F2a0$3+d{o{59fs{MDF;k!3sNsS9l|mS-5j_BeE#F)DLD}Rzw9fp zO*6k2rb+jE*ylA3S_ZlkNfj|NyY>|2>L0Yvj6}X^s3VWsXC6P=%+iBVfSsv-;Y~bh z7DBs~iCr*DTtcFeOYi@&uK~}iDB?vBpF&df4NYWMoLX3)`ta}@4grCdrZ_a<+Z$xd zCWTe0P1gqOmj`HfBMc9kyy3A2*uj`C@qUsUgQRg<&3s}_#J+hx#d8>Dd1LSiU$>4n zkJMZuG#JGQI=YPH*_hJi+rbTU{fWMq!f#iaH*!1IUP!q4=d@*3giOy+ufElcpX;pJ4ir*UYF!J@8FSPN104k7J)o> z~OcFG_)uKw6rBlwJ&Xcahys<1TU0Bo?$0u;cX z1y&DHal@iMi(Cj+$e8F9q=r8qDoZzub+>^(s=tpQDYR{KzOi${gp!Pw3*TW^X%0@0 z+t-anl4!+%&wy{(+6|{H!S(86jHD=>(70Q|P9=1J!>o6%HGUnWzs<@$NP? zpw}tq%J@6)(NRRkfhfJT`w56wyUaHa#-6WqCTtrGHxawzXOfkENbYpl3uzdu>{D0oq8yav5K!GD(nr3;wdg_%*8AZnKBx%rBZ@AH^=vzLG~ zV2(`=hIzf=)?T<*CplX?T9`5Y0XFbd?;e&AmL< zMwNL`W zcRknJ;XJ8O?=NX%v>06y;nAz?pj*jXh#V=o>0@O0EAhTjcO$Lgm>39bFV(0}BQNmw zN0M@i6c_umoL>ylV~ieKoL)g5Xf6Jb&=&OhdwrGca|MeYSM8qpcm-XJmt6{#+3U`= zR@kpts-QB#Sux z^AS@S@ z3mLAyn=dPh8SQ~EUbW--hfbcNzTXKM73v5eK7oo=4-c+BBl73vWVT1ZPa}mlnhtmK zPJ+PAO+JP`3{QtP804PTrDCHKn*uv@3XFp32amDhoz|%fq*Zy*99u^9d-`McZ7Ig$ z3ICbMpEij(Q>`~tJ`~M11b#o&ag#0!T@Afr!%uAE!Q+z@7zGHg@ zs&@t9o+;ZTjsLhhFbP~Rm^+a#I8+YhhriPFOH^ZrbG`vH_+cRu){NbVt+1by=!SlB zx#pu3QKVIu0?~x=`#|giH(VRkS4wc!%g% z(68#b<~nDuytLo3#hK$W}G#hmOjBcl#qW!<{2j{0#$vC{(k zLx<(f4nuw39OPY_ zo!<8Vwz+I82&xGO9P=*=K|SIDNryYPKW&&GJuPg?AtW zYw}U?8bhw_ZK&ErvOs>(k6pquA)sfWHh=NaPZaTPl>G6qa3OEzcWz&CS#B7MmJ2@o zrF@B-7LKDyA!w55rmU4dCk5k>733$furtZ90Cj5jrduMXn<{o{RB_3EdqYFF>>-fOSbU9~^EHa&s(G%fih>3DB$!}44G_djNf z@-!zst!kpmZbj-ktfod(fmaB5&yjY0g`!4kCwvxy(*hD+)lJpoGfOOU<|(7HX|0# zCpnN20B>I-Urf;Z=oow%N~_&*S%-Iz9=`Rtz*Vx9VoLd^v>(ZSxW*Lh9Fl8(BuMDF z_T^dqmBMr+oxpSmdbA25DQLlY(kxacnlj>+H=0kWFv~bc5a(D@cEC0c!B8j;IZZs0 z=gOlY9k^W+(gL9~lI6o*EXusAfaD!JFEySRQ5G7aJXoiAQN18q2$1g2Vzk=sHZw@U zD{WqZXZgJ@8zi|P;&4`&=2PP&Pt6r)rW0>6ssr3#vEaxyf!QAufcNXO`dR$8pbPA( z{I2|3YHdt5|L3H&MZM6GImQr!iO~u;Z=p{Wow?{u4fLD2QyzPzao={#q(lTC;$Afa zJ;{Z{%i{DcE&Jt7wpk{YlMM*yajB)qLUO|y*F?8u+fgpgWYLDIDEAvjf{Ce~zy-|M 
zmK2`H?XBbKZWDt)Ed&v5tKW1q!ym_^Vun`xj`Wi`W*^wf4UB}-DL>)1io|0gl8*pwDG)9H zWgp5{&z9Ww_bidTOI571xfw_#dd$(>?Wq_mjHXdF>%|$cA&)=R?SbgUdFvBzdx>>} z9K+@!1I-$57%CAeyy{qawB~8FYcKW>%Wo@`=-3472TQ8-v(a|M5#k;;TEN;j1{0UX zPHfi&2%X|Z?TSyuC2I%O@ZRliPghmybG3;a7h%paaqq>-qE7g>{|op2!FAE(`fz z=?cU7EG>a}N@3*tYr=_@wbvHmu0Ml@+T0giCdWd)iTl8`tGQ@z4b^Gsv8G8IJl8-D z9ErP88qCAPp`Kpd$Q=dos)n#CBG4Js;{+D4YW@_wLb_jMgRu=?QEN!g>u1zM&co@= z^5X$XC!F+pH|JVA0!B6O)XKOnWN-kA97+%H43iR4@>a`}Q4ho^8!Y-X|CI0C^6Wu(Dw;qf zRWA?uM?FOT_M1MW$1KvKOmE}23)1{#Kk(V;nn}@huO9M+YQsNS^z>}{GZw@eD0)^F z3}9|3Cu%cd08R^WHIyB17IJ}i^Mz}VNUacbjY#sQJ^ckiSp`u;idW4T z=f-G?(gIjfN9v z5>L&B(^ks87iAP7ibbA5isZ8lFp6jqx_UdCNAI5=rtr|^RXTOezl%?p<+y3SeSBr# z)SnsQW$C?4;}oZwo$HBdv6A~*IBjY6thK^i7t`35#d6oQbN#Z%+dt*{%ocQUa`_XH z`OT$nJnYH&4qXj33{29w?0MaJ5mA}qg&=(qWe`ECTl&L3L#iIYXtPk&{&@Dt15Gy< z5;;PO`4lfIyh7r= zZwR;e>hdT!nvE~J8ACyY#ODl{;_{X%pcXwFWg9b9kh$}qUDzz|YFOYdZm2Vnze0Ze zBz#RZXmoKF!K_`e=IhFGYVj#WEI0 z*rCBh26(O$W$~7NOsgQF)L78HbbZ7Ubvjd1Q91l@Atx7mCYF%4vg#|3igO(C16U2B ztG&1P-qAu`$@#sAe$e zNLdSF;dKkXk0rh>r?c^!maYo?U9!B|kq_oiM24^jfEU9~XLSg5(S z^>Q};CVV5FERr@YNjv3iD*c?6JOjXNt8_{3YVcsh^<|v0<5ZZO<`S`3G?4YT;U+C) zkuIi3({|C*<9Lh9UvuX<^sYBXj|Bp*TLZr!+*?b=$MM+p$4WZh7%s8z#lb&dM=iE5V+BSF^*zAnd`etOtvl3LBiReBacjS5fV#nmG-vnI^6m};nmKvdT zLX)DPP64o^(Tf0C`RIHc3q-_mGN8qO%ZZJLG(kR$#L!$o%M6oSUDj4g;ihyoaM^k4 z3fLi*YMrTM6$eW4-I>ai$RvxN%%bnjj8_G;me8|$5;l^YMvP_yV$A>~Kb2hMw+g&m zWoV1yGMW4+{jf^t9Rysq2D&cgSCQ(6XPn;r<(Rge+D7F6PoYv!s{9!D$93el>pa}x5YydJus2| z#`i<~BYIt+%>iwirgBiD{E^N^ko&Z3VJ4eiMlC)F(N`$Cbf}N{^iH!vw1Rqb00Z-~ z4R@wlJNLx-Uh))}5^-rnuA2l(G8gnre9usNIfhYP3CK{QsIaQ2Y{>NSG@Ob7-9y{< zfR=)$N{BVvV1Te&p5O__z*-yh8(mqsnJ=R~8f%}_lJdqL(;zy3_TnT{j=arKcU40d zYEqejzWG5MGPH{5L&Tpzlc6r7MBCFgn~Zh(g)3S>%Mn7M-c=66>0x2ZPQvWeOLsF> zTG`PaT2HaGFhiYY_|BQUOB1Ry>*2GH4;-_z{NS&!1;~>AL5jwM#!{Zlh+A-{@BE*x z7s*e`W}g^n`4XRA@)}cuk6ZxB(WsqVDd8+mmR&)#!TttbV$5h`-4s?cyZY?E(n4ZV=PbCDUo2mujUE z(7GRla}V-%ODli43wxzgerO_Xo9&)C*%Ju88OED|hh3zZ*><|jc}jm+L_FPih29*H z+II!KcxH2#(Wh{3WI8_c3NVjUk;?th$No6*! 
zlw<}%ny(cp(%c;5*}A6kv6lz1(%4+8-L(5#`LD}Jqii7LqRe({NMX&{@$W`AZpzE@v^QjC4zkd7ebL}e% z@JF9qH2^kgClIP=4kc9rB!)QSuW>}TY*Q>*7(|k)SnhTy%N+1XS2%=y&J0LMzm28l zzgg6&<%S-F57y~!l~d0T&|RTC4u$HP8JOp^h$4jpYUPk3VO$Cs!~jhWhu*q2HQ_=2 z4xN$Iw z5sL%pT#*zA`PTrC?vY6Nd^rY-81&RVC6Ts)5tY%i#u4^aeu*f>{RRnW+0`fez;pcA z%`v#$(R5?g4!^$jvE)69aU1!%bHegO&rXt?$~1H!WDzZderAS>Wc0F70{lNQhzu=1 z6Byk(&9)` zWU6;R5XIBPpdl3ynPY!n787q!ZM+M;xgi9{?g0?WL(ykgM382GOE?kxP-qHS5DFme z_`>sO1Oz@P_BjZdg$P1ikbQ+8pkMqF41jvqPO~#OoWkDe`2kyOhu#1M*+9k5PdJjJ zt>>!@(Gbq(?*f#g?W+vp>*9I6zTPKO$1K)l-WQgt#;gy4WD-noE&yt#j3n`+>zz>v zLG_RJq7P)(jc>m>O0Pr_*|G#rTgsAu(%=N8k1Z=I0q?P^?^@I7Z#Q#eD_*?PsD*Y?CAHed( z4)}kC_G>3>hON`X2VT7+w{7T9M-hvz@zGOjLbe{mNxyz;VulSGVu^B${o;OLJEMKE z&DpasvS-Z{x-xmPhxC~bqK!|d>e77B=Lwrnx||F&TA>Z~GsOj)4|QkSESb;>Nwe{q zm8*(tvnQ_A&=o%x64u+eCgv^EG@cvGx08v->2)UNubP<>ZoRWUN0f&a&%@eS#64&p z8&kWPRT)SWvSr1y;m-Ed>v;%$xJ?*N#h9pRoE78NH=ODKd%|IRGQ9WK;XbDVLwDPT zKlN1Be+I03NPgd}5&;E=a0-nJ24Ml0Kp~#tl<*YGnQr_}JmV=MET%`2po8z1TkPZu z5|pF>3<13=1O)+|B#%Qpb8y2=Oz#v@=OMAiyawwjVYc$iPlX1P6xdQ7AT<;g`gP1e z*4JG4Izs-`q_^*`{zNKR1-gw|;r77NOR08n^Bp3>sk(kXXk>(yLk9ki`f3Kg^(X+_ zE1p3TJsG$kYJ)Q(9Qy8no^}pgB$ohOWfjfqj9I}IWm?BJ`~#opsN{j=EB)92ryvXY zm3@{`0bTSh@uPa*88@A;L>t)WNSmTlH7;YY+Li>%abgbz%J5 z<)13HFYL%y6WiZ2{A)yQe5Nn@2qQl0-!lWgI{qDpg@yG??n?*Fe}?);=1cZ(llEUS z$5;8=1;G5b{B32X|EGMlGk?KP82{0Y8G!#){!!y=pnucrOXzRK-%T6;f3i&e-j)CU zK>sf+lmGe*|BYnwMFRN?5cwC$gyo9`@fxee`t&|A94O{!dsF0PB~d`VXwhKPUf3t$(;Ckbu9G zrT^sX{#WeMzjI9h04BCCkNYpz$M|IqUpoB1oL@$VyB@M~6R*!zWo5a?cCyvx$vF&h zjEn$&nn5_9Q9Lt!tnWE3e1fo2NSGDlcUs>NMAjZ-0DO{f-f_$8VQ4-f2L~e)`RT|= zI=>LfV)t|WnR4?(+vA4g^bPwFYHk2LmN~)qSB*1$dI6GFS3S^L&{;<~yeve(2 zRp**^DC7!_SzRHU#VXR`eNU35w0l2q47l51wY`jFEtD7;4&VaBWM{MfeBsg;I&fSO z_1@$a0~ar9J;zTq0P6KrB%`GsI$wbBiAp}#?4Y6Q{LTK>oX#c5QzQ(KSUZ^}7Mq%- zzYq_CkA3?w#wE)!3Opb>6cf21cq8T;K_XZZVu@%(<4xX3e-G0du_1{oc~R_IkTvw| zybt35%luR?CXRv;bv7J#vX8YRITy1gKy5@)8`rf@iHWZ`BLDX8%dsJPNk%uTsd)59 z1+Nu&x}4x$tb2i~^tK?N0%cipc%jn>*xhfe)KZ~doMn{6QjO3Va1Tm>53_H7Lw%Z% 
z6Fk^UEK)_{8aoM~?(nMwA3++(VOb)Fj?o!P4~nth2dfLIAdroBEeVcFh!QVWY%Y*N zOm!1SI0dA82~r)?KNOfrYG=J9ND+dMXqwS|a#G76x{Ad|tS$7Hs)lBt7h`(Ib`jpk zU7mDs-h_AH9y9HrW~zR}&!wopHzo2;Iv#w-NQd6Gvpkz7^S z0iFY{v=*{Pi?J?%;dY4W;DhyJ`AIlDn!F&&bS%e_Xp{44#KIEUNybHdj=+Ol`h;qrZ8Ti^&Q)LR4H|Gq~7g{$OG|fCcV}Q zeg~mS$ort>Aa6*v1XV$MB1M?JPrH5gHRs;@!pFkvZyfL}UA|Mk+4Wu&*|_9UaJLt- zea9QsDPA`pU5;3t$mR_()!EmRtK+jf?-v>k@riD&rDT`^&7->KoOzsc9v9xN58RKa z8rjvO^Q&`kenf{riClFzB{L+B0HQwCJV|PJ&k*iGgk2$qz^oGzZGy-o7l~=HYKh_l zO!qV{*$<6YL3Nww=&86VUaWx>;|7e2-`Y@Y<`Cr9Kir|cy1~8wz>>dYmt5OBfOW#> zGH8T&sLpd}$Og||-?2Jzboy5nJkCwt5xk&#!_x3!rF^#_QR8Jvlh&VKb!duvPrB&c z=6_(}9E++|EOD0$fr7{_rWZ;fmqN7R!?Hn4f*j{xh3T8`^Xvof6W_7iLH3{_HwlLt zAldcbg|&}<>T)V{;{L#m*GYnvyplXByk(3hawyWGoQ`!J`>AYHsDM#gp~9o|VVo+; zWDa403wovdR!cvm$KsjxJYal>3~MSM%Qkxh@}dtA8SECT)XUMue&aelG}WISw@r3N zV2s-o(#1!)bFPyz`Cd@smT_K`UAwn?wQ9Uiv?kQQ3%dh$f`dvl&4*cON>_(w_OmW{ zvBuu2obNcBitt(|iwz`lt&h(gKEqF_${Fw2E0P08niGxDCM?|Pnq2RIkFU*+r zLG%67aOo#JRx{@32rok}!nHw7f^3cvk-OvieW(}Y=TwQ(EBs9mv3!acM$6tSR|ML7 zOopi>eXbw^{;K$GBF_@&P+byTT3UkQQQ3X06MVKZ-hvZx!=|T@r)DMPGENb65scMP zd)9C*KZIM7mVNk#=Pg_MIp&c#9rQ(_Q6a|24Y1N?g2kk@v^FMZM+;haw8syr_StN6 zDiI;@%de#%JGMB_0b7ywBllCE(b)`<*reNY@RP-z^tdwOP=)t%^K-Z-BqwkVvT~m6 z?mw{KRqg77P_+cN$hPC~NfhpZ6x-9D?5HbROw zfKIIk-FO0O?J*s8S@%gCA{a4+!mD~;Y20MEkGWDe#G3NKuL>^M7-1CJ1Kw~hTxPb; zNjqeR<5>hOa6~eBh;;qdDC%`K@zKEM?A?~4jkr>SSe4>pozOh=@JfS>aLC`lk26KJ z3wS3BWq)^(M#zDW5-!edlM`)5?38XkjFi7Y>!^x9fFHdyUrlxx(Oaage-tccHl1UPh?C zRc@+o9(}oqNHQoK2>R@Py`csmQHyK@t|wC~RMHxxCyU{Oql0cdDv-+Tt+uvt2<}w^X+yt}Tzb zpvbfNILEEw0uFd@7M8<)0mSOlMX!Tk&1Gx@*TklpwfSNtYS2#5NzQDk#|+Cl4A!Dt zlg)B6r^s)!eO6n=n$N4psnV0RPO{!$KCH4y&ufbovm+x7i<+%unpT9`1yD~ysx0jv>D5J#q?T1Dk+410e!Epo=Et$C)~CKj%vC&ZY-wWa1zxILZHM6B zVMDTCE|*1%oN!`qwRz!v;%-NURV?!rX|0+`5Uoi_GK3R=jujWK9UQy!Jevb?Dj0dD zQG|hRVqhM_@fSBdCOlxat@|mc9 zm^)(U=V#40TeG-@O~E&!$KKPV7Nnp4Me&pAdY5D+RnHRX_L}Zi!qDl5p2Y39$fd^C zG_c}j7!B$rW}}a7PY&d95?1E|ncMb>-R)-Q#=J_|yz;CQjpqCJ#gA$WQ%$0UXukpd 
zdqdJH_5Al!EgYlOG-l^(v(2<3Wa*>O{&W4{s9Kb7d6S3)4Zy38n(S%Q&_|iCxa|2`kz{6(CAFL;WMO2e`-OB@X>rWn)k-} zH{5{FN%062eyA*RJteNF>TjD_{-|{!8rT9KH+Tw$eM-Sh3jA|>i9g^!_=-r-m8o~q z`~zao%Z)opMSxHjA?}%SqV*!c$QDF<=i=ZH0kOFk)S9s1w~7FyDZIjhfY`N|fP|g( z_vzE1!eK7~347gDC2BCQDKrR*>=0u-0u*CA^XreY!hpN3w3yTBAEl+eH<$#CjHq0i z!!F&dKQL+%WBASHbB2GE3UZ!a_T0z{RQ0s>e&QP)Qwp5I6(>_=6B(@_sYM|%?t%%3 zJdo{EE;u{kBoPHf^tKOm2#vISr*_+F$^X-My>+cAwT1-ov*s9i?k8M+P=!;C#-=3k zEPO~&2pEZeI5mcpTP!L(N|aPc0Af`=b0%}Ywy8ys^HPk4Rr{3TVmhzbZRfmOzUb}61>w)kgUR4?i%f%i!COAAUO=dp}&aa;eh zxa;%b{;#n`aea@22Ohj5$j*9G)7r5St1IYkIhwl%C9r_Jjl4_c&4AQ{^}vfcnKPN! z^g#z>J(Cz^4-e6@`oO%myj{v*_%pr7M!e8DGtpDKb_;2NDVrthPH>v>eD~=<74-^k zJHIL)c?n794BCU@a1WTY;odV z4<6E9t9u3E$tcue*u>Rs$U)}~7m9e&5zLqaS91=by1M!n6}%E8xK5VioNbAX!v{vS`lZoPbYTUSku`rYS z35}FSE=%vFS!zjC4k@^&w^8cDI`(=tONJ?Jj;HX)Lb4cQoby>15$h>f4w>kq6$$H! zqP-UDc2$LF(C_rN6ruZU(XFVpb`RjyMX$%uSGk92Y5FFQ#A&j=??{o8%r>&j&dFq$ zx#6%e6$?9?mlis~n=!jF7bBxpKPsu zb8GWj^by3xa>y{&8Wl!8UT(JgvE2H&4pVX~Cn_hRh;QXf_!#59EMLW=s3)Mzx^&Gf zzHCZ{^2}qAPSm062A3X0%1&uBRBW($3j0b7(LE~O%eGq#M|XO>WN0#W`y8zwRheh6 zT}_CE{16!`lnH~qivtF<%U0`FRL;4~6ekuh;zYflYL*#iv=hw0Vl#iV(j58SwoX;|~rSZB-V>H#|R*TqBI zK*@-ZOtKCewGCZE^D3-AoHfvLtugzVL(>6Q_N_xyf}<8Y%XdPuxF6V1w0Xl~E&GJA zbI|PP3#mhjm#1?gBhXU`UW2abvhq`Mi`rI$>`+D^P_ZJmUE)mf{t0D0!YXYUiT-^) z*@!)1e09+Iw6AaSduWs0WpzBJAG;8OVAD02nHbBTABT@N>Ce4$(nG30z2=`%w@}iZ z`)}MGZSH?kOyKf5^)(dPxLT(-ytOG;Z=_uBYFDEdcY|oN3aSqK<(J*@ItNpZB-Ow- zgy2FU1wEn$dP&w>+u!{8o0{QpEuopG2HUYHom_)2x8B4b*^rt}+bl+-vJ>P>mCQmbg~_JwmD-c`sJS zpq8q-Y%E$EoES(U5dy(qh()FJQs-dIHlMg%V;^>Eh%rXbBDR{xmESSWqpw>ZKPEe& zF%p)LJ7!>kX3`MqV8{*;i{EG&=6{!#Hz`% z%CQi=l3g&p5N$Q%PfkBXv7P z%$H>2o9A_Piz+Y%wg4F>+)R7KV}572bRMnuY$Q z%OeaE?Ca_2>$z8eX1!!efgo9wfD!^diJrku{3%;Sw_#!2Ld(^qC*o$(8}`ey|53Q9 zzsH^FMkPo(>Ego|8y0zN@FVCCVbd?`=rc@o$9Phn+GoLS)$b9BUF8q zDT%2`&g|ivQz^lsi5)4OB;nS?`J{Wrq?E+P$Rs2j(bnhjk(HFJld8xQ4B85TSmKGL z^oOvp$tq|63bSGM`T_I$0hG-rvsv$@^DNLLgT|4q7{hfFLw1(5I{EZVUle4F*n&|~ z`ay2vgt?pDm3{noLNQSgGVmj?K)n_(7Y`$TFurF%wgvX31sBv8y(#mz5)UW=kbqzg 
z=@fue#;t`#qCnlm)g0B@hRxaU=grrb`nov7o1%l-At7KlG9Y^@wi%|ADuT$#h8ci9 zuum`Ak0D4%C`dYurW*e_qJ&?!^GlWHn6oFP19;n>wfQ<<6F*&zZOHn6kRoNVF&}3- zt998OY~Im5oDZ7am65=7*h@ByjelO$9()8XzD7nhx@xDYqcklnUg$7iH{Yi|&rZnu zzhM5nyMshoy2Dv6&!M#CC^5L9g7H7QapFCb2B`GB{<1_tz@3-~Jh2B<#4o`5_w4us zStUN`d@~ea#QxSyWz>aBr#wEuXkNe}Yc0QA9wtSS1XrU-fXa*>5i;K5lE3LJtpXn> zkP?c}b%u&fgdl05f0jDZmo#NMPZu!S5`NN&Wf7EV9v~aEm~#Rwo5E4SBt0b(auSw; zSMd?ZAoW(spyhSH(W*;3Y^eIH|4OtnBZP~`%hyfe=q_vNv@C*(F~u~4O9l%K9nE6o zO+J=tpzEP)D=dQFptEy}lYGzTfM2)DU&7`zl~vc_QFD z`ubp75j_!rQ*qZ|ErAAp z{#%HC4l?pfXBU3+=bY6yyz8jVYD|$MKk;@aS%;CyNLSiycJDx^n%legA@M!3y@u

? za15HfjcnhxNy)tRPdxT{67{w=U5+P5oV86jC)xnw>sYCmC!pQ$nP(M?aAlh3SlXP0 zIBQU+P!CZdgS+|ut0)ecEQFrL-q{^=H$@Lg7bPweE;7%vHq%ZfFU}E|-IsP`nIFmT zSI?llco$RW8ud;f@^#y%EXDmybF=$;Q8B^0zr1%Mh-n^EY~)msZQw6QX=IW^FHaZ~ z%#+%X2#Zh_2xyf&>xTVxtkEj7;IB5n5=a{~EZ|}7{!)Xqe-6~oHY>RRaFfGP22yc#IJAsV z+vCu=>bBfL??Zg1vchNlQMw&EhrP+G)Lx`zb$uPQK-xgIWa}-2be(a^76ebkek+9i z@Cb%AD4c9mXmNy4W3G7`YwZV;<>V(BNi);u4-D@Cz#9~8o*#5I5Q48x1C*X8Z-=(8 z(xkodGV*X+711tJ=6)LF*0^ohT3C>GmN|{*d$M{(!OQo#&9gbJZ~C7&OD4gf+YueL z!$ap+7r?>$U(R?chcyUAwB=ob)shga4BO&avKsUGWk;8E64zHzmz;<%+gPgKCY);^`PtP7JXIEr(7zh;kDytw6W zJ6}+~dhaqCj33}=czX3fSKq?L^u!#bj=YG3#GXJP_~Qii9SYUMm7VFxd`KPpV|`J^ z0-1h;M$tZvzuSwoyO~+u`$M5_tI_|y8K?Ys65ZQZsny#LJIKG)aa4{j`+R%z(f?ul zsTL8a5fMoL9L{6$tG;(0KII_eA{D2F#>vqQY2C=P|6b(SaCvry^@cUcwqyHgdUWgS z_mP{G>(?gdZ`~w)vR4YJ6c`{pM|dP*9_Z-oR%|Nz+t*Z=&)Mz)!kE~-b$L&cW9;Hh zjE&PgNyc@~2VPziM(MVtQfRnPXxO>Hi`!|79}*j?IFCQ@CLbUoapBdUW%3M7a`>9) zH>%EMf2j1GXC-wAX~w6=29ca#B&SEuK_dA#opMHL3lvlqWC(X;H)j0p#R%uX!_&MgdR*RE)KU*fm%u z5=}j|L9keSQh{ZTan3DPGO=UQhyxXDyH&jg#hL`vccJJ*l58?SV6c!>GgnM{vKF2N zOs#wFJi&u^ND#o%ZxYD7bKR!d&#Vopxnb?tdJ3al9MAwCBFPkhesSw279?QBdA zWtst$vs@Xccn$LkDsq|}cF^Ychi#be{*)I3)3a0QPQjC9Nfw2^HYY|U^gMiF0s{Oq zOiQ|F>xZ6OWPoQ4uQP8^uVQb+>sBv$-BekjWjdQLbok-(y|r@w+4>7%pS|ysLU8NL zK|c7}vhxr&x11@TI6QsogNu7}d?D)u_IuzRAwa z%z|dMmV&jiZFi=~z^}cb6i&ocjO?U@Xv~FMpqH&{xGZ`Ve;#9WVOl!@2v22R>h^<{ zLi%IaAy!Y8R~8f=xjf1)4!qkAJ>K7LOb^Cy8ShqAT2^?L*)~6W+D7Y%9(|=x(3Wq( zP$wobRchwy=b8y0r??u+^kTG3aV+q5D#Gull^t#&p~|dKYQr|d+=R0TE#ovJ1J47S zHa0d|;*!f<+h?0rTU6uBkz$pS-KG@S!d9PDMg*=*jTd!Fc}COjB#N#RYRwi1OBRaH3gqS~UgmmXhUqmoy-qW<*c~i^WX)IpK(Tv@UOfr3AxA-$Rn=pU z&%x-^G8!)@$=N3WZfO_AqIr^Q6$84v$xdCOva7M{|pX~OJ8+*7a_v@^7 zPn|7ZGWh8S(6 zx4Ql6>W~b)-gA_;)~Q-D+V_{^uVke&6JUEeLgEZPw6D%z&h5tp#s${3lMb5Ne~mLK zv6Gv+{mP0Ubd@h5S4gnNu*xKkk|)zR3TY0J6krv8V1H=cYUtj1S#h2_fM+}?VK;Cf zT-?~4HcbFyY#%QZb#;r&dO^@RU3||iX!b`vA@RGEp+}1hBHOdeHN_qe6#+#JJxn^N z+pyCw7s=F!QUR07^{=#!rrBH~fu3K&eQeM+m0=d#Nc{D~1er*FoM0?%>JTJ&Tzc^v 
zTl06JR`{NhGV18CldPn7TDV{*ssN3qhS`;-N2b>EqsL*oVTqfS9IGJ>={c+c_s`iU>-bmdD-5_EFePY81c>|;ZLu^E!79MB$ zr7`Nm+SQ|bEq&V)3W3y`7CDh|_12tuAS++`(beCUBY3J@w6k&^mA4tM8O!s_I=IK0 zN?Vm%JoIw9fSPr4O;fpc@U|I?dQC~I2LVz1Dy^=)@J)9kwszbHn-hJ7#iGEDZ5>?|K{^Te>BgH(OLiRoka9{)lxL; zxRTSH{6dVm@8ab5Qg#y#6SpC0rws=yD;%0*Ll)LV->O|}ggt8WdN2`FDo_v>3zK@^ zQ5Q!Tr&-|A&(#mzw;TtEJhTdYAv3mAhBVBmD*S8OAHGdW!P&2jphafaNABpGGhu8| z2^liSrMtD{r7kps>SHPOu&fl`pTeZ^L-u_CB_)Y9QJ3E2@M_xA2%L^Nq-(FfT`n>B zOL`f$;we|OyEyW5_YLZTr*NE3x7zg)Fao3DPj(S$8+7I^qkLrCY*f!9Ft;?E4Go6^ zwifgP9)zc)Y;$~`RIx#|jH5CL)Bug^UfZwpZHQtiQwC9e*!vcJqwy;e;t=ODHhnJK zb5GkPV-)zt%9@2<@iNjM+@Lm$RuHwH`3K08f~r=A!P}he^1wTKc=ctMe-I~kA#k#k zdZX82L!u@bGd{w?FQKMC?-R~GTl1=0fA*1H@iv1X`>GQA&8i~R_Hcg|`4ordm6_`#QM4^!(@eF6%BQ$7ty0^! zafoUTpoYtq$Xa?X`md?qo6^)sYgOCKZ^>;h@t6z`Jwq9q6zSG^bV-7*UH+Tf))vT z$a1A-W6Yq0noxfMR#2p^3JQz0@UnKn895ZZ?=RgXl^RMc==kp-evMS5ElKLF2nh+n zTZI&?0HlQhB;c@K;Em0<#O?k)b>DvefsX*$DFjC_i1naotD6v@2~NsIgH_L0!8+6H zbcRDljqZoX4JPRH#a(+cJm*T={Z5=B)?)FoQx+PaojI8~$s8L4)x`|kA_K_3!B&>Z zA60tawZb;4PP0Vg7ZAHwnN}d0$QscM zd*lGN(6|*$5qZ?lgMLA~;hV}IVx0TMF&m+HchVYC*Y*C;)cV=s!fEnFOU&fSU&;>U_0n2~ z4h!cB4r-*$Z$I_B4w(*7wgr%udLK3D3}q9_(t5pZHfx~qOfr&Yegsts;y%ON`B|$r zFHY)yR9@F#U$~0>%-wH};mMEn!^YRV|LyiD^(Z1PcTl6%{}t6$vE>~{Aw#RPk;lvj z=!VMyjtTzWjTb-;mSFXgX70B8`G;8<>WF22v|(occcU(6YO+U4zszFK-u}5Q81Y

=ma$FN6pB zAD%VyKvr&qi-tMqO*#}|iFioeQ7_{iD(HU03-ZYx0l*wtt8f8hP=Rp0`uoB(mw-^D zS!+T2y>vN@mc0htH{JOVqx;wz6eQ>h@wwr)xDs>z8qs{0^k?dFOeurBUi&=(r^=B@#un8-LixxyuiWO4LVHInT za>fDVdThzPleV*&O^^clcGw`8O`Txf%+N8#3<#JmWeAs^hHh+fyVVokiu<>dsJaszGeA;w^rIRPfV9+^KkImu@_Imcz@l>&*c zVA9r(k33}MREQYok;NzRfs$z23PW}k>zSJ;55<2#u-jCkmL2r=>ABa}sI$ysdCNCb z@d=cZOXOKB3Tc9ch5(Hg2Qha~qLK+4*LLKl|J8uvB_FV+I)~3_+=LZgl*~+wsHawVRKW%gKc~75dkVM|PqX z5O5NzVt|bi8PJ)dASWdg+9UYbTeLjo;pmqE|#QcH<1YK_3oj(zt?!HxMnc^V`Z+v^^)Q@hE$sZBbZyPSOQq&2u31zcfWgJplv5(Oa2T#{>%1bTI zOhq92flvJ4%#x;MSQieiRgcJ_AvA;5qnwOkcwy5%!7}jxzJA0KS}>p3lHe;-m2+Sd z1=KHZD9Pzsr`f&U49AdsWWJ-mbA7A&Y39{ecLEoLlHmR=f!$Q23BxEhi%t^U3tXTN z1u>HXM_9s_b31Bb5*iI#`H(oN=X3tdFQcNaqNu`Mokm?DX&j@F>jNw@B5L~0d06+{ zb?a<{taIftWRg$cfzR*Ji!0(TSprvFpqrQPQwT?u#@pkxXvcBO=b-io4YtL@PPp8O z9{%<1IT5FR^J6M{P&Kj$SMfD6!+1GIkd(l-L|`onnl)L+%3jD?Ig3AwU+a>xiJndUFeOwi5?}%hvz(YE_07j4D zs?cUuexilc-igqf3m%%ZiZ1?WuJ|U|6UQ-A+w{ehePWwhY55X!A4$toAv#iJ+wzH1 zhzwd>SQJPpZb7dY?0I*}Wi4GE9WFsgv2Fhb4Y7gAPdi_oSX8D+Y*EhMdZcHt9 zjpJxMqPMnsbq>WL7auHa84Yvj{|9Yv8CA!&tbyY0?yNwt;4XpSPH=a3cXubaySqCC zcMnc*m*DR9R`x#U+s}p*PK;d-<~?LR0V(m8WvK++{2epGw+XkfDPZQ+Y_lo%OsCBqI%coyT*OjKRypR8^dnQZSP!JbE`SLe=5-4Z)f997?le=s0KO#GZgp zf!g$Hzq098zTjl7%i|(dfz#qHeBffrvK%dC?#BGo8FRR4G$oYCqc=<3U&< z?IOYiSI1;$X-#hB?-gy}ah3a(L-JGvj#EXOug$}T$vsx@1sj0QtdsW0h%>7VfzcSYUj5IQ~bdsNp1(c9)}eQK_` z2ajFFhS7fY0;XC7-8VQFAd?c4zHSlDAdM+R6?Z@rJgI zcIq!~=UeYPh4BvJuKGqf)5ft?_hE*q>GLUPt-Pr>-KPGEg;E)*8f77zIW*z)8#jrY zX3O4wXrXW4BkQ(vSK!A{;iVywZllQEkSMoaH*EAzZr=1l1e2Z8EZO|4cu^IpKJI?b z+yQY4bc7Q-@(97;!A>eP=QFXiT#>WODWHhWEygxh&jTFy>)DZAGjTpA4+;nQlA?&& ztTy8%Zri_37)dog+pzxde8ApevC1hw^m1H=Kj>0>?T%~vxz>o_`GshQ8=_L5j2a0A zH8wCSRMUJRv@v>71D__c{)?5@$P@H2g)*&ip)&sm&INgD>c$0iQB_E4o7!7|MbPQP ziMMKMo_?*GE?z^jJ$g9nArc#ywO9X->#px5Ahk%g$5NzqD!SyY(y4h_%^Q}VWfHE& zzCjug$;`WMB?_C^6c74P6|bWZ7tZ+5Fa+bRtC-_7h$HP(i!u7 z9r%$yci2zId<^AA$NkeN-nOisK7FB8iOcne>y;)2fZSZ2xLA&zTCG|q_&ZgS=TbcB zG}ED5n0Q=TRZ`Kuh8%ONwr;>yn5@M|^CvC;*PpEQuQIIVj7mn!3oO@m6NZo=G;Q@| 
ze$@>pQ7Hp-^w4&AZ^datFUX;d2VHtYpYsJct6$-H@g`o%!SWS!KtJ(*L-6O87diD9 zqfz4=0aHDHa$0nqeJ#2-I5ZH}+*JRiafSvb7FHQ|(>NbXZN(n?Q#GdJR7bI;&Nkl5 z;TP<)LT)k1O1%q)_4%FRlHc&#$tKA;+&SFFsDK^1$T0cTr}`mdOESgIO!}KmDWw$y zA>@S}KpXVoVTh9oAR!}pDQ-ml!wHj-3@UNh-p6d&-(+k>53nX0OozM;?g$| zc@lm<@Dw@HEVN0h&k#7{d^3f*EKDPGWx8AsPZpgoo3HT>X-PE9T)f`LRUz0!Dct&o zv%6dQX(SPNgOgpQdi#za7I&tLD58kG^aq>bvgp^Zsu-Ua3?kMG@&`3}+{QykzGfz` zISyIDo2)w{AJBfp43m+VL=Dp{5*7Kxum|$IC)rQZduY^KNJD$i9RuawGDvX7)G{@Q zCF7e%@Cyu_S1MxB?yyxS_+|JN_JzSF(~E2PxEB6#KkwWQSLpZ*Rd2+4wlzKBY4a#WFMkPhV- zYM28hAD=&pH<`MBJhqR#uV9T!oV$u`rj0^Nq({Q*zz^SJN zA>>lcs@gAUK;X2N;u*e*QDYQV$qy|Ls#MX5w|=kr!B9Z!znd#P8xKj?a_YOwa8sR{ zqR4Ju6?g(ymn0Tr1PM>H ze_J&li$2)fW;dMwP2Huu$g!2%_gX0-((HmIc zr##LM7s!(4lZp$9X;uhVIy^=|^{VsxI)_U*W&3G!8+zyo@L`_VaddLeQ(0BYJ4A`QFUXQQnHfMu z>TGA#pSpC(-XiNvS?>J0pY$HC>O3vmV$Pl1pP63D7j4Sw3Q~{u=3g5`GVrqtsCDX! zX*J)WwK#7rhg7r#!SX;;tw3iUC7vC!j$>0}TVeYw1g}JZRgCrtfmRvYEkXEf;0<%z ze9U|WZb&O@zEkN=ucVXlU};B9A^_?XL9J?}Q4c=S)d>A>Y6U<(IF^wc6h99^YPvmO_TuCs#6mfrD8x z60z-r>1W=?$4V6b$bJqQ3w`Db;r_P6pq78HOcP$#;-BQ}9<&Figj*fuVC86KZ*#|s zvADHN{IK|5=&9VaY{ljakmyutRLj4+=dg_qLvD2+9R5~>prV!aLBWYR0kJ=?L^O;TNa^dhPztT zGxKvKV#br;hw`^DxrIogY5LEbXT!@XHzMq)dWgKecyF@$-&s0-k-OEt7UjuUitaQ) zSydQVS_^t6Sp}OITRKmVTSwF>8(Mu^;%Bbd+a%=dxd`Q~Bbj;yu()WW{m;`7O8hmMpq-DHP7 zS5^2G@ZnG47S7`tc-S+2)4QuEnfS}Wsrt}Mj^iGmD^_AkV)&WAKSMC3h|b@Bh&5%( ze4l~vEWo^$LnaM=K$eSEY>TO$?GQrr$_C5J^#~Zz?T}YRIgC|UJTeEzQmuahUUn)c(`Za5?Tw&i)5qx1h_xMIc2_u$qN9_7^6NbRw)UF4 z+}$eFi_coiv+Yi?$~e!`tmSHZj@EE^b8@_^J%(dCk;H&RotuOuL4Lc`xEEKzS;cqmh|lO z`)AyD&!KhCufo<&OkIwHw;^ZRPfLwBQJd9`s;7Q8Xv3P!2H!hWW)N_jNUM7g2p69G z#Yrv+j<610x!l~d9M#b@K3XHfLezJD>4{$+wj&|fk7Zf^>`5LXW{u*ZEk=>*gDT1} zfG(rA$u_O7ZfLBapvo`QihtX^3b}?rDpa|r96LN$CYT1&Xl*7+r73@gqyljFlvim* zL?48MjXLC!!WrF8@zB7y89znp!2iC%7Q! 
zUP@&&=BSZ??>QV-qf>L}>Y7(e{bf1@NjB$-AV#eE{otWr!fdh7SzE>QK_CzZd|++@ zC(d*3_qv6*$VjfJHK>H!)oVU!9k}cb8A<7?+G&S2lOM1Y|B-MR{%za@on_ZbX)cuK^jFeEIUYt33y_KCZ!&o(J`xG+9mBXdqmB z_Y-oPXBC1WKGRRWQ}av)Y>jaXxudopQZU*}cjYg0_H$~d8e|&m)SGGSJQECcYhtMd zEdTsO=oH08!DOvi@Z%xli%Mbsj9ZX{-;VK5=rl~fii=*Fz?xp_Iz!e?h1dn&<=XDr zRlBYoyO7<`WvefSM_g9k9t<((PLHPX)$>E1U(h1$4)FNC6Kf59y%o@R(NjMP!40ak zPxn;);?LPfeK26gq8%(%lEMo&_6`=FwNgLEc90(EzOu!Ji{e0vPtWh& zW-rnLKRcEna?$UsRYwDDZa9IqT3(;_2nq=~wRfDx!teta9^11lK)C;J6wE02B7+lp7w?1`@gxCE={ zl_?B8#g&xO$piutQccA;0Y%-XY{iu&$GUz(_A_Qo1f<(~rcq$ojNlThhEmU#usD(A z2;3TJl@k{~kbWRd4hxhaI9*Gn_WC>iV4sZrKG%R1tjNRYcJ9xr$iaP(i}wkq_&klx z;P#42;f%G)z>iv_dRasZ-_RX>HR1v3r!HzUL|EqZPM(hgw&gn46?w zdLsA@^HZ$-a+FYTC{7$>bk&C*Cz{06h_+hu+Cbmy)XInENVkKRR@KTrHiKzNV+5ik|g7es?i&A+^Q+qP$)IBCH5XqTtuu^67?LGDZ5dvUOcnNGIBGWprFPQ)iOkk z>N#?=v4O^AD7*B9Pu`SoYV|6h!tFAiVjk#g&O(YTMWf2XHN(nsHDY4+N@VrXdGBY@ zzsMS(4RvR-I$LxWI^GI2Cog@SCHv_J3h2>-6~zdQZk6fWE3}|eBFmwfA^j9S+rg!i zuf-O=TJ*lJ{5+t##tu8BVhLNzqT5)i21Bmw|HKngOkEN|V93-(A?CeB%s=!+%-&C^ zcqejw(PP-Phhvy)=|g^s_v%vHEWZyUK!LJ#0cXIt$JsfSYfWx<)enLe6BdtdyOtoY z6@-~;A#knQ;;73KS$KfyOGLJnr3f`8+G|8?mM3- zLw8ZvmF_X?P7<*;&vjjR3**tp*8}zvs?Xjq=!Qe@JFo@^g3PX8p>oK-tHpnCFNwh< z39~9{g4KapKIFUdjo94S*>>tpwwZBSHh4_FK@kYVU*U0px>(LQ;=Vi{b*0B*=$S#QYqd95ETS)ED;yn%0ma8_Bwr9h#kGbaS- zxTQhSg%Mey|*&SN){CLG%sVzq}QR@$}W(ZM7$^et2G z{N+ZJC)$0Y2m4*VR@L`GBB|sh3pUmPdgaT^x*_)dlE%rmzOKs9)`%1?!orMva4pCTAXUQw`Owxe(|X`_C_}CF7r`z9y}q;0#(g)vr1Hcc_{PcVPEBXRUB3aLIQq&xysJPU zHDb(NNL&-Zo3#S|OF@sHe>GTbiay zj-NFr?SAC^7*@xVu0im@v+d(vqt(wMs&GwOS+Z^Nfm_N)1Xmjs(=`A)?bxn)=wyMgd( zuh(RA8F!)QaC+-|>N#p;?&1m~?QuzQ3uv6`_TK{&yPF&3veq<>(HVAsHetWJu)r_T zQE>UC#H&k5IXK}R;^?Y`vqO*-nSs1oSdJg)K>O2&g+&qEMd(Eidv==TU#fX=$JUhP z)O-Tmws-D5fqpl5e>#Xf__pGAhSXE6nXHU~3te6weZKab+{In4B4bJ5b>6eZvwlC2 zs9}D{^8j}w_5Ft_8%HvNfYO=LRfbe1rWC15KCe5DDoGf@e09yE+7WTcKK% zRv;FVSdj|O85;c{=|f^1=+Sb`##v=#j>I<)ES?W!Bj&8cZl_k| zo0ammyeD)IOYKZ_w{3o5#TR0Ceut#-;>!0Nsjs)jC#F80_7Pd*J@PQ})M0w<6urAA 
z;uHfDo%Gvl7I4SsFh^=tppRt~WT9%+%reZD&90$~yqyf?1_$BYBy;B|J~jYFulFz! zAX;r+X@sP4h|fAMOjRd;e-i|iB4yo&6Jn%dL30??ta0g^rpOiIU^A9*taxjHb)MmI`h?KOSGK3} zUa(Z3Z6)TYZ~qeaX{EB#X7A?2W8R;vS$M_gY;~!{Yi%klc*wCh#}Cj(!RM?oe9VpX z6zARAxLqR>yH5!VhXw&j)j?>4krtLFnpfBR8Cp)p(+XMUeV51q{22`t^- z%r9N{G#qxq$6_Yg;Bmw++4_Y+v?e7M1%$w%e%{9SCEA#3ee&`$7ao{Cf>rG!mU7X`ArnnFEK^$ z!C!`+dR(@Aos$?4O~k$^RfG5cYQXRl2{)o)?ye+NXxp(hv`Kmu>X<@(3Y;AG>xo>W zee!IAnK>=}t>+h+$*^4-lT-y4I)~98Fc&}jQgnK_!wWMS78kCmaBGg0xCLt=ABn!6 z1Ru=b;?(cTY(*tmU0;G&rjkB9$x?=K_$mj>E6{NJ4;P0N$3psAZtfYQpBN19%va1SCnH9zWt1%N zjZ&K$C02+%M1cu|@r`Foo~NoWI~3HaxAdgcTl14!y`y`15b2`cO{9x7hhQ*Uv*YtD zSuPXt*I~>yu@0bLa2b7ah!<7rY`i0vhOm3k?tQkZVSnMmm%9pO%a)_g?HF-zw#lMp z*w8~>_7MGW-8Gbeg&NRe~S2Geh^`joez;gf><6XB) z@>`)~^W|Zdtv&Ogo?Z#lln{FErUB9D(An0YHc(sq&zwd`$fW8Uyi($YFm&U84MD^-ZrsHV2b9kDeu)UW!LUM10n zh|RggqAoq&Z%=|mtoz8IFgC?-7Qj>sjfIUo%frgiUlH}OvW?(E^U=9>vx+XNY4@$O z;wBZgoi~NvdNA#SeHRvOqNiNnSXzmHLx5c^T^^COi>GM5&JKCH3NyzKEnS+sj)Hvu zb+TWyNGq)iNjq6tRAaMEW(3#ijf=cCz;E?BSYi#1QW*Q=w>x4PEjXe3GOZA%GM>UO zGvir1413Cbu6`{|dZg7XebXy)7B!A95~<;;)Gd>sgIvV&mUhwB>peeMOlU5fmQU6S z&8}dL#EP1Yi`ku#I#WN~2?o1FK-dBoe}K|vGHGn@0ST5=&~Knm7KyG4sy*NdnBKA8 z3oB2aUwDMxN7i5In{RnqoIm$|@J)Ltc0)eu2a}iyiEE(ZWL~89e|;EYh>m25YV}Ov z%XE5yq+5Bf-5XM-CiH5ZeDWAonKPl0(a3hHfET&EBQwH*K2jDJe{)z`@gVd5sMBbe43m zZ@nfNSL;%4&n7&Xe(&^@A-THI+RS4sQn?`-X ztCZO;mG)}HLdlH_>r@`<1SyA}*a}61I8S99o4)#M&!tSBiXUX;Xb6d`2ECJvZ@1YK zRNmW|0!Zo4C*L~~b^{A%>p;Xsw0>wRjdcG+byu7ai3>G5wTy$R1 zv&{`8>PO>+jqfwZ<*DLmZv<-n2N8LRU_?sZZ$v@|agc2|sF|u+AJ|RIcK@9O2PLSDtSi_dRlAnD?x! 
zK;`r055BKTy9(v3EDg&F&6KySx~cbLIzPHXqQ16|A^N?W58CDbFjCpsB{!lu=={oC z{{nWP-gtgyrkA{Me%rDdh{q}fi?rj@20;z3uJ_BkiBvfB@;{M;hpb>AwN-ng!BY18}> zL$`6G;QORlm$*QA*$aKHdK46$W@Mm}wb~j(Ee&ta^&sRSYL;D48yf_brrgT2mccvF z4F}C&Hmnp?I`>(J^KoDU?)15LZv6mhjq`o>(bA}|S!aE&w)pi&*?CvVu8a!X?RO-f zN%zj(ER(xapVGSR-s#wg>`~{EB;G8arL}^o+fb z-w>gt_Mi_|gv&5u8`8pf8-8t4kz}nxa;<8Wn#OCpk1&#+(k1C8Kd$Jw`NmLAQgxY2 zMpdyeLRQv3Z@SN@kp@kg;~^%%5h4*8x@U60fUS1|-QDfaCMyqSDqqW#)l5czdH14q z2kj$HG1^&yNVySoYgG3=Rnty%7O@I!Y=gSlh>11MtkEz@!&<`k@VByD{`m2+%d3-I+x_7(>|D>ac~@rkO|!#SeYeJY{yQYD zCWY7Ov!&vMi|V|Ra|TgQwo%kP&3N5&8h<7~foJ9ffl~mzr>Pe_*OLB5?nkY;x%|rJ z1VQKEMX+5+d^6q${Ci%4hnddR)AwD~V}QQ%3&o1$@cY)Ufk~f>o)8trB@9qH@-7HS zv$Q3bahFrErx%$46CEHZQJ6ieD~wAdW5|O z^M>Hi@!ueS$*%bn=raxD5YPl;=LO>dSA*1objBfWwlW%wayBcmh&|HWF`Naa!@D`; zR}XiI>-whEy|=J-GInt;vT9)4Wk9bddeJW0l3laGyzPN{|D2--;}YNv!Z4B)@bLM= ztj!9siSzJfG0Z!h?YW31);47{^k&+8CA^9scCqVlW2S32HoGeLyY}%}P&wA%7i1Sw zO%&HjfMw4W+KQ<3qlR~wE z9^t3C>GE8BKCPZ?b*V0}!SC6wdA4&(mYs21cq_Tvp&t0b zo9a<9tH;4M{LJVHtWj;DI!Rey<3ebVJ}iPxdJu7fKFuq(dR!VHzN?3c9bcgGZ85rQ57R$-Yx zyx|bm9G7Q+IiqEleFBHdE+Yifj^r06XvUAQVxBH!*#Iv&dy$}?nw5#Sx88jKNYMh`2+j|{Z zmmh)7T>cp%itfS%{((Hu1o-|J!)U|P7f>64mVEb7-`wn@Vn|Kehjzp6gfm=!LbE66 z_7Wda^-hp@BB9Q@nF~<*?Hz)j`mG>IdsiW*&Bt=!cE3<#kafmkyU+XEParhE5gO~E z&U`C|(S5ol<)&mX_dOnL*yZq~@4XB7GMKkb1;Pt*BKSbq^NndJwb9aZlDs<@1+Tlg znZw<4`-`9Bgdv%ELw{0;J zUXH*I=%U}f3k11?FAGV-JSegTQ6y^ZhH(b217i|yP%s{G-c{X1{0Kp<&>2Evki^9g zQJZhLCBA(8yq6H6xSc~VSnr7^tWCi$slF9s2v!%f&(S+^Wf~C0(TOBdib6bd%EnKO zq6q4YA|F!}<_y6oPlBEtaY0!k?xSa$L^hn7ikuh09}b|*pYqqxm5)k`nYEjuZw_CX zwG~4~gIB2pkKg?U2y~QqTnT*(y28+sx>qi!#nj=DC{*Ye&PQ+k8sxAmVm^&b2}S1} zn;sxwqL`L2NKw1K$4Np|MAj7QNFI9yVHsoTgz;YR#`&aMSB>H*eMzvvpO?ZI7&9Og z^m+?IVn#&{ld==x@YZlu=w7GxE5{o$GS;eJPF~Lz0nbRnh!r$TF&zqIXe%BIsV^El z`y|9t#C)v3s0Ky9Lkk1|w!A*1kvqCXSz$1obX_@APu;-fhJ4R*T`ojSr#c^F9iKT1U6Dgt>B2<71QnheLBWL}1Z&)7kHNK52z?TpUkgZJWI zzkmN~SIm@fCvJovL|)L5dh1b0Fw9-~5xbGH2vJtrP`e}pDX-sXMdrmwTNH(f%AN=e zQ>}Ywg-Fe3w5%jUiaJL-Se+AB;M5!EJGmsOS9+m%`$#Zs{PK;X^_XOec?iC}T&387 
z^#Y38XE8kU_tIN(>LepcuLPL_5)F}~70TqIq-1s0rNx*U^rihcLNVH5ZN;W#NwX5K z0`0gfx6gxXjbf?pgc3U-1oJeXlz4l^cAow82IuoK0a?#Uw-ES|%jxE{C98#l<6JlC zH;u;G7hQ)d54SO-9bIR>{drycWMVq%*BvmVrvq4(!4ioQa(7{+NjoaHKZ}&SJ+r@( z{RHor0}nwS%cmSliL$25g`ny~}Xq=lfgZR(&q3RMCdK77l^ zyd^PacTd9#YyuR3;a~_}V&)axg?gJ}Up#gD;d8flANO8^2!yiV_GxXG5BjFnu4$I%O@BLfXJUp@?plI;o{E85G`&1BB`iMM7jAMF!g zJ$;`*TxG}F%*+jwJ+bBJP8W=|rQD8&6hvoTtY-Vs(er9hOL-5K=Qo_&ErvMxV;vJG z9D>}G!5x}OO|_ZNbotJ&yxe=3&96G>67E83;Qd)vX-B`L_|AnFX(-_Ti`Gq z%q(m@E(4-@32`lg%h$Vksat+6Dw0%8fGf|tX#AnT@BP3zHAw>rbzD|Ey--#CX@8jn^zl}H%dEBpT zo~Iuj$iK98IuL&fPM{eBT$QqwWmrM67b|eyBGpf^8AQ{D7dAx%FEc> z7Yoo2@k8@HkG`@6*fZcn_^nOp<*F)pF$(2i>yQ~OG;Y15 zMFAF2h*51N0FLi3Mux>9Hb@LhG9Np&+wVHe8z=Db^s%r^*J@ZAMs3iB8!PxW8iF!C zkQoaHnSP$qe5PLyJQEIL0QB`6WQhNi?N?{E`R?hd2UoEa-`t~5Uv{JOKy)pNjM9T@ z^k}f)H?%eIQ%X`1h+u-jN%?)gQr2}PWtq!C-#Lj{Y=mxO8l#f+gieemIv0-K1t6&r zelAh(5*D$ufl(GHgC&6bmQMn=qpDsxl9Hr&a&kh%R9I5NJj~95ZoehsROdJ_3>oA-qD$sWsP-8dU zeL2193K(UN(N5Y;y2qm((~t#x#DmR2L}RO8T!gKp*}fdnwfwf+&Einbbdp-Yo<9^& zqQqn+q1X_gXDURNS5zNPj1UPSI)V%n6u4swo^O!X+Zz|Mt>~7ILAlH<=VPJwZ5jm6 z!WM=^RK!Y(zFSz%C`h!(5R;vSHZX+%o)AN%j^u;dsVO%Ts9uiPj06mAP0ZeUca5nA zr3}?)5lBMV@Nkzl7Q1)ZU$T4&`6qy>J=w%u611-gF=FcBsF;M;q$+|Xj3Tpqg?#+R zBC+QWvMf*r0>=|_ZbriIN^m((O=Eo?Z5m!4tJ=sm$9p{MXKPlpSyaYE zEAwjyJKY!G+dRSB+Y;N_pj1~cr4o$BC6_w?!`PGG$78FB2 zx7}3OpK{segcxp}93-Z6=l4OisLyX)(_Q@P+e$0*C)i~NG*&#d`s*>J*4l?oZyQ=U-toIXvv+X}Em-x6S*PB$I6K3= zXgXcp+iCm$S)Cik@`{hiMIl6#JV=VwrDcVy+%=+Ywj;ja`pO#S1k*Iw(ZktI2xZ1$ z$EUu!vMr)+dUb8@db<{`_2vmn{J!DVy{u(rWkC|o?AAGerxYEdpq?)8_)3hf!2H9l zei%p)#zKe`{pifLor^Dac(Dbn{a3a@JJ5FYS^Uo(BL$N?^pRn;Epd_e)m<9?@o^nk^qAK0#Mp688+qn!I})l#WOzi zQnfw@qTRW!`@^uI`YJ68a3h11@|xJR6tpxMQ-u~ql%@grGYi~G1%gm zr}P!~z3!N&?xw2pQB&i(wQ1krEbfR^1Y_Tu#JpDC0is-za%SOW`MZp{3U1|d z%5rPdorBfs!Sb>UwlUS4QsrxocUnDZLsG|8d)T+bZV#$Tc47eynGcsYsYn$Zw9#u} zE?<{1&*zA?Wk=$t_O$jOE{c|i+n{nDv4e>PeXWA+Vo|gX z5<6lsWAfe1H=ee2CoJzbye$fN>f=e;adUbyn+=lQa-Zh6Maa{SO+{Ax7bAwmkl7h_ zjVMneskAEvI&EqX;vwIfs!M;a4FJl66x6Cfo%(#x&eU3dZds!I`sjgg4JtjWJrzZv 
z;l$ug^v!WbTA5~h}~ta{SWQ0__xTk*J|I^Yv_mvoDP zZSm%MV0|5mcXp75?<@$R=&D`D1y>bMvg47oOVJ(l31vo^SK{8M8?`O9(?i_isR?H0 z*79R5>*7131@Pj!z=aXxEyWryI769fEfIB(I`?{6y=N(7=??UVX#QVAfMEkMd%Ui2 zS7%U?m!S)BWwbYkkyUQ~`&7fV3A7YDtoeIB_&5je5nA-%tRgIe9a9995*Q9gy*@)7hvTA_42y*C-H z{%N8!+xfA?J0ykVk#MHLdED?|Zt^>G17YY_Q<)r)ER-;riG&U1$<@7OC}rqkfgw>) zLde}YBFGy`iBq|RxqQk}(GPR^HD8K&{8L1I70SKRf*QryAn$2(nace?Vn`(B0^DYU zpB$FeD`CRzI-!(4!O&)n41X92k9#bKRG*eomiF2arXBshrD({>=Ab3#^^4Uiepmvt zlSLXr&r=dTtz8-mkpojcR&%Z@ghY&yY#DI`jAv`%QY!dc1pC~1X|kC3k;V~nY0EtO zrPP}b%L^~_%lpUIAhp}L3XCJ<;cN$;>NkvW;A7xa9(XY&9fghVOBAhF$y=Jl&L@Lg z0*gQ&+)2`l$({|ccTfA*>`ndbdgcqj^Ut`YpaiwsNJFDa&|gMSwrgDA!}Jv{$)y|0 zT?19n5yU-Ruy4(CYOCzI<<))k{=5;Aq;C*!<))2IJ@2G_??}L>I{ouDWhS;!d@3e! zoF4^9%qY$kXMm4c#L2VW&2Qn4sE@>z5$2NP_Xa{e^@?7|dG1*+6y^@*RqKQ%T};X* zsa{8)`O=J&M@$}%E)BsMQzf)FpE#W)GVL*0TShhKtI*Z8)U=$|U?Y?DBKW|0d)A~i z4Qzt0WPL7|Himv_M#3HYIrMxjjs8fB2z#%z8+KJ#XYi1TH3jhQ1Ji$Sf(zAum*a~( z60E?NhKwe)72S&J;pwJ%cx72N+57_U(ghO+8s7Icj}-A0C&7sj zUXz0ZF#w>1;%B7%Envd=Wf9=u;#^QL0{ru!F?e!*feJ_e7RZ$DeAA@A1$a0S{}xyf zJO-U?3&h`wZwu}B*>B~PQvxnbNO=VDTky$2hja1&7P51&a7cgcbd(%_?JYT*zjyy$ zSU3vGj2v6Izm9QL>H(?`N^@c%7xQWPj;ERt6@AV)sQ*jkWI~=+0L4CysR^VN7zV8DN4{8;zZHnE z!0)r)N-97f4`Vhb1?#tf8epL;g8VJ`8EGkt_!uF>sbr;Ke+xf-I1B~<-y$ak4Wr9$ zx&)c>^mbF^Y!aSD=CoN2}lfhad>pOEk`e`FqC%6w$?(-BAkKKOe@r~hQ zCD)lq!n%kG8PSH{XG9mUGyWE+7WWqO7A6kDs$5U>;U_J2JR17$YF`cHp)EXBy$?`v zk8VgmIaxJ08L!tYMcA3iso8wb8V!XB+v`Wdtz9>ylo1Xe7B+D`CBh)QFcXhHz9@*l z!sY2CtxJl(vT}FC%-IAtv#ZI0pPv~HWFMYc79t|g?QcRm`27MkZ=uE63tni1^uB|oykcG*>W5JB_tHVK} zOSorH+pjXI>_O zC(j45J)w^IiP=5!P3Tq)+MhUNH9GWT+*80yh5JmSI&aa>!`dTicf~G+`(te{8{FRb zC!Qxrw0Oi#yux^s8zKfvLAuenryoqiV5l0mecvUkr<$+e3v`_E$q*&0I{EN)Z<%gC z{c=&jm6tj4LNrO_{z?AKyIg$YvXXwXa#DDQf6Bfnd168Eh*V4poqaXp$Z>%x4*#W~ zkKh4mHKRQ4$5wT1*vkThLiXJIJAU%#hJD!?1bvw&nGeKzieGr24xL zSr5WChiYWkRT~njlIaOT-k&Y*S;?%+EqN_;F39mdt)Sm!wLQyFkD`Oad@VuFR!XUU 
zuc+hWN3N9IWhaf}xq0BytS_kW`gQ3Cs;uZ*Qd`34Bf7q@kc8G>a(uPGIm|LD-b62wFA4ZnuUE(FIezT5C7;$jIAzXjuod?6tQliePMP@!kauD#0n+OA(w2~B{;@O=@cZX7HSxoxdNpT z)F^EPWpii)3stnPAg^w{aX?tTF}SUK_=}-zPN{tA5&`d3Foo<9F3l~(CKP%!As)2^ z0si*OO)Q7-MWFH*j=;#!i&sCPc9y9)4*BI?85=CevAm6(UlWD2#Y^}^I`4<h*DqG(ZrhcFcp+%F1UusT&OiF8@DE3tFB zV<5fe;|yQX^@^F9xLd>^V-ryXdP*-%69AeC5QyI|ymVG08LiTr%5Shx)2Zl6k+u3; zwkQd-N#;jB7NJp^+wTuDwbOzY-eDc?I3;Gq4;&vKmL);O#OaWGsc5tMme!RXl?ETy z9UdJ*t){Vh>_dRvqR%CgTgaHwDC4pwf-SZO#y6@G zkY~F2D3kq~VAz#=5r*`D!@?V+_Zxh@rbJG4*CbIVVv>~g5Ty5ie_ zIMED+`r$jIYs{wi{|PVLzljVf732Xj3IJ&v3o|_cI}4DUOjtny$Z!Tc0A69|1Zq`z z1%NW}MBLiZ$lh4bzz85{V{L3^;$*MqXl7%r$-u}2{4xITpQw|Wp%D#0$-&4T@UP>) zBM8|TI9VB4JO0&@=I^loQUGZqM;9A=3kMn?3mFqLP%_i8)5$Z^0GP#u89AAl82I`5 z1r+7f)%j_D=LXE$>^~Wq{2fqMNQ4GpYU2WMv;i3WD@mjOaRYY!&xU_{(fobK4)y>W zhyQW^PZxh>3;0(IJxe`%D;j{1iqKbJEIkL)zonVA2@OD8256Jw7nG#|d=&zwDZ>vu zmsJ&{0m#`X&;T5a44mxE9NlODj`n)iKtgLXXCr$WfU>NV0@ZJfe)GBk3;u7GyZ;}t zp@O6QA8@Dul>aB_{M-6({U3aQhTj1FR(~S|?CcMSfLH%n>c8p-v;#o`%Ksabfc^cm z?cXo~oBcI1|6kw)bo~n`U|+wH`W+Am)xQ)FE8r0jETH_27El4<0!koWKnVm4D1nFp z<-Z{V9{w-PfW7_&+W+wKu`>P(Hg0YpU8k9&r4cZ|e}?COO_;xD1Qmc@*51a@$pAP- zD6RAifWukY$ld`sp#Y4`bSwZ(N&zP`OG5w?lL#A|CKW)=NzdNV9RMsIuro&^S4S#1 zdVVKIQyXCOzt97AsNke;Ze-y2@6!K&nACsg2#`0jG}3c0`sWn?5AO7W_C|V+zf%wr zSL6rC*cw>_6m4uQEzBH&>HKvOctb*Zj=*XNas3`Oj0_CSj4aGd46N!53>3gWpx;*; z!{07Ir2a2mB#qpFYlEQ!faUkXB@dwI=a-e@`j7RmN%Oy%0$uHmjNyPpi%f7pLc@P; z05(=uW>$bP;9oXYPF7~XU)KQE|FSVMF?0UY2J`^(KmNBL0~-U!KW&UGjGRD*%HP|6 z`eWeW01`$1w+$GB6-eUvUw%NgNXCEK*qIsruraVPF#rqjUu_wgnOJ~>@4szKz`%dl zSegFFkB#Yf>He!NFlY9E+8CKwfUJuD@cXOPf7=+DIDovCf7qD*tN}Za@A4mh%&eS$ z-WxkBkn{2P_Me&p@0ani^dB}>CdNPRgO&BqnzC{*{4-}(HsApKM}MrW z>}-G7*jWA;SFFG(^v}4gZ0vvF1|;DG(uDqZ&g{&LZ2ugS?97~hjAwQh#y{%E&ced< zkG6~qj2u9o)&GjY2%M?^%mrv;XZ~L~16Oe~OCx)@-|Mu3nY+<%3RZe~8yiOekh}D+ zrA6G@*aq;|dJbG?#D#!-tlS2S?3@ftoJ__Xj2!y List[dict]: + """ + Extracts URI annotations from a single or a list of PDF object references on a specific page. 
+ The type of annots (list or not) depends on the pdf formatting. The function detects the type + of annots and then passes them on to the get_uris_from_annots function as a List. + + Args: + annots (Union[PDFObjRef, List[PDFObjRef]]): A single or a list of PDF object references + representing annotations on the page. + height (float): The height of the page in the specified coordinate system. + coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent + the annotations' coordinates. + page_number (int): The page number from which to extract annotations. + + Returns: + List[dict]: A list of dictionaries, each containing information about a URI annotation, + including its coordinates, bounding box, type, URI link, and page number. + """ if isinstance(annots, List): return get_uris_from_annots(annots, height, coordinate_system, page_number) return get_uris_from_annots(annots.resolve(), height, coordinate_system, page_number) @@ -879,6 +896,21 @@ def get_uris_from_annots( coordinate_system: Union[PixelSpace, PointSpace], page_number: int, ) -> List[dict]: + """ + Extracts URI annotations from a list of PDF object references. + + Args: + annots (List[PDFObjRef]): A list of PDF object references representing annotations on + a page. + height (Union[int, float]): The height of the page in the specified coordinate system. + coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent + the annotations' coordinates. + page_number (int): The page number from which to extract annotations. + + Returns: + List[dict]: A list of dictionaries, each containing information about a URI annotation, + including its coordinates, bounding box, type, URI link, and page number. + """ annotation_list = [] for annotation in annots: annotation_dict = try_resolve(annotation) @@ -916,6 +948,10 @@ def get_uris_from_annots( def try_resolve(annot: PDFObjRef): + """ + Attempt to resolve a PDF object reference.
If successful, returns the resolved object; + otherwise, returns the original reference. + """ try: return annot.resolve() except Exception: @@ -926,6 +962,19 @@ def rect_to_bbox( rect: Tuple[float, float, float, float], height: float, ) -> Tuple[float, float, float, float]: + """ + Converts PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified + coordinate system where the vertical axis is measured from the top of the page. + + Args: + rect (Tuple[float, float, float, float]): A tuple representing PDF rectangle + coordinates (x1, y1, x2, y2). + height (float): The height of the page in the specified coordinate system. + + Returns: + Tuple[float, float, float, float]: A tuple representing the bounding box coordinates + (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page. + """ x1, y2, x2, y1 = rect y1 = height - y1 y2 = height - y2 @@ -936,6 +985,19 @@ def calculate_intersection_area( bbox1: Tuple[float, float, float, float], bbox2: Tuple[float, float, float, float], ) -> float: + """ + Calculate the area of intersection between two bounding boxes. + + Args: + bbox1 (Tuple[float, float, float, float]): The coordinates of the first bounding box + in the format (x1, y1, x2, y2). + bbox2 (Tuple[float, float, float, float]): The coordinates of the second bounding box + in the format (x1, y1, x2, y2). + + Returns: + float: The area of intersection between the two bounding boxes. If there is no + intersection, the function returns 0.0. + """ x1_1, y1_1, x2_1, y2_1 = bbox1 x1_2, y1_2, x2_2, y2_2 = bbox2 @@ -954,6 +1016,16 @@ def calculate_intersection_area( def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float: + """ + Calculate the area of a bounding box. + + Args: + bbox (Tuple[float, float, float, float]): The coordinates of the bounding box + in the format (x1, y1, x2, y2). + + Returns: + float: The area of the bounding box, computed as the product of its width and height.
+ """ x1, y1, x2, y2 = bbox area = (x2 - x1) * (y2 - y1) return area @@ -965,6 +1037,24 @@ def check_annotations_within_element( page_number: int, threshold: float = 0.9, ) -> List[dict]: + """ + Filter annotations that are within or highly overlap with a specified element on a page. + + Args: + annotation_list (List[dict]): A list of dictionaries, each containing information + about an annotation. + element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the + specified element in the bbox format (x1, y1, x2, y2). + page_number (int): The page number to which the annotations and element belong. + threshold (float, optional): The threshold value (between 0.0 and 1.0) that determines + the minimum overlap required for an annotation to be considered within the element. + Default is 0.9. + + Returns: + List[dict]: A list of dictionaries containing information about annotations that are + within or highly overlap with the specified element on the given page, based on the + specified threshold. + """ annotations_within_element = [] for annotation in annotation_list: if annotation["page_number"] == page_number and ( @@ -980,6 +1070,19 @@ def get_word_bounding_box_from_element( obj: LTTextBox, height: float, ) -> Tuple[List[LTChar], List[dict]]: + """ + Extracts characters and word bounding boxes from a PDF text element. + + Args: + obj (LTTextBox): The PDF text element from which to extract characters and words. + height (float): The height of the page in the specified coordinate system. + + Returns: + Tuple[List[LTChar], List[dict]]: A tuple containing two lists: + - List[LTChar]: A list of LTChar objects representing individual characters. + - List[dict]: A list of dictionaries, each containing information about a word, + including its text, bounding box, and start index in the element's text. 
+ """ characters = [] words = [] text_len = 0 @@ -1002,10 +1105,9 @@ def get_word_bounding_box_from_element( # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9 # will need to switch to some pattern matching once we support more languages - if index == 0: + if not word: isalnum = char.isalnum() - - if char.isalnum() != isalnum: + if word and char.isalnum() != isalnum: isalnum = char.isalnum() words.append( {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index}, @@ -1028,6 +1130,19 @@ def get_word_bounding_box_from_element( def map_bbox_and_index(words: List[dict], annot: dict): + """ + Maps a bounding box annotation to the corresponding text and start index within a list of words. + + Args: + words (List[dict]): A list of dictionaries, each containing information about a word, + including its text, bounding box, and start index. + annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and + "start_index" fields. + + Returns: + dict: The updated annotation dictionary with "text" representing the mapped text and + "start_index" representing the start index of the mapped text in the list of words. + """ if len(words) == 0: annot["text"] = "" annot["start_index"] = -1 @@ -1059,6 +1174,16 @@ def map_bbox_and_index(words: List[dict], annot: dict): def try_argmin(array: np.ndarray) -> int: + """ + Attempt to find the index of the minimum value in a NumPy array. + + Args: + array (np.ndarray): The NumPy array in which to find the minimum value's index. + + Returns: + int: The index of the minimum value in the array. If the array is empty or an + IndexError occurs, it returns -1. 
+ """ try: return int(np.argmin(array)) except IndexError: From bcd0eee7536765168f82e0f25adf32345757ad2b Mon Sep 17 00:00:00 2001 From: Newel H <37004249+newelh@users.noreply.github.com> Date: Tue, 3 Oct 2023 11:54:36 -0400 Subject: [PATCH 27/31] Feat: Detect all text in HTML Heading tags as titles (#1556) ## Summary This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item or address, categorize it as a title. ## Testing ``` from unstructured.partition.html import partition_html elements = partition_html(url="https://www.eda.gov/grants/2015") ``` Before, the date headers at the given url would not be correctly parsed as titles; after this change they are now correctly identified. A unit test to verify the functionality has been added: `test_html_partition::test_html_heading_title_detection` that includes values that were previously detected as narrative text and uncategorized text --- CHANGELOG.md | 13 ++-------- .../partition/test_html_partition.py | 24 ++++++++++++++++++- unstructured/documents/html.py | 7 +++++- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a37fea2809..febb223d00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,24 +3,15 @@ ### Enhancements * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. +* **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title.
* **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. +* * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. ### Features ### Fixes * **Fixes partition_pdf is_alnum reference bug** Problem: The `partition_pdf` when attempt to get bounding box from element experienced a reference before assignment error when the first object is not text extractable. Fix: Switched to a flag when the condition is met. Importance: Crucial to be able to partition with pdf. - -## 0.10.17-dev3 - -### Enhancements - -* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. - -### Features - -### Fixes - * **Fix various cases of HTML text missing after partition** Problem: Under certain circumstances, text immediately after some HTML tags will be misssing from partition result. Fix: Updated code to deal with these cases. 
diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 48934f73f4..82976a2621 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -8,7 +8,7 @@ from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace -from unstructured.documents.elements import ListItem, NarrativeText, Table, Title +from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title from unstructured.documents.html import HTMLTitle from unstructured.partition.html import partition_html from unstructured.partition.json import partition_json @@ -645,3 +645,25 @@ def test_add_chunking_strategy_on_partition_html( chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks + + +def test_html_heading_title_detection(): + html_text = """ +

This is a section of narrative text, it's long, flows and has meaning

+

This is a section of narrative text, it's long, flows and has meaning

+

A heading that is at the second level

+

Finally, the third heading

+

December 1-17, 2017

+

email@example.com

+

  • - bulleted item
  • + """ + elements = partition_html(text=html_text) + assert elements == [ + NarrativeText("This is a section of narrative text, it's long, flows and has meaning"), + Title("This is a section of narrative text, it's long, flows and has meaning"), + Title("A heading that is at the second level"), + Title("Finally, the third heading"), + Title("December 1-17, 2017"), + EmailAddress("email@example.com"), + ListItem("- bulleted item"), + ] diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 1fbbcbcdfa..77afae1e4a 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -389,7 +389,7 @@ def _text_to_element( links=links, emphasized_texts=emphasized_texts, ) - elif is_possible_title(text): + elif is_heading_tag(tag) or is_possible_title(text): return HTMLTitle( text, tag=tag, @@ -431,6 +431,11 @@ def is_narrative_tag(text: str, tag: str) -> bool: return tag not in HEADING_TAGS and is_possible_narrative_text(text) +def is_heading_tag(tag: str) -> bool: + """Uses tag information to infer whether text is a heading.""" + return tag in HEADING_TAGS + + def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) -> str: """Extracts text from a text tag element.""" text = "" From 1fb464235a2d4fb399244db4ff2ee7a34202b2d6 Mon Sep 17 00:00:00 2001 From: Amanda Cameron Date: Tue, 3 Oct 2023 09:40:34 -0700 Subject: [PATCH 28/31] chore: Table chunking (#1540) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change is adding to our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed the `combine_under_n_chars` to `max_characters`. It functions the same way for the combining elements under Title's, as well as specifying a chunk size (in chars) for TableChunk elements. 
*renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in followup PRs Additionally -> some lint changes snuck in when I ran `make tidy` hence the minor changes in unrelated files :) TODO: ✅ add unit tests --> note: added where I could to unit tests! Some unit tests I just clarified that the chunking strategy was now 'by_title' because we don't have a file example that has Table elements to test the 'by_num_characters' chunking strategy ✅ update changelog To manually test: ``` In [1]: filename="example-docs/example-10k.html" In [2]: from unstructured.chunking.title import chunk_table_element In [3]: from unstructured.partition.auto import partition In [4]: elements = partition(filename) # element at -2 happens to be a Table, and we'll get chunks of char size 4 here In [5]: chunks = chunk_table_element(elements[-2], 4) # examine text and text_as_html params In [6]: for c in chunks: print(c.text) print(c.metadata.text_as_html) ``` --------- Co-authored-by: Yao You --- CHANGELOG.md | 4 +- test_unstructured/chunking/test_title.py | 51 +++++-- test_unstructured/partition/csv/test_csv.py | 16 ++ test_unstructured/partition/docx/test_docx.py | 43 +++++- test_unstructured/partition/epub/test_epub.py | 21 +++ .../partition/markdown/test_md.py | 2 +- test_unstructured/partition/msg/test_msg.py | 2 +- test_unstructured/partition/odt/test_odt.py | 23 ++- .../partition/pdf-image/test_image.py | 19 +++ .../partition/pdf-image/test_pdf.py | 2 +- test_unstructured/partition/pptx/test_ppt.py | 2 +- test_unstructured/partition/pptx/test_pptx.py | 4 +- .../partition/pypandoc/test_org.py | 2 +- test_unstructured/partition/test_auto.py | 87 ++++++++++- unstructured/__version__.py | 2 +- unstructured/chunking/title.py | 142 +++++++++++------- unstructured/documents/elements.py | 12 ++ unstructured/ingest/interfaces.py | 8 +- unstructured/partition/csv.py | 2 + unstructured/partition/xlsx.py | 2 + unstructured/staging/weaviate.py
| 1 + 21 files changed, 356 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index febb223d00..0d06dc15f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev6 +## 0.10.19-dev7 ### Enhancements @@ -6,6 +6,8 @@ * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title. * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. * * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, user's can now specify the `max_characters=` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length characters. This means partitioned Table results are ready for use in downstream applications without any post processing. 
+ ### Features diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 8ccfde5af8..bc8bdcc6b0 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -31,7 +31,7 @@ def test_split_elements_by_title_and_table(): Text("It is storming outside."), CheckBox(), ] - sections = _split_elements_by_title_and_table(elements, combine_under_n_chars=0) + sections = _split_elements_by_title_and_table(elements, combine_text_under_n_chars=0) assert sections == [ [ @@ -75,7 +75,7 @@ def test_chunk_by_title(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, combine_under_n_chars=0) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -112,7 +112,7 @@ def test_chunk_by_title_respects_section_change(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, combine_under_n_chars=0) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -147,7 +147,7 @@ def test_chunk_by_title_separates_by_page_number(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, multipage_sections=False, combine_under_n_chars=0) + chunks = chunk_by_title(elements, multipage_sections=False, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -182,7 +182,7 @@ def test_chunk_by_title_groups_across_pages(): Text("It is storming outside."), CheckBox(), ] - chunks = chunk_by_title(elements, multipage_sections=True, combine_under_n_chars=0) + chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0) assert chunks == [ CompositeElement( @@ -212,24 +212,32 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage(): filename, chunking_strategy="by_title", multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + 
max_characters=400, ) partitioned_elements_multipage_true_combine_chars_0 = partition_html( filename, chunking_strategy="by_title", multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) elements = partition_html(filename) cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title( elements, multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title( elements, multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) assert ( partitioned_elements_multipage_false_combine_chars_0 @@ -244,7 +252,21 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage(): ) -def test_add_chunking_strategy_raises_error_for_invalid_n_chars(): +@pytest.mark.parametrize( + ("combine_text_under_n_chars", "new_after_n_chars", "max_characters"), + [ + (-1, -1, -1), + (0, 0, 0), + (-5666, -6777, -8999), + (-5, 40, 50), + (50, 100, 20), + ], +) +def test_add_chunking_strategy_raises_error_for_invalid_n_chars( + combine_text_under_n_chars, + new_after_n_chars, + max_characters, +): elements = [ Title("A Great Day"), Text("Today is a great day."), @@ -258,7 +280,12 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars(): CheckBox(), ] with pytest.raises(ValueError): - chunk_by_title(elements, combine_under_n_chars=1, new_after_n_chars=0) + chunk_by_title( + elements, + combine_text_under_n_chars=combine_text_under_n_chars, + new_after_n_chars=new_after_n_chars, + max_characters=max_characters, + ) def test_chunk_by_title_drops_extra_metadata(): @@ -335,7 +362,7 @@ def test_chunk_by_title_drops_extra_metadata(): ), ] - chunks = chunk_by_title(elements, combine_under_n_chars=0) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) assert str(chunks[0]) == str( 
CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."), diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/csv/test_csv.py index 050c2c2567..3f3d5e4ae0 100644 --- a/test_unstructured/partition/csv/test_csv.py +++ b/test_unstructured/partition/csv/test_csv.py @@ -8,6 +8,7 @@ EXPECTED_TEXT, EXPECTED_TEXT_WITH_EMOJI, ) +from unstructured.chunking.title import chunk_by_title from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table from unstructured.partition.csv import partition_csv @@ -189,3 +190,18 @@ def test_partition_csv_with_json(filename, expected_text, expected_table): assert elements[0].metadata.filename == test_elements[0].metadata.filename for i in range(len(elements)): assert elements[i] == test_elements[i] + + +def test_add_chunking_strategy_to_partition_csv_non_default(): + filename = "example-docs/stanley-cups.csv" + + elements = partition_csv(filename=filename) + chunk_elements = partition_csv( + filename, + chunking_strategy="by_title", + max_characters=9, + combine_text_under_n_chars=0, + ) + chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0) + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 9c82c9d471..c622c390b4 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -18,6 +18,7 @@ ListItem, NarrativeText, Table, + TableChunk, Text, Title, ) @@ -422,14 +423,6 @@ def test_partition_docx_with_json(mock_document, expected_elements, tmpdir): assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_docx(filename="example-docs/handbook-1p.docx"): - chunk_elements = partition_docx(filename, chunking_strategy="by_title") - elements = partition_docx(filename) - chunks = chunk_by_title(elements) - 
assert chunk_elements != elements - assert chunk_elements == chunks - - def test_parse_category_depth_by_style(): partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None) @@ -489,3 +482,37 @@ def test_parse_category_depth_by_style_name(): def test_parse_category_depth_by_style_ilvl(): partitioner = _DocxPartitioner(None, None, None, False, None) assert partitioner._parse_category_depth_by_style_ilvl() == 0 + + +def test_add_chunking_strategy_on_partition_docx_default_args( + filename="example-docs/handbook-1p.docx", +): + chunk_elements = partition_docx(filename, chunking_strategy="by_title") + elements = partition_docx(filename) + chunks = chunk_by_title(elements) + + assert chunk_elements != elements + assert chunk_elements == chunks + + +def test_add_chunking_strategy_on_partition_docx( + filename="example-docs/fake-doc-emphasized-text.docx", +): + chunk_elements = partition_docx( + filename, + chunking_strategy="by_title", + max_characters=9, + combine_text_under_n_chars=5, + ) + elements = partition_docx(filename) + chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5) + # remove the last element of the TableChunk list because it will be the leftover slice + # and not necessarily the max_characters len + table_chunks = [chunk for chunk in chunks if isinstance(chunk, TableChunk)][:-1] + other_chunks = [chunk for chunk in chunks if not isinstance(chunk, TableChunk)] + for table_chunk in table_chunks: + assert len(table_chunk.text) == 9 + for chunk in other_chunks: + assert len(chunk.text) >= 5 + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py index 7d0e741899..991ec1991f 100644 --- a/test_unstructured/partition/epub/test_epub.py +++ b/test_unstructured/partition/epub/test_epub.py @@ -193,3 +193,24 @@ def test_add_chunking_strategy_on_partition_epub( chunks = 
chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks + + +def test_add_chunking_strategy_on_partition_epub_non_default( + filename=os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub"), +): + elements = partition_epub(filename=filename) + chunk_elements = partition_epub( + filename, + chunking_strategy="by_title", + max_characters=5, + new_after_n_chars=5, + combine_text_under_n_chars=0, + ) + chunks = chunk_by_title( + elements, + max_characters=5, + new_after_n_chars=5, + combine_text_under_n_chars=0, + ) + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/markdown/test_md.py b/test_unstructured/partition/markdown/test_md.py index 33d131b7a3..c73247998c 100644 --- a/test_unstructured/partition/markdown/test_md.py +++ b/test_unstructured/partition/markdown/test_md.py @@ -276,7 +276,7 @@ def test_partition_md_with_json( assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_md( +def test_add_chunking_strategy_by_title_on_partition_md( filename="example-docs/README.md", ): elements = partition_md(filename=filename) diff --git a/test_unstructured/partition/msg/test_msg.py b/test_unstructured/partition/msg/test_msg.py index 6e179987a2..7678a6cda5 100644 --- a/test_unstructured/partition/msg/test_msg.py +++ b/test_unstructured/partition/msg/test_msg.py @@ -285,7 +285,7 @@ def test_partition_msg_with_pgp_encrypted_message( assert "Encrypted email detected" in caplog.text -def test_add_chunking_strategy_on_partition_msg( +def test_add_chunking_strategy_by_title_on_partition_msg( filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"), ): elements = partition_msg(filename=filename) diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/odt/test_odt.py index 9fe9b4b99d..982a11f9b4 100644 --- a/test_unstructured/partition/odt/test_odt.py +++ 
b/test_unstructured/partition/odt/test_odt.py @@ -2,7 +2,7 @@ import pathlib from unstructured.chunking.title import chunk_by_title -from unstructured.documents.elements import Table, Title +from unstructured.documents.elements import Table, TableChunk, Title from unstructured.partition.json import partition_json from unstructured.partition.odt import partition_odt from unstructured.staging.base import elements_to_json @@ -169,3 +169,24 @@ def test_add_chunking_strategy_on_partition_odt( chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks + + +def test_add_chunking_strategy_on_partition_odt_non_default(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") + elements = partition_odt(filename=filename) + chunk_elements = partition_odt( + filename, + chunking_strategy="by_title", + max_characters=7, + combine_text_under_n_chars=5, + ) + chunks = chunk_by_title( + elements, + max_characters=7, + combine_text_under_n_chars=5, + ) + for chunk in chunk_elements: + if isinstance(chunk, TableChunk): + assert len(chunk.text) <= 7 + assert chunk_elements != elements + assert chunk_elements == chunks diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py index e2c9496356..721eed64dd 100644 --- a/test_unstructured/partition/pdf-image/test_image.py +++ b/test_unstructured/partition/pdf-image/test_image.py @@ -460,6 +460,25 @@ def test_add_chunking_strategy_on_partition_image( assert chunk_elements == chunks +def test_add_chunking_strategy_on_partition_image_hi_res( + filename="example-docs/layout-parser-paper-with-table.jpg", +): + elements = image.partition_image( + filename=filename, + strategy="hi_res", + infer_table_structure=True, + ) + chunk_elements = image.partition_image( + filename, + strategy="hi_res", + infer_table_structure=True, + chunking_strategy="by_title", + ) + chunks = chunk_by_title(elements) + assert chunk_elements != elements + assert 
chunk_elements == chunks + + def test_partition_image_uses_model_name(): with mock.patch.object( pdf, diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index e14a793a2a..37af371598 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -838,7 +838,7 @@ def test_partition_pdf_with_ocr_coordinates_are_not_nan_from_file( assert point[1] is not math.nan -def test_add_chunking_strategy_on_partition_pdf( +def test_add_chunking_strategy_by_title_on_partition_pdf( filename="example-docs/layout-parser-paper-fast.pdf", ): elements = pdf.partition_pdf(filename=filename) diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/pptx/test_ppt.py index 1662002ddd..3750e0e9c6 100644 --- a/test_unstructured/partition/pptx/test_ppt.py +++ b/test_unstructured/partition/pptx/test_ppt.py @@ -174,7 +174,7 @@ def test_partition_ppt_with_json( assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_ppt( +def test_add_chunking_strategy_by_title_on_partition_ppt( filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt"), ): elements = partition_ppt(filename=filename) diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py index 3540c020e7..37e9b7ce3e 100644 --- a/test_unstructured/partition/pptx/test_pptx.py +++ b/test_unstructured/partition/pptx/test_pptx.py @@ -371,8 +371,8 @@ def test_partition_pptx_with_json(): assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_pptx(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") +def test_add_chunking_strategy_by_title_on_partition_pptx(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "science-exploration-1p.pptx") elements = partition_pptx(filename=filename) chunk_elements = partition_pptx(filename, chunking_strategy="by_title") chunks 
= chunk_by_title(elements) diff --git a/test_unstructured/partition/pypandoc/test_org.py b/test_unstructured/partition/pypandoc/test_org.py index 9017c5e86f..81ad6d4ed2 100644 --- a/test_unstructured/partition/pypandoc/test_org.py +++ b/test_unstructured/partition/pypandoc/test_org.py @@ -136,7 +136,7 @@ def test_partition_org_with_json(filename="example-docs/README.org"): assert elements[i] == test_elements[i] -def test_add_chunking_strategy_on_partition_org( +def test_add_chunking_strategy_by_title_on_partition_org( filename="example-docs/README.org", ): elements = partition_org(filename=filename) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index dcacf01ba2..a0c907aad3 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -17,6 +17,7 @@ ListItem, NarrativeText, Table, + TableChunk, Text, Title, ) @@ -937,37 +938,45 @@ def test_get_partition_with_extras_prompts_for_install_if_missing(): def test_add_chunking_strategy_on_partition_auto(): filename = "example-docs/example-10k-1p.html" - chunk_elements = partition(filename, chunking_strategy="by_title") elements = partition(filename) + chunk_elements = partition(filename, chunking_strategy="by_title") chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks -def test_add_chunking_strategy_on_partition_auto_respects_multipage(): +def test_add_chunking_strategy_title_on_partition_auto_respects_multipage(): filename = "example-docs/example-10k-1p.html" partitioned_elements_multipage_false_combine_chars_0 = partition( filename, chunking_strategy="by_title", multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) partitioned_elements_multipage_true_combine_chars_0 = partition( filename, chunking_strategy="by_title", multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + 
new_after_n_chars=300, + max_characters=400, ) elements = partition(filename) cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title( elements, multipage_sections=False, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title( elements, multipage_sections=True, - combine_under_n_chars=0, + combine_text_under_n_chars=0, + new_after_n_chars=300, + max_characters=400, ) assert ( partitioned_elements_multipage_false_combine_chars_0 @@ -980,3 +989,69 @@ def test_add_chunking_strategy_on_partition_auto_respects_multipage(): assert len(partitioned_elements_multipage_true_combine_chars_0) != len( partitioned_elements_multipage_false_combine_chars_0, ) + + +def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): + filename = "example-docs/example-10k-1p.html" + + # default chunk size in chars is 200 + partitioned_table_elements_200_chars = [ + e + for e in partition( + filename, + chunking_strategy="by_title", + max_characters=200, + combine_text_under_n_chars=5, + ) + if isinstance(e, (Table, TableChunk)) + ] + + partitioned_table_elements_5_chars = [ + e + for e in partition( + filename, + chunking_strategy="by_title", + max_characters=5, + combine_text_under_n_chars=5, + ) + if isinstance(e, (Table, TableChunk)) + ] + + elements = partition(filename) + + table_elements = [e for e in elements if isinstance(e, Table)] + + assert len(partitioned_table_elements_5_chars) != len(table_elements) + assert len(partitioned_table_elements_200_chars) != len(table_elements) + + assert len(partitioned_table_elements_5_chars[0].text) == 5 + assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5 + + # the first table element is under 200 chars so doesn't get chunked! 
+ assert table_elements[0] == partitioned_table_elements_200_chars[0] + assert len(partitioned_table_elements_200_chars[0].text) < 200 + assert len(partitioned_table_elements_200_chars[1].text) == 200 + assert len(partitioned_table_elements_200_chars[1].metadata.text_as_html) == 200 + + +def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation(): + filename = "example-docs/example-10k-1p.html" + + # default chunk size in chars is 200 + partitioned_table_elements_200_chars = [ + e + for e in partition( + filename, + chunking_strategy="by_num_characters", + ) + if isinstance(e, Table) + ] + + i = 0 + for table in partitioned_table_elements_200_chars: + # have to reset the counter to 0 here when we encounter a Table element + if isinstance(table, Table): + i = 0 + if i > 0 and isinstance(table, TableChunk): + assert table.metadata.is_continuation is True + i += 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5f8fd628c9..a4cf981717 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev6" # pragma: no cover +__version__ = "0.10.19-dev7" # pragma: no cover diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index 8fd38d62f3..0c5bde799c 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -1,6 +1,7 @@ +import copy import functools import inspect -from typing import Any, Callable, Dict, List, TypeVar +from typing import Any, Callable, Dict, List, Optional, TypeVar, Union from typing_extensions import ParamSpec @@ -9,96 +10,130 @@ Element, ElementMetadata, Table, + TableChunk, Text, Title, ) +def chunk_table_element( + element: Table, + max_characters: Optional[int] = 500, +) -> List[Union[Table, TableChunk]]: + text = element.text + html = getattr(element, "text_as_html", None) + + if len(text) <= max_characters and ( # type: ignore + html is None or len(html) <= max_characters # type: ignore + ): + return 
[element] + + chunks: List[Union[Table, TableChunk]] = [] + metadata = copy.copy(element.metadata) + is_continuation = False + + while text or html: + text_chunk, text = text[:max_characters], text[max_characters:] + table_chunk = TableChunk(text=text_chunk, metadata=copy.copy(metadata)) + + if html: + html_chunk, html = html[:max_characters], html[max_characters:] + table_chunk.metadata.text_as_html = html_chunk + + if is_continuation: + table_chunk.metadata.is_continuation = True + + chunks.append(table_chunk) + is_continuation = True + + return chunks + + def chunk_by_title( elements: List[Element], multipage_sections: bool = True, - combine_under_n_chars: int = 500, - new_after_n_chars: int = 1500, + combine_text_under_n_chars: int = 500, + new_after_n_chars: int = 500, + max_characters: int = 500, ) -> List[Element]: """Uses title elements to identify sections within the document for chunking. Splits off into a new section when a title is detected or if metadata changes, which happens when page numbers or sections change. Cuts off sections once they have exceeded - a character length of new_after_n_chars. + a character length of max_characters. Parameters ---------- elements - A list of unstructured elements. Usually the ouput of a partition functions. + A list of unstructured elements. Usually the output of a partition functions. multipage_sections If True, sections can span multiple pages. Defaults to True. - combine_under_n_chars + combine_text_under_n_chars Combines elements (for example a series of titles) until a section reaches a length of n characters. 
new_after_n_chars - Cuts off new sections once they reach a length of n characters + Cuts off new sections once they reach a length of n characters (soft max) + max_characters + Chunks table elements text and text_as_html into chunks of length n characters (hard max) + TODO: (amanda) extend to other elements """ if ( - combine_under_n_chars is not None + combine_text_under_n_chars is not None and new_after_n_chars is not None + and max_characters is not None and ( - combine_under_n_chars > new_after_n_chars - or combine_under_n_chars < 0 + combine_text_under_n_chars > new_after_n_chars + or combine_text_under_n_chars < 0 or new_after_n_chars < 0 + or max_characters <= 0 + or combine_text_under_n_chars > max_characters ) ): raise ValueError( - "Invalid values for combine_under_n_chars and/or new_after_n_chars.", + "Invalid values for combine_text_under_n_chars and/or max_characters.", ) chunked_elements: List[Element] = [] sections = _split_elements_by_title_and_table( elements, multipage_sections=multipage_sections, - combine_under_n_chars=combine_under_n_chars, + combine_text_under_n_chars=combine_text_under_n_chars, new_after_n_chars=new_after_n_chars, ) - for section in sections: if not section: continue - if not isinstance(section[0], Text) or isinstance(section[0], Table): - chunked_elements.extend(section) - elif isinstance(section[0], Text): - text = "" - metadata = section[0].metadata + first_element = section[0] - for i, element in enumerate(section): - if isinstance(element, Text): - text += "\n\n" if text else "" - start_char = len(text) - text += element.text + if not isinstance(first_element, Text): + chunked_elements.extend(section) + continue - for attr, value in vars(element.metadata).items(): - if not isinstance(value, list): - continue + elif isinstance(first_element, Table): + chunked_elements.extend(chunk_table_element(first_element, max_characters)) + continue - _value = getattr(metadata, attr, []) - if _value is None: - _value = [] + text = "" 
+ metadata = first_element.metadata + start_char = 0 + for element in section: + if isinstance(element, Text): + text += "\n\n" if text else "" + start_char = len(text) + text += element.text + for attr, value in vars(element.metadata).items(): + if isinstance(value, list): + _value = getattr(metadata, attr, []) or [] if attr == "regex_metadata": for item in value: item["start"] += start_char item["end"] += start_char - if i > 0: - # NOTE(newelh): Previously, _value was extended with value. - # This caused a memory error if the content was a list of strings - # with a large number of elements -- doubling the list size each time. - # This now instead ensures that the _value list is unique and updated. - for item in value: - if item not in _value: - _value.append(item) - - setattr(metadata, attr, _value) + _value.extend(item for item in value if item not in _value) + setattr(metadata, attr, _value) - chunked_elements.append(CompositeElement(text=text, metadata=metadata)) + chunked_elements.append(CompositeElement(text=text, metadata=metadata)) return chunked_elements @@ -106,8 +141,8 @@ def chunk_by_title( def _split_elements_by_title_and_table( elements: List[Element], multipage_sections: bool = True, - combine_under_n_chars: int = 500, - new_after_n_chars: int = 1500, + combine_text_under_n_chars: int = 500, + new_after_n_chars: int = 500, ) -> List[List[Element]]: sections: List[List[Element]] = [] section: List[Element] = [] @@ -123,11 +158,11 @@ def _split_elements_by_title_and_table( ) section_length = sum([len(str(element)) for element in section]) - new_section = (isinstance(element, Title) and section_length > combine_under_n_chars) or ( - not metadata_matches or section_length > new_after_n_chars - ) + new_section = ( + isinstance(element, Title) and section_length > combine_text_under_n_chars + ) or (not metadata_matches or section_length > new_after_n_chars) - if isinstance(element, Table) or not isinstance(element, Text): + if not isinstance(element, 
Text) or isinstance(element, Table): sections.append(section) sections.append([element]) section = [] @@ -185,7 +220,7 @@ def add_chunking_strategy() -> Callable[[Callable[_P, List[Element]]], Callable[ """Decorator for chuncking text. Uses title elements to identify sections within the document for chunking. Splits off a new section when a title is detected or if metadata changes, which happens when page numbers or sections change. Cuts off sections once they have exceeded - a character length of new_after_n_chars.""" + a character length of max_characters.""" def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: if func.__doc__ and ( @@ -199,11 +234,15 @@ def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]: + "\n\tAdditional Parameters:" + "\n\t\tmultipage_sections" + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True." - + "\n\t\tcombine_under_n_chars" + + "\n\t\tcombine_text_under_n_chars" + "\n\t\t\tCombines elements (for example a series of titles) until a section" + "\n\t\t\treaches a length of n characters." + "\n\t\tnew_after_n_chars" - + "\n\t\t\tCuts off new sections once they reach a length of n characters" + + "\n\t\t\t Cuts off new sections once they reach a length of n characters" + + "\n\t\t\t a soft max." + + "\n\t\tmax_characters" + + "\n\t\t\tChunks table elements text and text_as_html into chunks" + + "\n\t\t\tof length n characters, a hard max." 
) @functools.wraps(func) @@ -218,8 +257,9 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: elements = chunk_by_title( elements, multipage_sections=params.get("multipage_sections", True), - combine_under_n_chars=params.get("combine_under_n_chars", 500), - new_after_n_chars=params.get("new_after_n_chars", 1500), + combine_text_under_n_chars=params.get("combine_text_under_n_chars", 500), + new_after_n_chars=params.get("new_after_n_chars", 500), + max_characters=params.get("max_characters", 500), ) return elements diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index f051e1b4f6..75c15a3e36 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -185,6 +185,10 @@ class ElementMetadata: # Metadata extracted via regex regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None + # Chunking metadata fields + num_characters: Optional[int] = None + is_continuation: Optional[bool] = None + # Detection Model Class Probabilities from Unstructured-Inference Hi-Res detection_class_prob: Optional[float] = None @@ -566,6 +570,14 @@ class Table(Text): pass +class TableChunk(Table): + """An element for capturing chunks of tables.""" + + category = "Table" + + pass + + class Header(Text): """An element for capturing document headers.""" diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index c76fdfb783..caefa50afd 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -83,16 +83,16 @@ def get_embedder(self) -> BaseEmbeddingEncoder: class ChunkingConfig(BaseConfig): chunk_elements: bool = False multipage_sections: bool = True - combine_under_n_chars: int = 500 - new_after_n_chars: int = 1500 + combine_text_under_n_chars: int = 500 + max_characters: int = 1500 def chunk(self, elements: t.List[Element]) -> t.List[Element]: if self.chunk_elements: return chunk_by_title( elements=elements, 
multipage_sections=self.multipage_sections, - combine_under_n_chars=self.combine_under_n_chars, - new_after_n_chars=self.new_after_n_chars, + combine_text_under_n_chars=self.combine_text_under_n_chars, + max_characters=self.max_characters, ) else: return elements diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 6a7314de03..2528f321cd 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -4,6 +4,7 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring +from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import ( Element, ElementMetadata, @@ -21,6 +22,7 @@ @process_metadata() @add_metadata_with_filetype(FileType.CSV) +@add_chunking_strategy() def partition_csv( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 2f4538210f..ebffd6cdf9 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -4,6 +4,7 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring +from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import ( Element, ElementMetadata, @@ -21,6 +22,7 @@ @process_metadata() @add_metadata_with_filetype(FileType.XLSX) +@add_chunking_strategy() def partition_xlsx( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, diff --git a/unstructured/staging/weaviate.py b/unstructured/staging/weaviate.py index c6efc80bd4..4a4e15276c 100644 --- a/unstructured/staging/weaviate.py +++ b/unstructured/staging/weaviate.py @@ -15,6 +15,7 @@ class Properties(TypedDict): "regex_metadata", "emphasized_texts", "detection_class_prob", + "is_continuation", ) From 8821689f3659eaa98a01b22a3136996ceff27cf2 Mon Sep 17 00:00:00 2001 From: Roman Isecke 
<136338424+rbiseck3@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:31:28 -0400 Subject: [PATCH 29/31] Roman/s3 minio all cloud support (#1606) ### Description Exposes the endpoint url as an access kwarg when using the s3 filesystem library via the fsspec abstraction. This allows for any non-AWS data providers that support the s3 protocol to be used with the s3 connector (e.g. MinIO). Closes out https://github.com/Unstructured-IO/unstructured/issues/950 --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: rbiseck3 --- CHANGELOG.md | 8 ++-- .../create-and-check-minio.sh | 25 ++++++++++ .../minio-test-helpers/docker-compose.yaml | 13 ++++++ .../wiki_movie_plots_small.csv | 31 +++++++++++++ .../s3-minio/wiki_movie_plots_small.csv.json | 19 ++++++++ .../test-ingest-s3-minio.sh | 46 +++++++++++++++++++ test_unstructured_ingest/test-ingest.sh | 1 + unstructured/__version__.py | 2 +- unstructured/ingest/cli/cmds/s3.py | 9 ++++ unstructured/ingest/runner/s3.py | 6 ++- unstructured/ingest/runner/writers.py | 7 ++- 11 files changed, 161 insertions(+), 6 deletions(-) create mode 100755 scripts/minio-test-helpers/create-and-check-minio.sh create mode 100644 scripts/minio-test-helpers/docker-compose.yaml create mode 100644 scripts/minio-test-helpers/wiki_movie_plots_small.csv create mode 100644 test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json create mode 100755 test_unstructured_ingest/test-ingest-s3-minio.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d06dc15f9..23878a7c2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,18 @@ -## 0.10.19-dev7 +## 0.10.19-dev8 ### Enhancements * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res`
partitioning of pdfs and images. * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title. * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. -* * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, user's can now specify the `max_characters=` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length characters. This means partitioned Table results are ready for use in downstream applications without any post processing. - +* **Expose endpoint url for s3 connectors** By allowing for the endpoint url to be explicitly overwritten, this allows for any non-AWS data providers supporting the s3 protocol to be supported (i.e. minio). 
### Features +### Features + ### Fixes * **Fixes partition_pdf is_alnum reference bug** Problem: The `partition_pdf` when attempt to get bounding box from element experienced a reference before assignment error when the first object is not text extractable. Fix: Switched to a flag when the condition is met. Importance: Crucial to be able to partition with pdf. diff --git a/scripts/minio-test-helpers/create-and-check-minio.sh b/scripts/minio-test-helpers/create-and-check-minio.sh new file mode 100755 index 0000000000..09089a944a --- /dev/null +++ b/scripts/minio-test-helpers/create-and-check-minio.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +SCRIPT_DIR=$(dirname "$(realpath "$0")") + +secret_key=minioadmin +access_key=minioadmin +region=us-east-2 +endpoint_url=http://localhost:9000 +bucket_name=utic-dev-tech-fixtures + +function upload(){ + echo "Uploading test content to new bucket in minio" + AWS_REGION=$region AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ + aws --output json --endpoint-url $endpoint_url s3api create-bucket --bucket $bucket_name | jq + AWS_REGION=$region AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ + aws --endpoint-url $endpoint_url s3 cp "$SCRIPT_DIR"/wiki_movie_plots_small.csv s3://$bucket_name/ +} + +# Create Minio single server +docker compose version +docker compose -f "$SCRIPT_DIR"/docker-compose.yaml up --wait +docker compose -f "$SCRIPT_DIR"/docker-compose.yaml ps + +echo "Cluster is live." 
+upload diff --git a/scripts/minio-test-helpers/docker-compose.yaml b/scripts/minio-test-helpers/docker-compose.yaml new file mode 100644 index 0000000000..acc3ec9b48 --- /dev/null +++ b/scripts/minio-test-helpers/docker-compose.yaml @@ -0,0 +1,13 @@ +services: + minio: + image: quay.io/minio/minio + container_name: minio-test + ports: + - 9000:9000 + - 9001:9001 + command: server --console-address ":9001" /data + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 5s + timeout: 20s + retries: 3 diff --git a/scripts/minio-test-helpers/wiki_movie_plots_small.csv b/scripts/minio-test-helpers/wiki_movie_plots_small.csv new file mode 100644 index 0000000000..2fbb2b49bb --- /dev/null +++ b/scripts/minio-test-helpers/wiki_movie_plots_small.csv @@ -0,0 +1,31 @@ +Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot +1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]" +1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. 
In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better." +1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination. +In the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice." +1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading ""His Photographer"" and ""His Press Agent"" respectively, follow him into the shot; the photographer sets up his camera. ""Teddy"" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. ""Teddy"" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. ""Teddy"" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs." +1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. 
Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince." +1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film),"Alice follows a large white rabbit down a ""Rabbit-hole"". She finds a tiny door. When she finds a bottle labeled ""Drink me"", she does, and shrinks, but not enough to pass through the door. She then eats something labeled ""Eat me"" and grows larger. She finds a fan when enables her to shrink enough to get into the ""Garden"" and try to get a ""Dog"" to play with her. She enters the ""White Rabbit's tiny House,"" but suddenly resumes her normal size. In order to get out, she has to use the ""magic fan."" +She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. ""The Duchess's Cheshire Cat"" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's ""Mad Tea-Party."" After a while, she leaves. 
+The Queen invites Alice to join the ""ROYAL PROCESSION"": a parade of marching playing cards and others headed by the White Rabbit. When Alice ""unintentionally offends the Queen"", the latter summons the ""Executioner"". Alice ""boxes the ears"", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream." +1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film),"The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits‍—‌now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left. +Meanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water. +There is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail." 
+1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,"The film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest." +1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train_Robbery,"The opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The ""Bandit Queen,"" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The ""Bandit Queen"" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin. +The next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. 
While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the ""valuables,"" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies. +In the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the ""plunder."" The police, however, have struck the right trail and are in close pursuit. While the ""plunder"" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. 
Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the ""Bandit Queen.""" +1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film),"Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents." +1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film),"The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed." +1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. 
Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_Rockies,The film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart. +1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film),"Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration. +Film historian Charles Musser writes of Porter's adaptation, ""O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.""[1]" +1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(1907_film),"Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]" +1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_Game,"Before heading out to a baseball game at a nearby ballpark, sports fan Mr. 
Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]" +1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film,"The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets ""catches"" the laughter from her, including a vendor and police officers." +1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_of_Dollie,"On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents." +1908,The Black Viper,American,D. W. Griffith,D. W. 
Griffith,drama,https://en.wikipedia.org/wiki/The_Black_Viper,"A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house." +1908,A Calamitous Elopement,American,D.W. Griffith,"Harry Solter, Linda Arvidson",comedy,https://en.wikipedia.org/wiki/A_Calamitous_Elopement,"A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings." +1908,The Call of the Wild,American,D. W. Griffith,Charles Inslee,adventure,https://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film),"A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as ""The Biograph Girl.""" +1908,A Christmas Carol,American,Unknown,Tom Ricketts,drama,https://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film),"No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. 
His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life." +1908,The Fight for Freedom,American,D. W. Griffith,"Florence Auer, John G. Adolfi",western,https://en.wikipedia.org/wiki/The_Fight_for_Freedom,"The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town." 
diff --git a/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json b/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json new file mode 100644 index 0000000000..0a44c84aba --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json @@ -0,0 +1,19 @@ +[ + { + "type": "Table", + "element_id": "f078b58f281b4e231430e34a3ece07f3", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/wiki_movie_plots_small.csv", + "version": 103589111396252091980300895568390462924, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/wiki_movie_plots_small.csv" + } + }, + "filetype": "text/csv", + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
    1901Kansas Saloon SmashersAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Kansas_Saloon_SmashersA bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]
    1901Love by the Light of the MoonAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Love_by_the_Light_of_the_MoonThe moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better.
    1901The Martyred PresidentsAmericanUnknownunknownhttps://en.wikipedia.org/wiki/The_Martyred_PresidentsThe film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice.
    1901Terrible Teddy, the Grizzly KingAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_KingLasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.
    1902Jack and the BeanstalkAmericanGeorge S. Fleming, Edwin S. Porterunknownhttps://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film)The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince.
    1903Alice in WonderlandAmericanCecil HepworthMay Clarkunknownhttps://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream.
    1903The Great Train RobberyAmericanEdwin S. Porterwesternhttps://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film)The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits‍—‌now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail.
    1904The SuburbaniteAmericanWallace McCutcheoncomedyhttps://en.wikipedia.org/wiki/The_SuburbaniteThe film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest.
    1905The Little Train RobberyAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Little_Train_RobberyThe opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. 
After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\"
    1905The Night Before ChristmasAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film)Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents.
    1906Dream of a Rarebit FiendAmericanWallace McCutcheon and Edwin S. Portershorthttps://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film)The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed.
    1906From Leadville to Aspen: A Hold-Up in the RockiesAmericanFrancis J. Marion and Wallace McCutcheonshort action/crime westernhttps://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_RockiesThe film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart.
    1906Kathleen MavourneenAmericanEdwin S. Portershort filmhttps://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film)Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1]
    1907Daniel BooneAmericanWallace McCutcheon and Ediwin S. PorterWilliam Craven, Florence Lawrencebiographicalhttps://en.wikipedia.org/wiki/Daniel_Boone_(1907_film)Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]
    1907How Brown Saw the Baseball GameAmericanUnknownUnknowncomedyhttps://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_GameBefore heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]
    1907Laughing GasAmericanEdwin Stanton PorterBertha Regustus, Edward Bouldencomedyhttps://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_FilmThe plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers.
    1908The Adventures of DollieAmericanD. W. GriffithArthur V. Johnson, Linda Arvidsondramahttps://en.wikipedia.org/wiki/The_Adventures_of_DollieOn a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents.
    1908The Black ViperAmericanD. W. GriffithD. W. Griffithdramahttps://en.wikipedia.org/wiki/The_Black_ViperA thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house.
    1908A Calamitous ElopementAmericanD.W. GriffithHarry Solter, Linda Arvidsoncomedyhttps://en.wikipedia.org/wiki/A_Calamitous_ElopementA young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings.
    1908The Call of the WildAmericanD. W. GriffithCharles Insleeadventurehttps://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film)A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\"
    1908A Christmas CarolAmericanUnknownTom Rickettsdramahttps://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film)No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life.
    1908The Fight for FreedomAmericanD. W. GriffithFlorence Auer, John G. Adolfiwesternhttps://en.wikipedia.org/wiki/The_Fight_for_FreedomThe film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.
    " + }, + "text": "\n\n\n1901\nKansas Saloon Smashers\nAmerican\nUnknown\n\nunknown\nhttps://en.wikipedia.org/wiki/Kansas_Saloon_Smashers\nA bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]\n\n\n1901\nLove by the Light of the Moon\nAmerican\nUnknown\n\nunknown\nhttps://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon\nThe moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better.\n\n\n1901\nThe Martyred Presidents\nAmerican\nUnknown\n\nunknown\nhttps://en.wikipedia.org/wiki/The_Martyred_Presidents\nThe film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. 
Garfield, and William McKinley—each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice.\n\n\n1901\nTerrible Teddy, the Grizzly King\nAmerican\nUnknown\n\nunknown\nhttps://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King\nLasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.\n\n\n1902\nJack and the Beanstalk\nAmerican\nGeorge S. Fleming, Edwin S. Porter\n\nunknown\nhttps://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film)\nThe earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. 
He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince.\n\n\n1903\nAlice in Wonderland\nAmerican\nCecil Hepworth\nMay Clark\nunknown\nhttps://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)\nAlice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream.\n\n\n1903\nThe Great Train Robbery\nAmerican\nEdwin S. Porter\n\nwestern\nhttps://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film)\nThe film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. 
As the train stops it is boarded by the bandits‍—‌now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail.\n\n\n1904\nThe Suburbanite\nAmerican\nWallace McCutcheon\n\ncomedy\nhttps://en.wikipedia.org/wiki/The_Suburbanite\nThe film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest.\n\n\n1905\nThe Little Train Robbery\nAmerican\nEdwin Stanton Porter\n\nunknown\nhttps://en.wikipedia.org/wiki/The_Little_Train_Robbery\nThe opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. 
He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. 
Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\"\n\n\n1905\nThe Night Before Christmas\nAmerican\nEdwin Stanton Porter\n\nunknown\nhttps://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film)\nScenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. 
The children come down the stairs and enjoy their presents.\n\n\n1906\nDream of a Rarebit Fiend\nAmerican\nWallace McCutcheon and Edwin S. Porter\n\nshort\nhttps://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film)\nThe Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed.\n\n\n1906\nFrom Leadville to Aspen: A Hold-Up in the Rockies\nAmerican\nFrancis J. Marion and Wallace McCutcheon\n\nshort action/crime western\nhttps://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_Rockies\nThe film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart.\n\n\n1906\nKathleen Mavourneen\nAmerican\nEdwin S. Porter\n\nshort film\nhttps://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film)\nIrish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. 
Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1]\n\n\n1907\nDaniel Boone\nAmerican\nWallace McCutcheon and Ediwin S. Porter\nWilliam Craven, Florence Lawrence\nbiographical\nhttps://en.wikipedia.org/wiki/Daniel_Boone_(1907_film)\nBoone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]\n\n\n1907\nHow Brown Saw the Baseball Game\nAmerican\nUnknown\nUnknown\ncomedy\nhttps://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_Game\nBefore heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. 
When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]\n\n\n1907\nLaughing Gas\nAmerican\nEdwin Stanton Porter\nBertha Regustus, Edward Boulden\ncomedy\nhttps://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film\nThe plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers.\n\n\n1908\nThe Adventures of Dollie\nAmerican\nD. W. Griffith\nArthur V. Johnson, Linda Arvidson\ndrama\nhttps://en.wikipedia.org/wiki/The_Adventures_of_Dollie\nOn a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents.\n\n\n1908\nThe Black Viper\nAmerican\nD. W. Griffith\nD. W. Griffith\ndrama\nhttps://en.wikipedia.org/wiki/The_Black_Viper\nA thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. 
The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house.\n\n\n1908\nA Calamitous Elopement\nAmerican\nD.W. Griffith\nHarry Solter, Linda Arvidson\ncomedy\nhttps://en.wikipedia.org/wiki/A_Calamitous_Elopement\nA young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings.\n\n\n1908\nThe Call of the Wild\nAmerican\nD. W. Griffith\nCharles Inslee\nadventure\nhttps://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film)\nA white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\"\n\n\n1908\nA Christmas Carol\nAmerican\nUnknown\nTom Ricketts\ndrama\nhttps://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film)\nNo prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. 
Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life.\n\n\n1908\nThe Fight for Freedom\nAmerican\nD. W. Griffith\nFlorence Auer, John G. Adolfi\nwestern\nhttps://en.wikipedia.org/wiki/The_Fight_for_Freedom\nThe film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.\n\n\n" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-s3-minio.sh b/test_unstructured_ingest/test-ingest-s3-minio.sh new file mode 100755 index 0000000000..000c28e28b --- /dev/null +++ b/test_unstructured_ingest/test-ingest-s3-minio.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +set -e + + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "$SCRIPT_DIR"/.. 
|| exit 1 +OUTPUT_FOLDER_NAME=s3-minio +OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +secret_key=minioadmin +access_key=minioadmin + +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh + +function cleanup() { + # Kill the container so the script can be repeatedly run using the same ports + echo "Stopping Minio Docker container" + docker-compose -f scripts/minio-test-helpers/docker-compose.yaml down --remove-orphans -v + + cleanup_dir "$OUTPUT_DIR" +} + +trap cleanup EXIT + +# shellcheck source=/dev/null +scripts/minio-test-helpers/create-and-check-minio.sh +wait + +AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key PYTHONPATH=. ./unstructured/ingest/main.py \ + s3 \ + --num-processes "$max_processes" \ + --download-dir "$DOWNLOAD_DIR" \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_modified,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --strategy hi_res \ + --preserve-downloads \ + --reprocess \ + --output-dir "$OUTPUT_DIR" \ + --verbose \ + --remote-url s3://utic-dev-tech-fixtures/ \ + --endpoint-url http://localhost:9000 + + +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 56568b37f0..8c2dffc977 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -10,6 +10,7 @@ export OMP_THREAD_LIMIT=1 scripts=( 'test-ingest-s3.sh' +'test-ingest-s3-minio.sh' 'test-ingest-azure.sh' 'test-ingest-biomed-api.sh' 'test-ingest-biomed-path.sh' diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a4cf981717..acf12be0ae 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 
@@ -__version__ = "0.10.19-dev7" # pragma: no cover +__version__ = "0.10.19-dev8" # pragma: no cover diff --git a/unstructured/ingest/cli/cmds/s3.py b/unstructured/ingest/cli/cmds/s3.py index 88c46fdb25..34a7845f1b 100644 --- a/unstructured/ingest/cli/cmds/s3.py +++ b/unstructured/ingest/cli/cmds/s3.py @@ -1,4 +1,5 @@ import logging +import typing as t from dataclasses import dataclass import click @@ -22,6 +23,7 @@ @dataclass class S3CliConfig(BaseConfig, CliMixin): anonymous: bool = False + endpoint_url: t.Optional[str] = None @staticmethod def add_cli_options(cmd: click.Command) -> None: @@ -32,6 +34,13 @@ def add_cli_options(cmd: click.Command) -> None: default=False, help="Connect to s3 without local AWS credentials.", ), + click.Option( + ["--endpoint-url"], + type=str, + default=None, + help="Use this endpoint_url, if specified. Needed for " + "connecting to non-AWS S3 buckets.", + ), ] cmd.params.extend(options) diff --git a/unstructured/ingest/runner/s3.py b/unstructured/ingest/runner/s3.py index 292270e50a..e3646305fa 100644 --- a/unstructured/ingest/runner/s3.py +++ b/unstructured/ingest/runner/s3.py @@ -15,6 +15,7 @@ def s3( verbose: bool = False, recursive: bool = False, anonymous: bool = False, + endpoint_url: t.Optional[str] = None, writer_type: t.Optional[str] = None, writer_kwargs: t.Optional[dict] = None, **kwargs, @@ -31,11 +32,14 @@ def s3( from unstructured.ingest.connector.s3 import S3SourceConnector, SimpleS3Config + access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous} + if endpoint_url: + access_kwargs["endpoint_url"] = endpoint_url source_doc_connector = S3SourceConnector( # type: ignore connector_config=SimpleS3Config( path=remote_url, recursive=recursive, - access_kwargs={"anon": anonymous}, + access_kwargs=access_kwargs, ), read_config=read_config, partition_config=partition_config, diff --git a/unstructured/ingest/runner/writers.py b/unstructured/ingest/runner/writers.py index 46a875035e..7be5073c0f 100644 --- 
a/unstructured/ingest/runner/writers.py +++ b/unstructured/ingest/runner/writers.py @@ -9,6 +9,7 @@ def s3_writer( remote_url: str, anonymous: bool, + endpoint_url: t.Optional[str] = None, verbose: bool = False, **kwargs, ): @@ -17,11 +18,15 @@ def s3_writer( SimpleS3Config, ) + access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous} + if endpoint_url: + access_kwargs["endpoint_url"] = endpoint_url + return S3DestinationConnector( write_config=WriteConfig(), connector_config=SimpleS3Config( path=remote_url, - access_kwargs={"anon": anonymous}, + access_kwargs=access_kwargs, ), ) From 13453d63587347e6f185cb4bfd32eeb8e6fda7aa Mon Sep 17 00:00:00 2001 From: Manirevuri <113291632+Manirevuri@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:42:32 -0600 Subject: [PATCH 30/31] Fix: Documentation for Unstructured API's (#1624) Fixed "files=file_data" param for all python files --------- Co-authored-by: Austin Walker --- CHANGELOG.md | 2 +- docs/source/api.rst | 20 ++++++++++---------- unstructured/__version__.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23878a7c2f..09437d9523 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev8 +## 0.10.19-dev9 ### Enhancements diff --git a/docs/source/api.rst b/docs/source/api.rst index 7ade12ab32..3d682940d8 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -108,7 +108,7 @@ When elements are extracted from PDFs or images, it may be useful to get their b file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -155,7 +155,7 @@ You can specify the encoding to use to decode the text input. 
If no value is pro file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -204,7 +204,7 @@ You can also specify what languages to use for OCR with the ``ocr_languages`` kw file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -250,7 +250,7 @@ By default the result will be in ``json``, but it can be set to ``text/csv`` to file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -296,7 +296,7 @@ Pass the `include_page_breaks` parameter to `true` to include `PageBreak` elemen file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -345,7 +345,7 @@ On the other hand, ``hi_res`` is the better choice for PDFs that may have text w file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -398,7 +398,7 @@ To use the ``hi_res`` strategy with **Chipper** model, pass the argument for ``h file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) 
file_data['files'].close() @@ -451,7 +451,7 @@ To extract the table structure from PDF files using the ``hi_res`` strategy, ens file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -499,7 +499,7 @@ We also provide support for enabling and disabling table extraction for file typ file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() @@ -545,7 +545,7 @@ When processing XML documents, set the ``xml_keep_tags`` parameter to ``true`` t file_path = "/Path/To/File" file_data = {'files': open(file_path, 'rb')} - response = requests.post(url, headers=headers, files=files, data=data) + response = requests.post(url, headers=headers, files=file_data, data=data) file_data['files'].close() diff --git a/unstructured/__version__.py b/unstructured/__version__.py index acf12be0ae..d71d465e92 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev8" # pragma: no cover +__version__ = "0.10.19-dev9" # pragma: no cover From 19d8bff27515a6f299412828e61b311fa3f0d523 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Oct 2023 22:28:47 -0500 Subject: [PATCH 31/31] feat: change default hi_res model to yolox quantized (#1607) --- CHANGELOG.md | 5 +- .../partition/pdf-image/test_image.py | 2 +- .../partition/pdf-image/test_pdf.py | 4 +- test_unstructured/partition/test_auto.py | 3 +- ...iomedical-Data-Scientists-2-pages.pdf.json | 140 +- .../azure/IRS-form-1987.pdf.json | 1018 +++++-- .../azure/IRS-form-1987.png.json | 470 +++- .../biomed-api/65/11/main.PMC6312790.pdf.json | 1010 ++----- .../biomed-api/75/29/main.PMC6312793.pdf.json | 312 ++- 
.../07/07/sbaa031.073.PMC7234218.pdf.json | 22 +- .../layout-parser-paper.pdf.json | 752 ++++-- .../898538f2-26e1-4de7-81e6-354045d4d007.json | 28 + .../2023-Jan-economic-outlook.pdf.json | 2184 +++++---------- .../small-pdf-set/Silent-Giant-(1).pdf.json | 2378 +++-------------- .../recalibrating-risk-report.pdf.json | 790 ++---- unstructured/__version__.py | 2 +- unstructured/documents/elements.py | 3 +- unstructured/partition/pdf.py | 13 +- 18 files changed, 3587 insertions(+), 5549 deletions(-) create mode 100644 test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 09437d9523..8a1f7b1110 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.19-dev9 +## 0.10.19-dev10 ### Enhancements @@ -8,8 +8,7 @@ * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, user's can now specify the `max_characters=` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length characters. This means partitioned Table results are ready for use in downstream applications without any post processing. * **Expose endpoint url for s3 connectors** By allowing for the endpoint url to be explicitly overwritten, this allows for any non-AWS data providers supporting the s3 protocol to be supported (i.e. minio). 
- -### Features +* **change default `hi_res` model for pdf/image partition to `yolox`** Now partitioning pdf/image using `hi_res` strategy utilizes `yolox_quantized` model instead of `detectron2_onnx` model. This new default model has better recall for tables and produces more detailed categories for elements. ### Features diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py index 721eed64dd..bdbb24df8f 100644 --- a/test_unstructured/partition/pdf-image/test_image.py +++ b/test_unstructured/partition/pdf-image/test_image.py @@ -440,7 +440,7 @@ def test_partition_image_formats_languages_for_tesseract(): ocr_languages="jpn_vert", ocr_mode="entire_page", extract_tables=False, - model_name="detectron2_onnx", + model_name=pdf.default_hi_res_model(), ) diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index 37af371598..b33b1ca337 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -407,7 +407,7 @@ def test_partition_pdf_with_dpi(): ocr_languages="eng", ocr_mode="entire_page", extract_tables=False, - model_name="detectron2_onnx", + model_name=pdf.default_hi_res_model(), pdf_image_dpi=100, ) @@ -858,7 +858,7 @@ def test_partition_pdf_formats_languages_for_tesseract(): ocr_languages="eng", ocr_mode="entire_page", extract_tables=False, - model_name="detectron2_onnx", + model_name=pdf.default_hi_res_model(), ) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index a0c907aad3..e8404537c1 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -25,6 +25,7 @@ from unstructured.partition import auto from unstructured.partition.auto import _get_partition_with_extras, partition from unstructured.partition.common import convert_office_doc +from unstructured.partition.pdf import
default_hi_res_model from unstructured.staging.base import elements_to_json DIRECTORY = pathlib.Path(__file__).parent.resolve() @@ -382,7 +383,7 @@ def test_auto_partition_formats_languages_for_tesseract(): ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert", ocr_mode="entire_page", extract_tables=False, - model_name="detectron2_onnx", + model_name=default_hi_res_model(), ) diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index b37185fd27..79e18453bf 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -57,7 +57,7 @@ "text": "Lisa Federer, MLIS, Data Science Training Coordinator" }, { - "type": "Title", + "type": "NarrativeText", "element_id": "d9644fb4b85468d186b132c91ca64f31", "metadata": { "data_source": { @@ -77,7 +77,7 @@ }, { "type": "Title", - "element_id": "53d548aa01fc3eb72da15a5be7f235e2", + "element_id": "c8e51fdc53c202393adad77f7f93ee5a", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -115,7 +115,7 @@ }, { "type": "ListItem", - "element_id": "d94c6241299e6eff20ee6499cb9f64de", + "element_id": "04ff84b51fab69c07381ac794b740243", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -130,11 +130,87 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science; 2. 
Programming language expertise: biomedical data scientists should be fluent in at least one programming language (typically R and/or Python); 3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science; 4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science. 5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy." + "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;" }, { - "type": "UncategorizedText", - "element_id": "34b28172088bba51c6764df6d4e87674", + "type": "ListItem", + "element_id": "0b2857001b1a9eba5e46e26cba08e2ac", + "metadata": { + "data_source": { + "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", + "version": 167189396509615428390709838081557906335, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "date_created": "2023-03-10T09:32:44+00:00", + "date_modified": "2023-03-10T09:32:44+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "2. 
Programming language expertise: biomedical data scientists should be fluent in at least one programming language (typically R and/or Python);" + }, + { + "type": "ListItem", + "element_id": "8b02f539eb8ccee5b3fc24f66858188c", + "metadata": { + "data_source": { + "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", + "version": 167189396509615428390709838081557906335, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "date_created": "2023-03-10T09:32:44+00:00", + "date_modified": "2023-03-10T09:32:44+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;" + }, + { + "type": "ListItem", + "element_id": "469e981f34d1e6f2b420574ed8e932d2", + "metadata": { + "data_source": { + "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", + "version": 167189396509615428390709838081557906335, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "date_created": "2023-03-10T09:32:44+00:00", + "date_modified": "2023-03-10T09:32:44+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science." 
+ }, + { + "type": "ListItem", + "element_id": "4b8fc76cbba0e2fef79ff8bc668b1401", + "metadata": { + "data_source": { + "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", + "version": 167189396509615428390709838081557906335, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "date_created": "2023-03-10T09:32:44+00:00", + "date_modified": "2023-03-10T09:32:44+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy." + }, + { + "type": "NarrativeText", + "element_id": "69da7754428f154ee3b2906214d31ad9", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -153,7 +229,7 @@ }, { "type": "Title", - "element_id": "89b1f4c3df983454e25b233320781610", + "element_id": "37486ef32cbf05082d5dbff0581db762", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -209,7 +285,7 @@ "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________" }, { - "type": "Title", + "type": "NarrativeText", "element_id": "edd5f2f5a60a83c8899e533ac8bcd03c", "metadata": { "data_source": { @@ -247,7 +323,7 @@ "text": "Methodology" }, { - "type": "Title", + "type": "NarrativeText", "element_id": "987542acede56f098db655f02fb814a7", "metadata": { "data_source": { @@ -267,7 +343,7 @@ }, { "type": "ListItem", - "element_id": "fdd38e2d80cc964e9bf3c7e09a760e21", + "element_id": "2e3cec7bff1e8c8d8e0087f0bcfa89f0", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ 
-282,10 +358,29 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "a) Responses to a 2017 Kaggle' survey’ of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use. b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A. c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (83.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The" + "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use." 
}, { - "type": "NarrativeText", + "type": "ListItem", + "element_id": "c6865d507571ccb14d37791134f27f61", + "metadata": { + "data_source": { + "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", + "version": 167189396509615428390709838081557906335, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "date_created": "2023-03-10T09:32:44+00:00", + "date_modified": "2023-03-10T09:32:44+00:00" + }, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A." + }, + { + "type": "ListItem", "element_id": "3f14cc0782485365bad0539f7b1bbb22", "metadata": { "data_source": { @@ -324,7 +419,7 @@ }, { "type": "NarrativeText", - "element_id": "8e6dc8d9bc74e032451cc1a6a0da4d10", + "element_id": "f39ddfa6365e505947527153b0ea60d8", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -341,6 +436,25 @@ }, "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. 
A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017" }, + { + "type": "Footer", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "metadata": { + "data_source": { + "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", + "version": 167189396509615428390709838081557906335, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "date_created": "2023-03-10T09:32:44+00:00", + "date_modified": "2023-03-10T09:32:44+00:00" + }, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "2" + }, { "type": "UncategorizedText", "element_id": "d4735e3a265e16eee03f59718b9b5d03", diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json index 9a30d93103..1aec242c3e 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Title", - "element_id": "720a6f5640af3333283ae0a2b6ef5d4d", + "element_id": "8b115710b659086909de658b116dd719", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -16,11 +16,600 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "a Department of the Treasury Internal Revenue Service" + "text": "a Department of the Treasury Internal Revenue Service Instructions for Form 3115 (Rev. 
November 1987) Application for Change in Accounting Method" + }, + { + "type": "NarrativeText", + "element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "(Section references are to the Internal Revenue Code unless otherwise noted.)" + }, + { + "type": "Title", + "element_id": "61ed58fa51293f429f87e8cf1896c9e4", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Paperwork Reduction Act Notice" + }, + { + "type": "NarrativeText", + "element_id": "b00492d57199616b7b5459cdf57a58d2", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws and to allow us to figure and collect the right amount of tax. You are required to us this information." 
+ }, + { + "type": "NarrativeText", + "element_id": "5d18f0234e23bc96198c9fb19601056a", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "We ask for this information to carry out the" + }, + { + "type": "NarrativeText", + "element_id": "0895a532e404a5c9ea96eac7982d268f", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "give" + }, + { + "type": "Title", + "element_id": "a1547a4ed1611eee44b15e99120fb978", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "General Instructions" + }, + { + "type": "Title", + "element_id": "68a3289177b49b285e133a5267eb355f", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", 
+ "page_number": 1 + }, + "text": "Purpose of Form" + }, + { + "type": "NarrativeText", + "element_id": "fdb8017fc73bdc12f7200dece8b76c99", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "File this form to request a change in your accounting method, including the accounting treatment of any item. If you are requesting a change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods." + }, + { + "type": "NarrativeText", + "element_id": "7e3ae97a65f12ef0bb8b4d6b5f721f54", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "filing taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current revision date of Form 3115)," + }, + { + "type": "Title", + "element_id": "cf9c7aa24a26aac4f0ec4b6395cbfdcc", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 
+ }, + "text": "When" + }, + { + "type": "UncategorizedText", + "element_id": "2127f2ab4fc4feb4d32460c8317bf02f", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Form 3115," + }, + { + "type": "UncategorizedText", + "element_id": "e53657178cb6855ac4b2029197a64b0c", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "A." + }, + { + "type": "NarrativeText", + "element_id": "faf2673a7d6b6f7c5bf7cae6770a4130", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Generally, applicants must complete Section In addition, complete the appropriate sections (B-1 through H) for which a change Is desired." 
+ }, + { + "type": "NarrativeText", + "element_id": "bf2a070cb9d03d056e70b26bebf1ef79", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "You must give all relevant facts, including a detailed description of your present and proposed methods. You must also state the reason(s) you believe approval to make the requested change should be granted. Attach additional pages if more space is needed for explanations. Each page should show your name, address, and identifying number." + }, + { + "type": "NarrativeText", + "element_id": "10626f80b0f7b25e661f8f82f5d7c454", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "State whether you desire a conference in National Office if the Service proposes to disapprove your application." 
+ }, + { + "type": "Title", + "element_id": "b9776d7ddf459c9ad5b0e1d6ac61e27b", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "the" + }, + { + "type": "Title", + "element_id": "242a9dba10a04654d4adef9c58ff96f6", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" + }, + { + "type": "NarrativeText", + "element_id": "582deac2def308ecc5250773e1683052", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for 
taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required." + }, + { + "type": "NarrativeText", + "element_id": "550f9e99054c657264fb9bb26d3023de", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method—Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information." 
+ }, + { + "type": "NarrativeText", + "element_id": "c7c37f80c11190ab9416495a0d9b7c6e", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "you change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed." + }, + { + "type": "Title", + "element_id": "093856d810a56c1557ce2b24c65abf3d", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Long-term contracts. 
—If" + }, + { + "type": "NarrativeText", + "element_id": "4a1ba7ce20dde03bf464633002f14b10", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "are required to" + }, + { + "type": "NarrativeText", + "element_id": "6272a6df76820c927d081a1041e3c079", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). 
Do not file Form 3115 for these" + }, + { + "type": "Title", + "element_id": "d3eda7d7ed44b4b43fcbfa6f83f6fad3", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "changes." + }, + { + "type": "Title", + "element_id": "5756fb398995bb6518a87637f24f426e", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Time and Place for Filing" + }, + { + "type": "NarrativeText", + "element_id": "af8bdf713f162b09567c8d1a3a2d4de7", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change." 
+ }, + { + "type": "NarrativeText", + "element_id": "9dda11db48254f5e0d0000afb5d1dd9b", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Taxpayers, other than exempt organizations, should file Form 3115 with the Commissioner of Internal Revenue, Attention: CC:C:4, 1111 Constitution Avenue, NW, Washington, DC 20224, Exempt organizations should file with the Assistant Commissioner (Employee Plans and Exempt Organizations), 1111 Constitution Avenue, NW, Washington, DC 20224." + }, + { + "type": "NarrativeText", + "element_id": "4d063cdbd131401fa29e1d0e824dc017", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "You should normally receive an acknowledgment of receipt of your application within 30 days. If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224." 
+ }, + { + "type": "Title", + "element_id": "ea325d761f98c6b73320e442b67f2a35", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "an" + }, + { + "type": "NarrativeText", + "element_id": "c56ebb2883fe0c95b8564fa3969f7010", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "See section 5.03 of Rev. Proc. 84-74 for filing early application." + }, + { + "type": "NarrativeText", + "element_id": "12f877f0bd47f9b761ed7e74be1afacd", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Note: /f this form is being filed in accordance with Rev. Proc. 74-11, see Section G below." }, { "type": "Title", - "element_id": "88591a76b54e47215c0827ae8838ec13", + "element_id": "a4316c02df07840f1beb56609cb09735", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -35,11 +624,30 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Instructions for Form 3115 (Rev. 
November 1987)" + "text": "Late Applications" }, { "type": "NarrativeText", - "element_id": "4a17cc01a68e2bf011ba1458d70f369a", + "element_id": "02dd043b5686a46b2f03cfe8cf56aae9", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 79-63." + }, + { + "type": "Title", + "element_id": "025a65465b6fd9635316e92633b24c7e", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -54,11 +662,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Application for Change in Accounting Method" + "text": "Identifying Number" }, { "type": "NarrativeText", - "element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9", + "element_id": "8605ee209656c311cec7ce4b001caab2", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -73,11 +681,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "(Section references are to the Internal Revenue Code unless otherwise noted.)" + "text": "Individuals.—An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both." 
}, { "type": "Title", - "element_id": "61ed58fa51293f429f87e8cf1896c9e4", + "element_id": "ea325d761f98c6b73320e442b67f2a35", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -92,11 +700,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Paperwork Reduction Act Notice" + "text": "an" }, { "type": "NarrativeText", - "element_id": "828767cbc922e731b59894afba55fe10", + "element_id": "7d82c5876c5c1a3596338ae8cfbd1a50", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -111,11 +719,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws and to allow us to figure and collect the right amount of tax. You are required to give us this information." + "text": "Others.-—The employer identification number applicant other than an individual should be entered in this block." 
}, { "type": "Title", - "element_id": "a1547a4ed1611eee44b15e99120fb978", + "element_id": "28391d3bc64ec15cbb090426b04aa6b7", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -130,11 +738,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "General Instructions" + "text": "of" }, { "type": "Title", - "element_id": "68a3289177b49b285e133a5267eb355f", + "element_id": "f1a73e2204a114077f988c9da98d7f8b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -149,11 +757,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Purpose of Form" + "text": "Signature" }, { "type": "NarrativeText", - "element_id": "2ef3cbc8d359155433a0028e73251f95", + "element_id": "dc1531183c8e3f45a78f110ec1efe15f", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -168,11 +776,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "File this form to request a change in your accounting method, including the accounting treatment of any item. If you are requesting a change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods. When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current revision date of Form 3115)," + "text": "Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign." 
}, { "type": "NarrativeText", - "element_id": "84e7e32f584e2ee9f47ba593bf86c559", + "element_id": "7d3a67d75914a504a52ec53998b796af", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -187,11 +795,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Generally, applicants must complete Section A. In addition, complete the appropriate sections (B-1 through H) for which a change Is desired." + "text": "Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.”" }, { "type": "NarrativeText", - "element_id": "ed7dba38aff5b289c7b6c8a58e800279", + "element_id": "9de285e8e3b042aa9ac86edde98a21a9", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -206,11 +814,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "You must give all relevant facts, including a detailed description of your present and proposed methods. You must also state the reason(s) you believe approval to make the requested change should be granted. Attach additional pages if more space is needed for explanations. Each page should show your name, address, and identifying number. State whether you desire a conference in the National Office if the Service proposes to disapprove your application." + "text": "Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation." 
}, { "type": "Title", - "element_id": "242a9dba10a04654d4adef9c58ff96f6", + "element_id": "f5ea55c27511707a88f8efadcdf50b55", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -225,11 +833,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" + "text": "Fiduciaries.—The-form" }, { "type": "NarrativeText", - "element_id": "0b320308ba52d4a9625d29cadfc941a9", + "element_id": "ca02af326f3caed052e30728481fc4fe", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -244,11 +852,30 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required. Disregard the instructions under Time and" + "text": "should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title." 
}, { "type": "NarrativeText", - "element_id": "eb076cfd3d47e546c28611750afedc49", + "element_id": "52e2b8e4b8527ae448e9db2dfd0c43c7", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6." + }, + { + "type": "Title", + "element_id": "ca978112ca1bbdcafac231b39a23dc4d", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -263,11 +890,30 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and" + "text": "a" }, { "type": "NarrativeText", - "element_id": "ee134711b01cac75692565ae4f785fd4", + "element_id": "12a24aabbcef2cabc07babe12d9c82c5", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "If the individual or firm is also authorized to represent the applicant before the IRS, receive copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)." 
+ }, + { + "type": "Title", + "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -282,11 +928,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method—Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information." + "text": "Affiliated Groups" }, { - "type": "ListItem", - "element_id": "7b7c33680de5c4a7cb165c103752579e", + "type": "NarrativeText", + "element_id": "58e977f2200b46ac8b372586dfd781bf", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -301,11 +947,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." 
+ "text": "Taxpayers that are members of an affiliated group filing a consolidated return that seeks to change to the same accounting method for more than one member of the group must file a separate Form 3115 for each such member," }, { "type": "Title", - "element_id": "5756fb398995bb6518a87637f24f426e", + "element_id": "8b838d95f7d4f66b5453307de1353ff4", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -320,11 +966,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Time and Place for Filing" + "text": "Specific Instructions" }, { "type": "Title", - "element_id": "af8bdf713f162b09567c8d1a3a2d4de7", + "element_id": "bc272940e494acf9441070d3eb4b79f6", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -339,11 +985,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change." + "text": "Section A" }, { "type": "NarrativeText", - "element_id": "2aebd5bbfbc983d52ed7aee8eb7bc7cc", + "element_id": "b57b7502430c59194bb865cfa1bcfab5", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -358,11 +1004,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Taxpayers, other than exempt organizations, should file Form 3115 with the Commissioner of Internal Revenue, Attention: CC:C:4, 1111 Constitution Avenue, NW, Washington, DC 20224, Exempt organizations should file with the Assistant Commissioner (Employee Plans and Exempt Organizations), 1111 Constitution Avenue, NW, Washington, DC 20224. You should normally receive an acknowledgment of receipt of your application within 30 days. 
If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224." + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a)." }, { "type": "NarrativeText", - "element_id": "0ec978b05caa71414e2f4429b1d18f09", + "element_id": "9eefeb9556d95a8dd563ff3270cae7f4", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -377,11 +1023,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "See section 5.03 of Rev. Proc. 84-74 for filing an early application." + "text": "Item 6, page 2.—The term “gross receipts” includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority." }, { - "type": "Title", - "element_id": "12f877f0bd47f9b761ed7e74be1afacd", + "type": "NarrativeText", + "element_id": "3e63f740940cd3ab94c17d2bbf48b13a", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -396,11 +1042,49 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Note: /f this form is being filed in accordance with Rev. Proc. 74-11, see Section G below." 
+ "text": "Item 7b, page 2.—If item 7b 1s “Yes,” indicate on a separate sheet the following for each separate trade or business: Nature of business" + }, + { + "type": "NarrativeText", + "element_id": "3db206c935841c3dcd5b3a1d41e56b84", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "(manufacturing, retailer, wholesaler, etc.), employer identification number, overall method of accounting, and whether, in the last 6 years, that business has changed its accounting method, or is also changing its accounting method as part of this request or as a separate request." + }, + { + "type": "NarrativeText", + "element_id": "48ddf405e03a362566cdbc32cc5cd11c", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.pdf", + "version": 307846589923949318200712033143133817358, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.pdf" + }, + "date_created": "2023-03-10T09:36:30+00:00", + "date_modified": "2023-03-10T09:36:30+00:00" + }, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Item 11, page 2.—If you cannot provide the requested information, you may sign a statement under penalties of perjury that:" }, { "type": "Title", - "element_id": "a4316c02df07840f1beb56609cb09735", + "element_id": "28391d3bc64ec15cbb090426b04aa6b7", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -413,13 +1097,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Late Applications" + "text": "of" }, { "type": "NarrativeText", - "element_id": 
"02dd043b5686a46b2f03cfe8cf56aae9", + "element_id": "81f087b1fcf4c9870324336c6bc0de78", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -432,13 +1116,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 79-63." + "text": "(1) Gives your best estimate of the percentage the section 481(a) adjustment that would have been required if the requested change had been made for each of the 3 preceding years; and" }, { "type": "Title", - "element_id": "025a65465b6fd9635316e92633b24c7e", + "element_id": "b9776d7ddf459c9ad5b0e1d6ac61e27b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -451,13 +1135,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Identifying Number" + "text": "the" }, { "type": "NarrativeText", - "element_id": "8605ee209656c311cec7ce4b001caab2", + "element_id": "cde0777402fde810d0fb24b15df92b2b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -470,13 +1154,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Individuals.—An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both." + "text": "(2) Explains in detail why you cannot provide requested information." 
}, { "type": "NarrativeText", - "element_id": "742730130f9c14403ad272eec208a456", + "element_id": "c855d896f610600602f04d9e31253c91", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -489,13 +1173,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block." + "text": "See section 5.06(2) of Rev. Proc. 84-74 for required perjury statement that must be attached." }, { "type": "Title", - "element_id": "f1a73e2204a114077f988c9da98d7f8b", + "element_id": "b9776d7ddf459c9ad5b0e1d6ac61e27b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -508,13 +1192,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Signature" + "text": "the" }, { - "type": "ListItem", - "element_id": "ede9004eceddf828c2c928f62d0687a0", + "type": "Title", + "element_id": "b9776d7ddf459c9ad5b0e1d6ac61e27b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -527,13 +1211,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Signature Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. 
Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6. If the individual or firm is also authorized to" + "text": "the" }, { - "type": "Title", - "element_id": "1df7107903f249d938fbf3710f50283a", + "type": "NarrativeText", + "element_id": "1734a701c8a3139ddcb5b857f697318f", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -546,13 +1230,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "If the individual or firm is also authorized to represent the applicant before the IRS, receive a copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)." + "text": "If IRS later examines your return for the year change or for later years, it has the right to verify your statement at that time." 
}, { "type": "Title", - "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283", + "element_id": "28391d3bc64ec15cbb090426b04aa6b7", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -565,13 +1249,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Affiliated Groups" + "text": "of" }, { "type": "NarrativeText", - "element_id": "58e977f2200b46ac8b372586dfd781bf", + "element_id": "751abc8c6a0fa412c3e8c18345f57f95", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -584,13 +1268,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Taxpayers that are members of an affiliated group filing a consolidated return that seeks to change to the same accounting method for more than one member of the group must file a separate Form 3115 for each such member," + "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable." 
}, { "type": "Title", - "element_id": "58703de56debc34a1d68e6ed6f8fd067", + "element_id": "136a59b0c53731bc299206fda46e0888", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -603,13 +1287,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Specific Instructions Section A" + "text": "Section B-1" }, { "type": "NarrativeText", - "element_id": "33b0dd2cec2ea60810343af08d53ded2", + "element_id": "e4a695ea83818204438fe08add6d1554", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -622,13 +1306,13 @@ "date_modified": "2023-03-10T09:36:30+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a). Item 6, page 2.—The term “gross receipts” includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority. 
Item 7b, page 2.—If item 7b 1s “Yes,” indicate on a separate sheet the following for each separate trade or business: Nature of business" + "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application." }, { - "type": "NarrativeText", - "element_id": "c51052c424ee3b8b5a219015f66d4846", + "type": "Title", + "element_id": "f63f53aab435b8c9789ab7d6b982db3f", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -643,11 +1327,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(manufacturing, retailer, wholesaler, etc.), employer identification number, overall method of accounting, and whether, in the last 6 years, that business has changed its accounting method, or is also changing its accounting method as part of this request or as a separate request. Item 11, page 2.—If you cannot provide the requested information, you may sign a statement under penalties of perjury that:" + "text": "Sections B-2 and B-3" }, { "type": "NarrativeText", - "element_id": "1bbe995811e9fd4c3ce1b218cb641f4e", + "element_id": "eac562ca19f6198691856c695e2790bd", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -662,11 +1346,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(1) Gives your best estimate of the percentage of the section 481(a) adjustment that would have been required if the requested change had been made for each of the 3 preceding years; and" + "text": "Limitation on the Use of the Cash Method of Accounting. 
—Except as provided below, C corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities." }, { "type": "NarrativeText", - "element_id": "f7872ac379aa024934461d08fa31ebd9", + "element_id": "e5bed7fe04dd22cabe5e5c0362d37743", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -681,11 +1365,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(2) Explains in detail why you cannot provide the requested information." + "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—" }, { "type": "NarrativeText", - "element_id": "2de8f0b5003bcb8c12a4dc59c8e1f740", + "element_id": "69bd87b2ad5873c030748e62adf61b89", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -700,11 +1384,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "See section 5.06(2) of Rev. Proc. 84-74 for the required perjury statement that must be attached." + "text": "(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method." 
}, { "type": "NarrativeText", - "element_id": "678ecc0340dc8848f891bf12a555a3fd", + "element_id": "44902073e7cc4fa753f25d40e009dcef", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -719,11 +1403,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "If IRS later examines your return for the year of the change or for later years, it has the right to verify your statement at that time." + "text": "substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death." }, { "type": "NarrativeText", - "element_id": "751abc8c6a0fa412c3e8c18345f57f95", + "element_id": "b68a5b5b0d59122e0df42a96d68d2b5e", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -738,11 +1422,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable." + "text": "(3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period." 
}, { "type": "NarrativeText", - "element_id": "64758ada28beed36481b14ce8dc67472", + "element_id": "a50ed92585ec98497171f56bc829c16a", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -757,7 +1441,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "substantially all of the stock of which is owned by employees performing the services, retired employees who had performed the services, any estate of any individual who had performed the services listed above, or any person who acquired stock of the corporation as a result of the death of an employee or retiree described above if the acquisition occurred within 2 years of death. (3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period. For more information, see section 448 and Temporary Regulations section 1.448-1T." + "text": "For more information, see section 448 and Temporary Regulations section 1.448-1T." 
}, { "type": "Title", @@ -779,8 +1463,8 @@ "text": "Section C" }, { - "type": "Title", - "element_id": "8d6743276d5bc8e32d0b05ba0b232db8", + "type": "NarrativeText", + "element_id": "a9e8c96063f3fea7ea05eb3cd41ebe7a", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -795,11 +1479,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section E" + "text": "Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change." }, { - "type": "ListItem", - "element_id": "86fab9f7b35d56a2d48baf0782b7c53d", + "type": "NarrativeText", + "element_id": "7e90b155b5cdb2481b1dfbb1118142c5", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -814,11 +1498,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete." + "text": "Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8." 
}, { - "type": "ListItem", - "element_id": "84cea2af17bb3760234b42f4ea78e175", + "type": "Title", + "element_id": "1e3abf61a37e3cad36b11b459b1cc39e", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -833,11 +1517,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460. Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts." + "text": "If" }, { - "type": "Title", - "element_id": "136a59b0c53731bc299206fda46e0888", + "type": "NarrativeText", + "element_id": "bbd0f86d34b7622cfff546da0c15584d", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -852,11 +1536,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section B-1" + "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. 
you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:" }, { "type": "NarrativeText", - "element_id": "e4a695ea83818204438fe08add6d1554", + "element_id": "347f638641329c72c971a522ec07f6b1", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -871,11 +1555,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application." + "text": "(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market)." }, { - "type": "Title", - "element_id": "f63f53aab435b8c9789ab7d6b982db3f", + "type": "NarrativeText", + "element_id": "aca21cfeadca7d527dd36f01005ff44a", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -890,11 +1574,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Sections B-2 and B-3" + "text": "(2) proposed and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726." 
}, { "type": "Title", - "element_id": "4688916bf1d6b205af02a0e954156688", + "element_id": "e850deb3f1e65c13e7cd728279a472bf", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -909,11 +1593,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Limitation on the Use of the Cash Method of Accounting. —Except as provided below, C" + "text": "State whether the" }, { - "type": "NarrativeText", - "element_id": "aaf93c2be8f4f2db87bd760783fedfa5", + "type": "Title", + "element_id": "fd3dfa76050e048e229d35a01da6974a", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -928,11 +1612,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities." 
+ "text": "identification" }, { - "type": "NarrativeText", - "element_id": "e5bed7fe04dd22cabe5e5c0362d37743", + "type": "Title", + "element_id": "a7e2d26e8d15814dd9c6a1bdc90585c8", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -947,11 +1631,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—" + "text": "by" }, { - "type": "ListItem", - "element_id": "69bd87b2ad5873c030748e62adf61b89", + "type": "NarrativeText", + "element_id": "4a9430201a20b0868ab81c8c9e71b881", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -966,11 +1650,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method." + "text": "(3) The termination event statement required section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event." }, { - "type": "NarrativeText", - "element_id": "6d2d2cfa00e5a8caec71ba799f60f8c6", + "type": "Title", + "element_id": "92e21a61e1d872dbbe3e3221a920b409", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -985,11 +1669,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Applicants must give complete details about the present method of valuing inventory and the proposed method. State whether all or part of your inventory ts involved in the change. Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. 
The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8." + "text": "Section D" }, { - "type": "NarrativeText", - "element_id": "357d52f500b965abc29ea60039de4fd8", + "type": "Title", + "element_id": "8d6743276d5bc8e32d0b05ba0b232db8", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1004,11 +1688,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:" + "text": "Section E" }, { "type": "NarrativeText", - "element_id": "1ac3e7aa5a6139bd80f05a7ac1f63ddf", + "element_id": "86fab9f7b35d56a2d48baf0782b7c53d", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1023,11 +1707,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(1) The specific types and classes of goods in the LIFO inventories involved in the proposed changes and the comparative value of such Inventories as of the end of the tax year preceding the year of change determined by: (a) the LIFO method, and (b) the proposed method and basis (such as FIFO cost or lower of cost or market). (2) State whether the proposed identification and valuation methods conform to the inventory method currently used with respect to non-LIFO Inventories, if any, or how such method is otherwise consistent with Regulations section 1.4726." + "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. 
However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete." }, { "type": "NarrativeText", - "element_id": "6028c579dc843bb5aa2c704f46085914", + "element_id": "825f9197a40400f76d2a527e8d7a2c71", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1042,11 +1726,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(3) The termination event statement required by section 5.10 of Rev. Proc. 84-74 and an explanation if there has been a termination event." + "text": "All long-term contracts entered into after February 28, 1986, except for real property construction contracts expected to be completed within 2 years by contractors whose average annual gross receipts for the 3 prior tax years do not exceed $10,000,000, must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section 460." }, { - "type": "Title", - "element_id": "92e21a61e1d872dbbe3e3221a920b409", + "type": "NarrativeText", + "element_id": "dcf589bb37d079ecce4b375abc332606", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1061,7 +1745,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section D" + "text": "Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts." 
}, { "type": "Title", @@ -1084,7 +1768,7 @@ }, { "type": "NarrativeText", - "element_id": "fa41a857716f30d6bbee384eada72a90", + "element_id": "cf5e2bc86b7c77533924eb940fd522d5", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1099,11 +1783,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "This section Is to be used only to request a change in a method of accounting for depreciation under section 167. Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change. Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually)." + "text": "This section Is to be used only to request a change in a method of accounting for depreciation under section 167." }, { - "type": "Title", - "element_id": "a8155ab3bed92cc259ab58331619e0e1", + "type": "NarrativeText", + "element_id": "b8355dc568ea042f9da586188b404bca", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1118,11 +1802,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Section H" + "text": "Rev. Proc. 74-11 provides a procedure whereby applicants are considered to have obtained the consent of the Commissioner to change their method of accounting for depreciation. 
You must file Form 3115 with the Service Center where your return will be filed within the first 180 days of the tax year in which it is desired to make the change. Attach a copy of the form to the income tax return for the tax year of the change." }, { "type": "NarrativeText", - "element_id": "cb1f664a186a87f6560cde136d70b558", + "element_id": "319882ba6726e29222f5522c53887960", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1137,11 +1821,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested." + "text": "Note: Do not use Form 3115 to make an election under section 168. Such an election may be made only on the tax return for the year in which the property 1s placed in service. In addition, Form 3115 is not to be used to request approval to revoke an election made under section 168. Such a request must be made in accordance with Rev. Proc. 87-1 (updated annually)." }, { - "type": "NarrativeText", - "element_id": "86d11953bb813a770ecd242ff97d4e43", + "type": "Title", + "element_id": "a8155ab3bed92cc259ab58331619e0e1", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1156,11 +1840,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10." 
+ "text": "Section H" }, { "type": "NarrativeText", - "element_id": "0607edfa2419dd0cdc80f457872fe238", + "element_id": "cb1f664a186a87f6560cde136d70b558", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1175,11 +1859,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "(2) Qualified personal service corporations. — A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law," + "text": "Generally, this section should be used for requesting changes In a method of accounting for which provision has not been made elsewhere on this form. Attach additional pages if more space ts needed for a full explanation of the present method used and the proposed change requested." }, { "type": "NarrativeText", - "element_id": "50d16fd6b40a428c3befaf6dd19c2dcd", + "element_id": "86d11953bb813a770ecd242ff97d4e43", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1194,11 +1878,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)" + "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10." }, { "type": "NarrativeText", - "element_id": "a8e72799229bc2d754f44ea167a6e7d6", + "element_id": "df67e4b3a4a1352209c2648b87d675e2", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1213,11 +1897,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Applicants requesting to change their method of valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods." + "text": "(2) Qualified personal service corporations. 
— A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law, engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)" }, { - "type": "UncategorizedText", - "element_id": "c0a5f5aa4012d18970939d7bb8299e38", + "type": "NarrativeText", + "element_id": "3167823c1d2039b4c48efe2f6c89b5c2", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1232,11 +1916,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "% U.S." + "text": "Applicants requesting change valuing property produced, property acquired for resale, or long-term contracts under section 263A or 460 MUST complete section D showing the treatment under both the present and proposed methods." }, { "type": "Title", - "element_id": "c71e90d2f497062ba8d068af0bed2a3d", + "element_id": "663ea1bfffe5038f3f0cf667f14c4257", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1251,11 +1935,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Government" + "text": "to" }, { "type": "Title", - "element_id": "c0f169737344e28e87eb123df627ba6a", + "element_id": "7574058dd32c12eb33bc649b5e36bdcb", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1270,11 +1954,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Printing" + "text": "their method of" }, { - "type": "Title", - "element_id": "749720aad1daf3c5dfeda1d87555ff87", + "type": "UncategorizedText", + "element_id": "bbf3f11cb5b43e700273a78d12de55e4", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1289,11 +1973,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Office:" + "text": "%" }, { - "type": "UncategorizedText", - "element_id": "de444aa0e8db0c05d86ad56e28d5fb26", + "type": "NarrativeText", + "element_id": 
"4bde94dc330268d2f63a09423409c6d4", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", @@ -1308,7 +1992,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "1987—201-993/60166" + "text": "U.S. Government Printing Office: 1987—201-993/60166" }, { "type": "Title", diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index 2034680177..6874f8d561 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -1,7 +1,7 @@ [ { "type": "Title", - "element_id": "9e4a454d91ac1f220324c6d1a0377093", + "element_id": "92405c82f76df8b2cbbc6047bd10e0ff", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -16,11 +16,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "rh Department of the Treasury Internal Revenue Service" + "text": "rh Department of the Treasury Internal Revenue Service Instructions for Form 3115 (Rev. November 1987) Application for Change in Accoun ig Method" }, { - "type": "Title", - "element_id": "88591a76b54e47215c0827ae8838ec13", + "type": "NarrativeText", + "element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -35,11 +35,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Instructions for Form 3115 (Rev. 
November 1987)" + "text": "(Section references are to the Internal Revenue Code unless otherwise noted.)" }, { - "type": "Title", - "element_id": "f91d5fcc0fb964060b132e98f23cf182", + "type": "UncategorizedText", + "element_id": "e16bce609163ec96985ae522ca81502a", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -54,11 +54,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Application for Change in Accoun ig Method" + "text": "‘A." }, { "type": "NarrativeText", - "element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9", + "element_id": "c9bc33e913a25aaffa8367aa11bc8ed9", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -73,7 +73,7 @@ "filetype": "image/png", "page_number": 1 }, - "text": "(Section references are to the Internal Revenue Code unless otherwise noted.)" + "text": "Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the nght amount of tax. You are required to this information." 
}, { "type": "Title", @@ -96,7 +96,26 @@ }, { "type": "NarrativeText", - "element_id": "4660422c06dddc914ab634c5e4045dec", + "element_id": "5d18f0234e23bc96198c9fb19601056a", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "We ask for this information to carry out the" + }, + { + "type": "NarrativeText", + "element_id": "84ab8a2c9ef5f989df144a0ca4576c45", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -111,7 +130,7 @@ "filetype": "image/png", "page_number": 1 }, - "text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the nght amount of tax. You are required to give us this information." + "text": "give us" }, { "type": "Title", @@ -172,7 +191,45 @@ }, { "type": "NarrativeText", - "element_id": "b3859f2f29884b1d3ba0892e52859a99", + "element_id": "06658399dddcd1d4d4fda8f9fa90fd53", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "filing taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. 
revision date of Form 3115)" + }, + { + "type": "Title", + "element_id": "cf9c7aa24a26aac4f0ec4b6395cbfdcc", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "When" + }, + { + "type": "UncategorizedText", + "element_id": "2127f2ab4fc4feb4d32460c8317bf02f", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -187,11 +244,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)" + "text": "Form 3115," }, { "type": "NarrativeText", - "element_id": "e5a95dc10d4071983b70898a21f11175", + "element_id": "067f3707c33a901f968188d9592065e9", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -206,7 +263,7 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired." + "text": "Generally, applicants must complete Section In addition, complete the appropriate sections (B:1 through H) for which a change is desired." 
}, { "type": "NarrativeText", @@ -266,8 +323,27 @@ "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986" }, { - "type": "Title", - "element_id": "11c98a9cbd6a200fbc5b93fed15007ac", + "type": "NarrativeText", + "element_id": "c10c0c63b05172dff854d1d0e570c588", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section, 263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (imiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\"Act\"), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to cchange from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required" + }, + { + "type": "NarrativeText", + "element_id": "fc2252774c86adc22225761fc0bee985", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -282,11 +358,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Uniform capitalization rules and limitation on" + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. 
Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3118 eg. “Automatic Change to Accrual Method—Section 448”). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information" }, { "type": "NarrativeText", - "element_id": "b07efea243933525e9ec04a90622508d", + "element_id": "dbf06d87f9be9871dfd64bd0a7bba567", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -301,11 +377,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "cash method.—If you are required to change your method of accounting under section, 263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (imiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\"Act\"), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to cchange from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required" + "text": "change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed." }, { "type": "NarrativeText", - "element_id": "39458f370b98a606db29ac6dee975e07", + "element_id": "03c4a83e399f2f669047b3fcfeae5867", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -320,11 +396,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Disregard the instructions under Time and Place for Filing and Late Applications. 
instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and" + "text": "Long-term contracts.—If you are required to" }, { "type": "NarrativeText", - "element_id": "663dd3791cc24190a45998ca7914f88e", + "element_id": "463ce4107785bb9854ad10b81d93dc7f", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -339,11 +415,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3118 eg. “Automatic Change to Accrual Method—Section 448”). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information" + "text": "Other methods. —Unless the Service has published a regulation or procedure to the contrary, all other changes in accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of mcome attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). 
Do not file Form 3115 for these" }, { - "type": "ListItem", - "element_id": "4e4069c49822cae18add18758619535b", + "type": "Title", + "element_id": "d3eda7d7ed44b4b43fcbfa6f83f6fad3", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -358,7 +434,7 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods. —Unless the Service has published a regulation or procedure to the contrary, all other changes in accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of mcome attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." + "text": "changes." }, { "type": "Title", @@ -381,7 +457,7 @@ }, { "type": "NarrativeText", - "element_id": "83042962477fa38e403e861f8edfdd4b", + "element_id": "7941057d83c91b25cee4374b3ab06eaa", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -396,11 +472,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Generally, applicants must file this form within the first 180 days of the tax year in which itis desired to make the change. 
Taxpayers, other than exempt organizations, should file Form 3115 with the Commissioner of Internal Revenue, Attention: CC:C:4, 1111 Constitution Avenue, NW, Washington, DC 20224, Exempt organizations should file with the Assistant Commissioner (Employee Plans and Exempt Organizations), 1111 Constitution Avenue, NW, Washington, DC 20224. You should normally receive an acknowledgment of receipt of your application within 30 days. If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224." + "text": "Generally, applicants must file this form within the first 180 days of the tax year in which itis desired to make the change." }, { "type": "NarrativeText", - "element_id": "df0e66d1a434e95e4051ddcb968c94c9", + "element_id": "9dda11db48254f5e0d0000afb5d1dd9b", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -415,7 +491,83 @@ "filetype": "image/png", "page_number": 1 }, - "text": "See section 5.03 of Rev. Proc. 84-74 for filing an early application, Note: If this form is being filed in accordance with Rey. Proc. 74-11, see Section G below." + "text": "Taxpayers, other than exempt organizations, should file Form 3115 with the Commissioner of Internal Revenue, Attention: CC:C:4, 1111 Constitution Avenue, NW, Washington, DC 20224, Exempt organizations should file with the Assistant Commissioner (Employee Plans and Exempt Organizations), 1111 Constitution Avenue, NW, Washington, DC 20224." 
+ }, + { + "type": "NarrativeText", + "element_id": "4d063cdbd131401fa29e1d0e824dc017", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "You should normally receive an acknowledgment of receipt of your application within 30 days. If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224." + }, + { + "type": "Title", + "element_id": "ea325d761f98c6b73320e442b67f2a35", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "an" + }, + { + "type": "NarrativeText", + "element_id": "e3e2ccf4f0d1524d4f5ce42e8f2d1efa", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "See section 5.03 of Rev. Proc. 
84-74 for filing early application," + }, + { + "type": "NarrativeText", + "element_id": "11cb901986e9621aadbd76e6f7400809", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Note: If this form is being filed in accordance with Rey. Proc. 74-11, see Section G below." }, { "type": "Title", @@ -493,9 +645,85 @@ }, "text": "Individuals. —An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both." }, + { + "type": "Title", + "element_id": "ea325d761f98c6b73320e442b67f2a35", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "an" + }, + { + "type": "NarrativeText", + "element_id": "e72d9c8a779a47796c4362b7885aa80b", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Others.-—The employer identification number applicant other than an individual should be entered in this block," + }, + { 
+ "type": "Title", + "element_id": "28391d3bc64ec15cbb090426b04aa6b7", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "of" + }, + { + "type": "Title", + "element_id": "6a7faddb3ac8b6e14dad65f081428865", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Signature Individieale" + }, { "type": "NarrativeText", - "element_id": "9240bfa889b87dc2fb3fa746ca4eeeb4", + "element_id": "48cd565f152ff17bab8eba19eb23db34", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -510,11 +738,49 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block," + "text": "Individuals.—An individual desiring the change should sign the application. 
Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should" }, { "type": "Title", - "element_id": "55d4f33b09f24dd3b27865a5f34bfeb9", + "element_id": "0b6f395ca14ac202374d5cff678b7115", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "sign" + }, + { + "type": "NarrativeText", + "element_id": "7d3a67d75914a504a52ec53998b796af", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.”" + }, + { + "type": "NarrativeText", + "element_id": "ee6a9bcef7e5e33bc26f419812e2c77a", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -529,11 +795,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Signature tea" + "text": "Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. 
Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation," }, { - "type": "ListItem", - "element_id": "f8e8c87d2e958a23153d7f25b159f0ee", + "type": "NarrativeText", + "element_id": "ba7f9dc18be2bf9219e020112b426526", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -548,11 +814,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6." 
+ "text": "Fiduciaries.—The-form should show the" }, { "type": "NarrativeText", - "element_id": "35f1273e073cf159019550bc35b6692c", + "element_id": "e3c8d21cabd10cc36b53107e58a5be8d", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -567,7 +833,64 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Ifthe individual or firm is also authorized to represent the applicant before the IRS, receive a copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)." + "text": "name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle." + }, + { + "type": "NarrativeText", + "element_id": "52e2b8e4b8527ae448e9db2dfd0c43c7", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6." 
+ }, + { + "type": "Title", + "element_id": "ca978112ca1bbdcafac231b39a23dc4d", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "a" + }, + { + "type": "NarrativeText", + "element_id": "8200352b4e91b1be4f14e9248d50380a", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Ifthe individual or firm is also authorized to represent the applicant before the IRS, receive copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)." 
}, { "type": "Title", @@ -609,7 +932,64 @@ }, { "type": "Title", - "element_id": "58703de56debc34a1d68e6ed6f8fd067", + "element_id": "8b838d95f7d4f66b5453307de1353ff4", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Specific Instructions" + }, + { + "type": "Title", + "element_id": "bc272940e494acf9441070d3eb4b79f6", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Section A" + }, + { + "type": "NarrativeText", + "element_id": "a6c53a8898025076b8c0397178f95fa3", + "metadata": { + "data_source": { + "url": "abfs://container1/IRS-form-1987.png", + "version": 328871203465633719836776597535876541325, + "record_locator": { + "protocol": "abfs", + "remote_file_path": "container1/IRS-form-1987.png" + }, + "date_created": "2023-03-10T09:44:55+00:00", + "date_modified": "2023-03-10T09:44:55+00:00" + }, + "filetype": "image/png", + "page_number": 1 + }, + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a)" + }, + { + "type": "NarrativeText", + "element_id": "e9278d083996ccb1f39236b8064b28cd", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -624,11 +1004,11 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Specific Instructions Section 
A" + "text": "Item 6, page 2.—The term “gross receipts” Includes total sales (net of returns and allowances) and all amounts received for services. in addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you area resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, tunder the applicable state or local law, the taxis legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority." }, { "type": "NarrativeText", - "element_id": "5e7793489f88d7c9187dad66e787898f", + "element_id": "4b4424f821633ea87deab36702d4c113", "metadata": { "data_source": { "url": "abfs://container1/IRS-form-1987.png", @@ -643,6 +1023,6 @@ "filetype": "image/png", "page_number": 1 }, - "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a) Item 6, page 2.—The term “gross receipts” Includes total sales (net of returns and allowances) and all amounts received for services. in addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you area resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, tunder the applicable state or local law, the taxis legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority. 
Item 7b, page 2.—If item 7b 1s \"Yes,\" indicate ona separate sheet the following for each separate trade or business: Nature of business" + "text": "Item 7b, page 2.—If item 7b 1s \"Yes,\" indicate ona separate sheet the following for each separate trade or business: Nature of business" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 35d4a581e4..399ba6c1ab 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -1,7 +1,7 @@ [ { - "type": "UncategorizedText", - "element_id": "0e58869830c7b4461a4d1879223e4139", + "type": "Header", + "element_id": "c1f4b5ba045830c1866db8f8aa0b54ac", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -11,7 +11,7 @@ }, { "type": "NarrativeText", - "element_id": "b0658ce9dccc0acba9a472c2bb992cc9", + "element_id": "869adddb184177031536477262e0dde0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -21,7 +21,7 @@ }, { "type": "Title", - "element_id": "f2fe9c33b7e8535efebf7c20ebce297c", + "element_id": "e6fa42b5b4d85001b900e47c050b645b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -30,8 +30,8 @@ "text": "Data in Brief" }, { - "type": "Title", - "element_id": "0ca3f075fdccf9232449ff461b63ceb9", + "type": "NarrativeText", + "element_id": "9234133787d0a6b3976b16569c0b5cf3", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -40,8 +40,8 @@ "text": "journal homepage: www.elsevier.com/locate/dib" }, { - "type": "Title", - "element_id": "0ccb3a9876bbc64a1ca09fa40c4f844d", + "type": "NarrativeText", + "element_id": "ac01687ab870e4bb6e7313db4654928a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -70,7 +70,7 @@ 
"text": "(Jee" }, { - "type": "NarrativeText", + "type": "Title", "element_id": "4f14d967ea87a75ad1acee27ff34e59e", "metadata": { "data_source": {}, @@ -90,864 +90,434 @@ "text": "Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa" }, { - "type": "NarrativeText", - "element_id": "fbd221e3c1f82c8601661213b98b0962", + "type": "Title", + "element_id": "3d71760ba4f1cc95873ee36178f97d82", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "a r t i c l e i n f o" + "text": "ARTICLE INFO" }, { "type": "NarrativeText", - "element_id": "d6923075e35e5f3296e0d24ceb70a2bb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "a b s t r a c t" - }, - { - "type": "UncategorizedText", - "element_id": "c382dd715a85d683f056834c4af7be85", + "element_id": "fbd221e3c1f82c8601661213b98b0962", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Article history: Received 31 August 2018 Received in revised form 17 November 2018 Accepted 27 November 2018 Available online 30 November 2018" + "text": "a r t i c l e i n f o" }, { "type": "Title", - "element_id": "abe4641521caf8385f30e81099f3a8c6", + "element_id": "3d1626989d3e923485561f1e5bdeaa58", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Keywords: Corrosion Stainless steel Inhibitor Sulphuric acid" + "text": "ABSTRACT" }, { "type": "NarrativeText", - "element_id": "26c73759c3d3cc29d683910c034432da", + "element_id": "d6923075e35e5f3296e0d24ceb70a2bb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "This data article contains data related to the research article entitled “enhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product” (Sanni et al., 2018). 
In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) influenced corrosion resistance of stainless steel. Inhibition efficiency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder influencing the redox mechan- ism reactions responsible for corrosion and surface deterioration." + "text": "a b s t r a c t" }, { "type": "NarrativeText", - "element_id": "260cf1397ece5718c2d35900917688de", + "element_id": "4a03002c97925cd9397927ac823369e7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." - }, - { - "type": "Title", - "element_id": "8c625bd30cfb1b77c8ba8d4e863d0bb3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Specification table" - }, - { - "type": "Title", - "element_id": "ac89a2886224c42ad15982cd34421ff8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Subject area More specific subject area Surface science and engineering Type of data" + "text": "Article history: Received 31 August 2018 Received in revised form 17 November 2018 Accepted 27 November 2018 Available online 30 November 2018" }, { "type": "NarrativeText", - "element_id": "0a789b33a0101a46f5a01d22d9a6ce2b", + "element_id": "08bb309957586c280660c11c337dc6d7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "* Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za E-mail address: tayo.sanni@yahoo.com (O. Sanni)." 
+ "text": "Keywords: Corrosion Stainless steel Inhibitor Sulphuric acid" }, { "type": "NarrativeText", - "element_id": "511abaee4573f467ba654d2a697efb03", + "element_id": "26c73759c3d3cc29d683910c034432da", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." - }, - { - "type": "UncategorizedText", - "element_id": "549a2fac47d713cc00f2db498ad6b557", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "452" + "text": "This data article contains data related to the research article entitled “enhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product” (Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) influenced corrosion resistance of stainless steel. Inhibition efficiency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder influencing the redox mechan- ism reactions responsible for corrosion and surface deterioration." }, { "type": "NarrativeText", - "element_id": "9ca201e648ed74cfc838b6661f59addf", + "element_id": "62e4907f12a32a7b9ccd57ed477eb54a", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" + "text": "reactions responsible for corrosion and surface deterioration. © 2018 Published by Elsevier Inc. 
This is an open access article under the CC BY-NC-ND license" }, { "type": "NarrativeText", - "element_id": "6928b78d26af54b6acb804ed319b5c05", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "How data were acquired" - }, - { - "type": "Title", - "element_id": "41e0fa358cefcadbb2633ec45ff2d129", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Data format Experimental factors" - }, - { - "type": "Title", - "element_id": "9769bb23c49762a84b464e817703bc35", + "element_id": "260cf1397ece5718c2d35900917688de", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "Experimental features Data source location" + "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." }, { "type": "Title", - "element_id": "aa43321cb45fb76debb7e732249d42a5", + "element_id": "5abba9b1f2c341e0b299fa43a90d0e14", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "Accessibility Related research article" + "text": "Specification table" }, { - "type": "ListItem", - "element_id": "82bf7851faa53c3a4965d4cdfe8d0bce", + "type": "NarrativeText", + "element_id": "ac89a2886224c42ad15982cd34421ff8", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. 
Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230." + "text": "Subject area More specific subject area Surface science and engineering Type of data" }, { "type": "Title", - "element_id": "596eda178f8c5adefbae7cfe1bec78c3", + "element_id": "e4359c72057b318ddf5a64f9b97539c4", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "Value of the data" + "text": "n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za" }, { "type": "ListItem", - "element_id": "7def44ffc91f3f064b85dc04b23767ec", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment. © The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments. © The data can be used to examine the relationship between the process variable as it affect the nature of inhibition of metals." 
- }, - { - "type": "Title", - "element_id": "1c3f3de4e65aae5bd147f84779712a65", + "element_id": "6190ca95b973d4a03fdf4c3b0b260af0", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "1. Data" + "text": "Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za E-mail address: tayo.sanni@yahoo.com (O. Sanni)." }, { "type": "NarrativeText", - "element_id": "5034c7315aface0b263361d0eae1dd15", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1–3 respectively. It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule" - }, - { - "type": "Title", - "element_id": "e28e0dc941accc8694040c63091b580c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": ") g m" - }, - { - "type": "UncategorizedText", - "element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(" - }, - { - "type": "Title", - "element_id": "b780e72bd4f737713ae202feb46b5d55", + "element_id": "511abaee4573f467ba654d2a697efb03", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 2 + "page_number": 1 }, - "text": "s s o" + "text": "https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/)." 
}, { - "type": "Title", - "element_id": "acac86c0e609ca906f632b0e2dacccb2", + "type": "Header", + "element_id": "78f135d64d5e1307cac651608256a418", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "l" + "text": "452" }, { - "type": "Title", - "element_id": "1bd621f0b71079e0948b0aad011a7f4b", + "type": "NarrativeText", + "element_id": "9ca201e648ed74cfc838b6661f59addf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "t h g e W" + "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, { "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", + "element_id": "41e0fa358cefcadbb2633ec45ff2d129", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "i" + "text": "Data format Experimental factors" }, { "type": "Title", - "element_id": "b30b3a63451a0f3f43bad0781c1e9ad8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(mg)" - }, - { - "type": "UncategorizedText", - "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "30" - }, - { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "20" - }, - { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", + "element_id": "9769bb23c49762a84b464e817703bc35", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "10" + "text": "Experimental features Data source location" }, { "type": "Title", - "element_id": "d300d49efc4cd0982dd6bc3377759ae8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "10g 8g 6g 4g 2g Control" - }, - { - "type": "UncategorizedText", - 
"element_id": "98010bd9270f9b100b6214a21754fd33", + "element_id": "aa43321cb45fb76debb7e732249d42a5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "48" + "text": "Accessibility Related research article" }, { - "type": "UncategorizedText", - "element_id": "7b1a278f5abe8e9da907fc9c29dfd432", + "type": "NarrativeText", + "element_id": "6928b78d26af54b6acb804ed319b5c05", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "96" + "text": "How data were acquired" }, { - "type": "UncategorizedText", - "element_id": "5ec1a0c99d428601ce42b407ae9c675e", + "type": "Table", + "element_id": "5eb814dac721c11581f011fbca57a17e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "144" + "text": "How data were acquired Data format Experimental factors Experimental features Data source location Accessibility Related research article The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO, solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225-230." 
}, { - "type": "UncategorizedText", - "element_id": "eb3be230bbd2844b1f5d8f2e4fab9ffb", + "type": "NarrativeText", + "element_id": "7ce6ee1aa50d28b85acf544f1db3e25c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "192" + "text": "The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230." }, { "type": "Title", - "element_id": "a955dcf1d740ce40d62415d9f16da436", + "element_id": "e63f0ed399f0537c9ffeadfcae3baed6", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Exposure Time (Hours)" + "text": "Value of the data" }, { - "type": "NarrativeText", - "element_id": "45cd54c64e38abe8c1128a5979ca8cd5", + "type": "ListItem", + "element_id": "1daeb29ccbc793481f453c7f76b8795b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Fig. 1. Weight loss versus exposure time for stainless steel immersed in 0.5M H2SO, solution in the absence and presence of ES." 
- }, - { - "type": "NarrativeText", - "element_id": "9ca201e648ed74cfc838b6661f59addf", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" - }, - { - "type": "UncategorizedText", - "element_id": "d83c7ee736be931d85b78a4a60881ced", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "453" - }, - { - "type": "NarrativeText", - "element_id": "e5d46bc8ceb17f88e1cff33ecac97067", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES." - }, - { - "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "i" - }, - { - "type": "NarrativeText", - "element_id": "4f0139b605dfdd9eb93e920a6115e1b5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": ") r a e y / m m" - }, - { - "type": "NarrativeText", - "element_id": "49e7364ce1027887460959b2a757b184", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "( e t a r n o s o r r o C" - }, - { - "type": "NarrativeText", - "element_id": "74599fca46202613cccb12e97774b306", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "E n o i t i b h n I" - }, - { - "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "i" - }, - { - "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "i" - }, - { - 
"type": "NarrativeText", - "element_id": "bbe120714b80df07396e808f98b3f354", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "y c n e c i f f" - }, - { - "type": "UncategorizedText", - "element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "(" - }, - { - "type": "UncategorizedText", - "element_id": "bbf3f11cb5b43e700273a78d12de55e4", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "%" - }, - { - "type": "UncategorizedText", - "element_id": "ba5ec51d07a4ac0e951608704431d59a", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": ")" - }, - { - "type": "UncategorizedText", - "element_id": "0faf54c7569cac28ec5462f872384f7c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "2.7" - }, - { - "type": "UncategorizedText", - "element_id": "a97b042d7bd59d92a46e8ab17f7dff73", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "1.8" + "text": "(cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment." 
}, { - "type": "UncategorizedText", - "element_id": "8139b33952401b3ee0e2ca84651cb9a1", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "0.9" - }, - { - "type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "100" - }, - { - "type": "UncategorizedText", - "element_id": "525fbe4b6760bd759bfeeae2ee487f12", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1" - }, - { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "10" - }, - { - "type": "UncategorizedText", - "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "30" - }, - { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "20" - }, - { - "type": "UncategorizedText", - "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "90" - }, - { - "type": "UncategorizedText", - "element_id": "d59eced1ded07f84c145592f65bdf854", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "40" - }, - { - "type": "UncategorizedText", - "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "80" - }, - { - "type": "UncategorizedText", - "element_id": 
"1a6562590ef19d1045d06c4055742d38", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "50" - }, - { - "type": "UncategorizedText", - "element_id": "ff5a1ae012afa5d4c889c50ad427aaf5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "70" - }, - { - "type": "UncategorizedText", - "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "60" - }, - { - "type": "UncategorizedText", - "element_id": "c2356069e9d1e79ca924378153cfbbfb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "24" - }, - { - "type": "UncategorizedText", - "element_id": "98010bd9270f9b100b6214a21754fd33", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "48" - }, - { - "type": "UncategorizedText", - "element_id": "8722616204217eddb39e7df969e0698a", + "type": "ListItem", + "element_id": "7aad924d1c00e3d50bc0c24beb00a9e5", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "72" + "text": "(cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments." 
}, { - "type": "UncategorizedText", - "element_id": "7b1a278f5abe8e9da907fc9c29dfd432", + "type": "NarrativeText", + "element_id": "39b6040280a179e1f8e4f4fb5ec4ae05", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "96" + "text": "(cid:1) The data can be used to examine the relationship between the process variable as it affect the" }, { - "type": "Title", - "element_id": "239bb77f5ec344ce5e614979b8c49742", + "type": "ListItem", + "element_id": "b6cdef9ac2c39caf23c7413dcdb3c227", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "Exposure time" + "text": "© The data can be used to examine the relationship between the process variable as it affect the nature of inhibition of metals." }, { - "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", + "type": "Title", + "element_id": "1c3f3de4e65aae5bd147f84779712a65", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "120" + "text": "1. Data" }, { - "type": "UncategorizedText", - "element_id": "5ec1a0c99d428601ce42b407ae9c675e", + "type": "NarrativeText", + "element_id": "5034c7315aface0b263361d0eae1dd15", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "144" + "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs. 1–3 respectively. 
It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule" }, { - "type": "UncategorizedText", - "element_id": "80c3cd40fa35f9088b8741bd8be6153d", + "type": "Image", + "element_id": "6cbfbefb10bbbc9b57cd22704824934e", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "168" + "text": "Weight loss (mg) 96 144 192 Exposure Time (Hours)" }, { - "type": "UncategorizedText", - "element_id": "eb3be230bbd2844b1f5d8f2e4fab9ffb", + "type": "Title", + "element_id": "a955dcf1d740ce40d62415d9f16da436", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "192" + "text": "Exposure Time (Hours)" }, { - "type": "Title", - "element_id": "d300d49efc4cd0982dd6bc3377759ae8", + "type": "FigureCaption", + "element_id": "45cd54c64e38abe8c1128a5979ca8cd5", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 3 + "page_number": 2 }, - "text": "10g 8g 6g 4g 2g Control" + "text": "Fig. 1. Weight loss versus exposure time for stainless steel immersed in 0.5M H2SO, solution in the absence and presence of ES." }, { - "type": "UncategorizedText", - "element_id": "85b99d4e3d8e29e46e512f9cca7ba627", + "type": "NarrativeText", + "element_id": "8a54dcaa0e2720786903e26e84bd9e93", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "2g 4g 6g 8g 10g" + "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451–457" }, { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "type": "Header", + "element_id": "135be522765ce267b8ca6debeeec6dc4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "0" + "text": "453" }, { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "type": "NarrativeText", + "element_id": "4f0139b605dfdd9eb93e920a6115e1b5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "20" + "text": ") r a e y / m m" }, { - "type": "UncategorizedText", - "element_id": "d59eced1ded07f84c145592f65bdf854", + "type": "NarrativeText", + "element_id": "49e7364ce1027887460959b2a757b184", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "40" + "text": "( e t a r n o s o r r o C" }, { - "type": "UncategorizedText", - "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", + "type": "Title", + "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "60" + "text": "i" }, { - "type": "UncategorizedText", - "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", + "type": "Image", + "element_id": "84d160dc9075c76de6f6d6c3f2651fe3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "80" + "text": " Corrosion rate (mm/year) 24 48 72 96 120 144 168 192 Exposure time" }, { - "type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", + "type": "Title", + "element_id": "239bb77f5ec344ce5e614979b8c49742", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "100" + "text": "Exposure time" }, { - "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", + "type": "FigureCaption", + "element_id": "e5d46bc8ceb17f88e1cff33ecac97067", "metadata": { 
"data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "120" + "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES." }, { "type": "UncategorizedText", - "element_id": "dbae772db29058a88f9bd830e957c695", + "element_id": "ad57366865126e55649ecb23ae1d4888", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "140" + "text": "100" }, { "type": "UncategorizedText", - "element_id": "a512db2741cd20693e4b16f19891e72b", + "element_id": "57e2eb94df928d0cf17b2c0d41ae042e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "160" + "text": "100 4" }, { - "type": "UncategorizedText", - "element_id": "7b69759630f869f2723875f873935fed", + "type": "Image", + "element_id": "0616fd3aee2db0cdd1a1565987b925ae", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "180" + "text": " 80 4 Inhibition Efficiency (%) a Ss 1 _—__. —o— 4g SS v- —a— 6g 74 —~X_ Senn, —y— 8g ~~. —6~ 10g —__, ~ —o- 2g ol, T T T T T T T 1 20 40 60 80 100 120 140 160 180 Exposure Time 1e (Hours)" }, { "type": "Title", @@ -960,7 +530,7 @@ "text": "Exposure Time (Hours)" }, { - "type": "NarrativeText", + "type": "FigureCaption", "element_id": "950ca7babbae92e76df97f7ee57bc05c", "metadata": { "data_source": {}, @@ -980,8 +550,8 @@ "text": "number of inhibitor adsorbed on the surface of stainless steel at higher concentration, in order for the active sites of the stainless steel to be protected with the inhibitor molecules. Cathodic and anodic polarized potential are measured in the presence and absence of ES. Fig. 4 shows the cathodic and anodic polarization curves for stainless steel in 0.5 M H2SO4 solution at different ES concentrations. 
The electrochemical variables such as polarization resistance (PR), corrosion potential (Ecorr), cor- rosion current (icorr), anodic Tafel constant (ba), cathodic Tafel constant (bc) and corrosion rate (mm/ year) values are presented in Table 1. From the polarization curves and electrochemical parameter, icorr value decreased with the addition of inhibitor in 0.5 M H2SO4. Conversely, the icorr further decrease with an increase in inhibitor concentration indicating that the inhibition effects increase with an increase in the egg shell concentration. The process of egg shell inhibition could be attributed to the formation of egg shell powder adsorbed on stainless steel surface protecting corrosion of stainless steel in H2SO4 medium. The likely mechanism is the egg shell adsorption on stainless steel surface through the heteroatoms electron pair and the conjugated systems in egg shell molecular structure as shown in Fig. 1. When the concentration of inhibitor was increased from 2 to 10 g, the corrosion rate values drastically decreased this result show that waste egg shell powder is an effective corrosion inhibitor for stainless steel in H2SO4 solution. The shift in corrosion potential of stainless steel from Tafel curves and electrochemical data indicate that the inhibitor is a mixed-type corrosion inhibitor." 
}, { - "type": "UncategorizedText", - "element_id": "48f89b630677c2cbb70e2ba05bf7a363", + "type": "Header", + "element_id": "8d9bcdac558e606c913189b6ce8db44c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -991,7 +561,7 @@ }, { "type": "NarrativeText", - "element_id": "9ca201e648ed74cfc838b6661f59addf", + "element_id": "8a54dcaa0e2720786903e26e84bd9e93", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1010,8 +580,8 @@ "text": "Potential (Vv)nm°in°}aryT T T0.00001 0.001 olCurrent Density (A/cm2)" }, { - "type": "UncategorizedText", - "element_id": "f0e5c879f7d220552d8ad5b3503bd038", + "type": "FigureCaption", + "element_id": "e8f34726e919c7e2f4d00f6fcf511ef8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1191,73 +761,13 @@ }, { "type": "UncategorizedText", - "element_id": "4a166cad507ccd016e6ad2d8652111e5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "0 / C" - }, - { - "type": "UncategorizedText", - "element_id": "6b51d431df5d7f141cbececcf79edf3d", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "12" - }, - { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "10" - }, - { - "type": "UncategorizedText", - "element_id": "2c624232cdd221771294dfbb310aca00", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "8" - }, - { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "6" - }, - { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "4" - 
}, - { - "type": "UncategorizedText", - "element_id": "a0dfa682f99b0794f40f195f9a7adfcd", + "element_id": "e2b6d7e2ab125149fa820500cedfffbb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "—=—Cc/0 2+ T T T 1" + "text": "—=—Cc/0" }, { "type": "UncategorizedText", @@ -1270,77 +780,17 @@ "text": "C/0" }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "2" - }, - { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "2" - }, - { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "4" - }, - { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "6" - }, - { - "type": "UncategorizedText", - "element_id": "2c624232cdd221771294dfbb310aca00", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "8" - }, - { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "10" - }, - { - "type": "Title", - "element_id": "c74caf15453477bf544f86e069d90da7", + "type": "Image", + "element_id": "330ac6774a7bcf85ad0993abaab2a475", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Concentration (g)" + "text": " 12 2+ T T T 1 2 4 6 8 10 Concentration (g)" }, { - "type": "NarrativeText", + "type": "FigureCaption", "element_id": "8e9636a780701abc4f16c3f890b8a83f", "metadata": { 
"data_source": {}, @@ -1350,8 +800,8 @@ "text": "Fig. 5. Langmuir adsorption isotherm of ES." }, { - "type": "NarrativeText", - "element_id": "9ca201e648ed74cfc838b6661f59addf", + "type": "Header", + "element_id": "8a54dcaa0e2720786903e26e84bd9e93", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1360,8 +810,8 @@ "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, { - "type": "UncategorizedText", - "element_id": "f626051bc94422f26f4b774a2bca105e", + "type": "Header", + "element_id": "b5c1fe3f2fa0ef8280a53620dcb31175", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1400,7 +850,7 @@ "text": "°@¢Naafe«MgsSEM HY: 20.0KV 7 ETOP LU ULL UL OCT 0BEM IAAG: 400 x a" }, { - "type": "NarrativeText", + "type": "FigureCaption", "element_id": "ccc8ab2aeabd9a0f745b9f0f6fcbef6e", "metadata": { "data_source": {}, @@ -1410,7 +860,7 @@ "text": "Fig. 7. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution without inhibitor." }, { - "type": "NarrativeText", + "type": "FigureCaption", "element_id": "6121f41a05c15afa2efe50af3e838da4", "metadata": { "data_source": {}, @@ -1420,7 +870,7 @@ "text": "Fig. 6. SEM/EDX image of as-received stainless steel." }, { - "type": "NarrativeText", + "type": "FigureCaption", "element_id": "d8bc58d446376a881b51208b9a8ee7b7", "metadata": { "data_source": {}, @@ -1430,8 +880,8 @@ "text": "Fig. 8. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution with the presence of inhibitor." 
}, { - "type": "UncategorizedText", - "element_id": "b3a8e0e1f9ab1bfe3a36f231f676f78b", + "type": "Header", + "element_id": "cdfba543ee8ef7fdb3d8b587648cc22d", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1440,8 +890,8 @@ "text": "456" }, { - "type": "NarrativeText", - "element_id": "9ca201e648ed74cfc838b6661f59addf", + "type": "Header", + "element_id": "8a54dcaa0e2720786903e26e84bd9e93", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1451,7 +901,7 @@ }, { "type": "Title", - "element_id": "a80826543c9e0d0e9f6c2108ae3c3f73", + "element_id": "e00efc537994ab576eaec5a387a5ebc0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1460,8 +910,8 @@ "text": "2. Experimental design, materials and methods" }, { - "type": "Title", - "element_id": "90b8c00ff7a1b170a14695aa51629f14", + "type": "NarrativeText", + "element_id": "d277e2ba1e8cbda383b0e51703c281c8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1471,7 +921,7 @@ }, { "type": "NarrativeText", - "element_id": "7c3b7c8c2993a59e71e009d051edd727", + "element_id": "c90848f07a922eff3615e5aa1ee78a2f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1480,8 +930,8 @@ "text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9." }, { - "type": "NarrativeText", - "element_id": "1dc2692eee9b01e9a960f80c4dabe07b", + "type": "FigureCaption", + "element_id": "c07eeb615f8b0f2d544348b7f0655301", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1490,8 +940,8 @@ "text": "Fig. 9. Chemical structure of egg shell powder." 
}, { - "type": "Title", - "element_id": "b4a533760fabf85f66294a0441dacd1e", + "type": "NarrativeText", + "element_id": "63584e8d8b4c14d1542778c155ee4b78", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1520,14 +970,14 @@ "text": "The corrosion rate (CR) was calculated using Eq. (1) [1–5]" }, { - "type": "Title", - "element_id": "cecb8b44c9af4b76e85155170c509729", + "type": "NarrativeText", + "element_id": "1cf628987e0d8ee743a4fd01f662cc01", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "Corrosion rate CRð" + "text": ". 87.6W Corrosion rate(CR) = (ar" }, { "type": "UncategorizedText", @@ -1580,24 +1030,14 @@ "text": "where: W is weight loss in mg, A is specimen surface area, T is immersion period in hours and D is the specimen density. From the corrosion rate, the surface coverage (θ) and inhibition efficiencies (IE %) were determined using Eqs. (2) and (3) respectively" }, { - "type": "Title", - "element_id": "62127212535b62092159e4fe305c868d", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "θ ¼ CRo (cid:3) CR" - }, - { - "type": "Title", - "element_id": "5a6824cbd64b72c37057f7d1dbee2798", + "type": "Formula", + "element_id": "59664b2fe1b21e796c905c904f07faae", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "CRo" + "text": "~ CRo" }, { "type": "Title", @@ -1610,54 +1050,14 @@ "text": "ð2Þ" }, { - "type": "Title", - "element_id": "c31b73fca4f97bb7e95a3d8634826d32", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "IE ð%Þ ¼ CRo (cid:3) CR" - }, - { - "type": "Title", - "element_id": "c13539d1568999137c4e0354795cd37b", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "CR" - }, - { - "type": "Title", - "element_id": "5a6824cbd64b72c37057f7d1dbee2798", - "metadata": { - "data_source": {}, - 
"filetype": "application/pdf", - "page_number": 6 - }, - "text": "CRo" - }, - { - "type": "Title", - "element_id": "2d711642b726b04401627ca9fbac32f5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "x" - }, - { - "type": "UncategorizedText", - "element_id": "3a81feba075b8ca26d6f86f392ff06df", + "type": "Formula", + "element_id": "2ceed7a728acd831c0c4c14fc95a3db7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "100 1" + "text": "CRo=CR , 100 IE (0) = CR" }, { "type": "Title", @@ -1671,7 +1071,7 @@ }, { "type": "NarrativeText", - "element_id": "118f0531277e022b44f152b0bf2dee7c", + "element_id": "4e14cf7db9d9e827482861e7576a1d07", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1680,8 +1080,8 @@ "text": "where: CRo and CR are the corrosion rate in absence and presence of inhibitor respectively." }, { - "type": "Title", - "element_id": "6aa7f759e077aa037614e7f42897f09a", + "type": "NarrativeText", + "element_id": "5dda1fad7e503afe6240d736d50bbe7a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1710,8 +1110,8 @@ "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451–457" }, { - "type": "UncategorizedText", - "element_id": "353767b239099863e13ca954e20a66c9", + "type": "Header", + "element_id": "b2dc92f9e9858319664f918c69457257", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1721,7 +1121,7 @@ }, { "type": "NarrativeText", - "element_id": "24dcddab57a1cab7266a3c6b536ad2ff", + "element_id": "01f3f73499621b0a04142f29982336c1", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1731,7 +1131,7 @@ }, { "type": "Title", - "element_id": "c9015d53b90846454375a2fdf2829c66", + "element_id": "9619869f5960ea0375b649dd8cc388a5", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1741,76 +1141,86 @@ }, { "type": "NarrativeText", - "element_id": "63cd602e78daef9ac25a20bbab27ecbc", + "element_id": "dbfead4a6bc5e94c6d8f7de9666b6f30", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "This work was supported by the National Research Foundation of South Africa and the Tshwane" + "text": "This work was supported by the National Research Foundation of South Africa and the Tshwane University of Technology Pretoria South Africa." }, { "type": "Title", - "element_id": "287fb148184f12ff62e9b0207567dac7", + "element_id": "81db7fab0806640b0cbbac862671704f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "University of Technology Pretoria South Africa." + "text": "Transparency document. Supporting information" }, { "type": "NarrativeText", - "element_id": "d202816913e482abce90d70d88f202c3", + "element_id": "eaf72c6c69d317c502026ecf01d28b09", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Transparency document. Supporting information" + "text": "Transparency document associated with this article can be found in the online version at https://doi. org/10.1016/j.dib.2018.11.134." 
}, { - "type": "NarrativeText", - "element_id": "d434a0e19d0d34e92936b9566e1ebb45", + "type": "Title", + "element_id": "e56261e0bd30965b8e68ed2abb15b141", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 7 + }, + "text": "References" + }, + { + "type": "ListItem", + "element_id": "e275b10ccd88f5d2dbf9f2b2432eb64f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Transparency document associated with this article can be found in the online version at https://doi." + "text": "[1] 0. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results Phys. 9 (2018) 225-230." }, { - "type": "UncategorizedText", - "element_id": "2ca250dde10d732278a9fa586a97e40a", + "type": "ListItem", + "element_id": "5068dd4538c596c1d123fd612bdb99e3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "org/10.1016/j.dib.2018.11.134." + "text": "[2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1-15." }, { - "type": "Title", - "element_id": "e56261e0bd30965b8e68ed2abb15b141", + "type": "ListItem", + "element_id": "76eb86296cfb136b12d4606217bd3ae3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "References" + "text": "[3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel corrosion in chloride solution, Def. Technol. 14 (2018) 463-468." }, { "type": "ListItem", - "element_id": "86174db2f99ff948055caeda83334bb7", + "element_id": "a3b65d4f88d6909004419ec92682d14a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "[1] 0. 
Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results Phys. 9 (2018) 225-230. [2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1-15. [3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel corrosion in chloride solution, Def. Technol. 14 (2018) 463-468. [4] O. Sanni, A.P.I. Popoola, 0.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1-17. https://doi.org/10.1007/ $13632-018-0495-5, [5] O. Sanni, A-P.I. Popoola, O.S.1. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. (lnttps://doi.org/10.7449/2018/MST_2018_254 261)." + "text": "[4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1–17. https://doi.org/10.1007/ s13632-018-0495-5." 
}, { - "type": "NarrativeText", + "type": "ListItem", "element_id": "3cd4caf23cd72a06fbf01b16df13ec1f", "metadata": { "data_source": {}, diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 5d4295f490..d470025d1e 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -1,7 +1,7 @@ [ { - "type": "UncategorizedText", - "element_id": "cfb3400e6eb0487eeb704674d40bf85c", + "type": "Header", + "element_id": "0af8327dc6c8a1694bd0fc75da243db4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -11,7 +11,7 @@ }, { "type": "NarrativeText", - "element_id": "b0658ce9dccc0acba9a472c2bb992cc9", + "element_id": "869adddb184177031536477262e0dde0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -21,7 +21,7 @@ }, { "type": "Title", - "element_id": "f2fe9c33b7e8535efebf7c20ebce297c", + "element_id": "e6fa42b5b4d85001b900e47c050b645b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -30,8 +30,8 @@ "text": "Data in Brief" }, { - "type": "Title", - "element_id": "0ca3f075fdccf9232449ff461b63ceb9", + "type": "NarrativeText", + "element_id": "9234133787d0a6b3976b16569c0b5cf3", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -41,7 +41,7 @@ }, { "type": "Title", - "element_id": "0ccb3a9876bbc64a1ca09fa40c4f844d", + "element_id": "ac01687ab870e4bb6e7313db4654928a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -70,7 +70,7 @@ "text": "(eee" }, { - "type": "NarrativeText", + "type": "Title", "element_id": "adf50fc70e660740d796f43a2ba5f500", "metadata": { "data_source": {}, @@ -80,8 +80,8 @@ "text": "Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. 
Ernst c, Rahul Patil b" }, { - "type": "NarrativeText", - "element_id": "dcedfc380a2be599bf69af84d49d4803", + "type": "UncategorizedText", + "element_id": "cd24a5d2989d27bd46eede4ea54cb41e", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -89,6 +89,16 @@ }, "text": "a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India" }, + { + "type": "Title", + "element_id": "3d71760ba4f1cc95873ee36178f97d82", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 1 + }, + "text": "ARTICLE INFO" + }, { "type": "NarrativeText", "element_id": "fbd221e3c1f82c8601661213b98b0962", @@ -110,8 +120,8 @@ "text": "a b s t r a c t" }, { - "type": "UncategorizedText", - "element_id": "ed0a4666ce85e6310a0984f37e0e98f8", + "type": "NarrativeText", + "element_id": "32133fc9f028473fb3d3d2ca24382c28", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -150,14 +160,14 @@ "text": "DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007 n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India." }, { - "type": "Title", - "element_id": "5810d7d862f5f5d65e257a3ed9b102ac", + "type": "ListItem", + "element_id": "7373e1d1cb305b02bf37dc138ba774c4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni)." 
+ "text": "Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni)." }, { "type": "NarrativeText", @@ -171,7 +181,7 @@ }, { "type": "NarrativeText", - "element_id": "0a1b09ff562f4d063703cbf021ee297f", + "element_id": "e326e74f4607af7d370e049bc5d9e66a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -180,8 +190,8 @@ "text": "S. Kulkarni et al. / Data in Brief 22 (2019) 484–487" }, { - "type": "UncategorizedText", - "element_id": "5844a72aee9269a68da28cae55c706d8", + "type": "Header", + "element_id": "28b33efedc139452525a280e548c029b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -191,7 +201,7 @@ }, { "type": "Title", - "element_id": "5af2c5326780fc58a48ca40c6b47bee5", + "element_id": "39826c423283dfd91f1dbd34664ce038", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -230,8 +240,18 @@ "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired" }, { - "type": "ListItem", - "element_id": "b97bb84430abd87625f9a82f95423073", + "type": "Table", + "element_id": "765958cb90f3061bda61fe2f973b2acb", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data Tables, text files How data were acquired Artificially generated by a C++ program on Intel\" Xeon” CPU E5- 2670 v2 with Linux operating system. Data format Raw Experimental factors Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Experimental features Randomly generated instances Data source location IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. 
Data accessibility Data can be downloaded from https://orlib.uqcloud.net/ Related research article Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457-487 [3]." + }, + { + "type": "NarrativeText", + "element_id": "eed804f27c782a8a3643b5d5379099d4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -241,7 +261,7 @@ }, { "type": "Title", - "element_id": "596eda178f8c5adefbae7cfe1bec78c3", + "element_id": "e63f0ed399f0537c9ffeadfcae3baed6", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -249,45 +269,75 @@ }, "text": "Value of the data" }, + { + "type": "NarrativeText", + "element_id": "f2fdefc49840022ffb3a88bd4a3512d0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" + }, { "type": "ListItem", - "element_id": "510d0bce379a0d3ba5ff46d536bdb7c5", + "element_id": "407d8a9e0bef6d906ec672c5b59a787f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "© The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the performance of the algorithms for the MDVSP. © The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations. e All the problem instances are available for use without any restrictions. e The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison. © The dataset includes a program that can generate similar problem instances of different sizes." 
+ "text": "The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate performance of the algorithms for the MDVSP." }, { - "type": "NarrativeText", - "element_id": "f2fdefc49840022ffb3a88bd4a3512d0", + "type": "ListItem", + "element_id": "aaedb0d8a48db639a022b216035c56de", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" + "text": "© The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations." }, { - "type": "Title", - "element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa", + "type": "NarrativeText", + "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "1. Data" + "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" }, { "type": "ListItem", - "element_id": "86e53159056da85c215281a9c68d46b9", + "element_id": "5d3c15437243e1c067415182c2314622", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "The benchmark solutions and solution time for the problem instances are presented in [3] and be used for the comparison." 
+ }, + { + "type": "NarrativeText", + "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "For each problem instance, the following information is provided: The number of depots (m), The number of trips (n), The number of locations (I), The number of vehicles at each depot, For each tripie 1,2,...,n,a start time, ft}, an end time, ff, a start location, i, and an end location, i, and" + "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." + }, + { + "type": "Title", + "element_id": "1c3f3de4e65aae5bd147f84779712a65", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "1. Data" }, { "type": "NarrativeText", @@ -301,6 +351,36 @@ }, { "type": "NarrativeText", + "element_id": "d1e8a672b8efb9e58dcf4a40204c1687", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "For each tripie 1,2,...,n,a start time, ft}, an end time, ff, a start location, i, and an end location, i, and" + }, + { + "type": "NarrativeText", + "element_id": "33d26eae1edf215a9677101c7147d671", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "For each problem instance, the following information is provided: The number of depots mð The number of trips ðnÞ, The number of locations ðlÞ, The number of vehicles at each depot, For each trip i A 1; 2; …; n, a start time, ts" + }, + { + "type": "UncategorizedText", + "element_id": "c6490fc185478150e7816c45ef8a48d5", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Þ," + }, + { + "type": "ListItem", "element_id": "dcb60b2d7218e86946c2235aad0b6008", "metadata": { "data_source": {}, @@ -320,8 +400,8 @@ "text": "All times are in minutes and integers. The planning duration is from 5 a.m. 
to around midnight. Each instance has two classes of trips, short trips and long trips, with 40% short trips and 60% long trips. The duration of a short trip is less than a total of 45 min and the travel time between the start" }, { - "type": "UncategorizedText", - "element_id": "86b700fab5db37977a73700b53a0654b", + "type": "Header", + "element_id": "8d0736d21edd4e194e5db02347e129c7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -330,8 +410,8 @@ "text": "486" }, { - "type": "NarrativeText", - "element_id": "0a1b09ff562f4d063703cbf021ee297f", + "type": "Header", + "element_id": "e326e74f4607af7d370e049bc5d9e66a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -341,7 +421,7 @@ }, { "type": "NarrativeText", - "element_id": "ab861dc146a84a52e48a75be2ba3f190", + "element_id": "9f77f0db3a785a5bb491fb79fe54cfa0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -359,16 +439,6 @@ }, "text": "j , the vehicle must travel empty from le j (cid:3)te i Þ. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satisfied:" }, - { - "type": "NarrativeText", - "element_id": "a18dff87ecdbfa5d5d8a1ed56f7ce734", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "A trip j can be covered after trip i by the same vehicle, if ts j" - }, { "type": "ListItem", "element_id": "2d6b506bd58a7dd7bbf1c8599ef630c8", @@ -380,14 +450,14 @@ "text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. 
The number of schedules that start from a depot should not exceed the number of vehicles at" }, { - "type": "Title", - "element_id": "e46a5a30f05d06e82d8b7d10448de683", + "type": "ListItem", + "element_id": "3f2b8351a07eef2caa1918b4b21d05af", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "the depot." + "text": "The number of schedules that start from a depot should not exceed the number of vehicles the depot." }, { "type": "NarrativeText", @@ -401,23 +471,23 @@ }, { "type": "NarrativeText", - "element_id": "e731dc92fddc0512e142bfb2bed62bbf", + "element_id": "149eebcec86a1b9a43b93af13952870b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." + "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l" }, { "type": "NarrativeText", - "element_id": "92b491d0e108ec13f263b16646ecac65", + "element_id": "e731dc92fddc0512e142bfb2bed62bbf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots (m), the number of trips, (n), and the number of locations (I), in the problem instance. 
The next n lines present the information for n trips. Each line corresponds to a trip, ie{1,...,n}, and provides the start location, the start time, the end location, and the end time of trip i. The next | lines present the travel times between any two locations, i,j e {1, wal}. The dataset also includes a program ‘Generatelnstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots (m), the number of trips (n), and the" + "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." }, { "type": "UncategorizedText", @@ -550,8 +620,8 @@ "text": "Possible empty travels" }, { - "type": "NarrativeText", - "element_id": "0a1b09ff562f4d063703cbf021ee297f", + "type": "Header", + "element_id": "e326e74f4607af7d370e049bc5d9e66a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -560,8 +630,8 @@ "text": "S. Kulkarni et al. / Data in Brief 22 (2019) 484–487" }, { - "type": "UncategorizedText", - "element_id": "9b19f9ab816598a0809e4afd5d60800f", + "type": "Header", + "element_id": "dd1252fa6e5f6c3f43669c9cc95952e7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -580,34 +650,44 @@ "text": "Table 2 Description of file format for each problem instance." 
}, { - "type": "NarrativeText", - "element_id": "444f48f6d4f0ee6d3a04b7bf76218980", + "type": "UncategorizedText", + "element_id": "05f82fa1685502a356c0894aa45b404d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Number of Number of columns in Description lines each line" + "text": "1 1 n" }, { - "type": "UncategorizedText", - "element_id": "05f82fa1685502a356c0894aa45b404d", + "type": "Title", + "element_id": "acac86c0e609ca906f632b0e2dacccb2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "1 1 n" + "text": "l" }, { "type": "Title", - "element_id": "acac86c0e609ca906f632b0e2dacccb2", + "element_id": "151e509ce97fe40eecae3822c78adcf5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "l" + "text": "Number of lines" + }, + { + "type": "Table", + "element_id": "e33daf2e73d705ed4b27cd4e8fee5f5f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "Number of Number of columns in Description lines each line 1 3 The number of depots, the number of trips, and the number of locations. 1 m The number of vehicles rg at each depot d. n 4 One line for each trip, i= 1,2, ...,n. Each line provides the start location [?, the start time ¢%, the end location [F and the end time ¢¢ for the corresponding trip. I I Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j." 
}, { "type": "UncategorizedText", @@ -630,8 +710,28 @@ "text": "l" }, { - "type": "ListItem", - "element_id": "f096a8499e50cac1f45ceb8340dace5a", + "type": "Title", + "element_id": "0d42fdb9458af19413eee0a1227f415c", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "Number of columns in each line" + }, + { + "type": "Title", + "element_id": "526e0087cc3f254d9f86f6c7d8e23d95", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "Description" + }, + { + "type": "NarrativeText", + "element_id": "d2660f6e66916959c5de8a941bfa89c7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -641,7 +741,27 @@ }, { "type": "Title", - "element_id": "764eef872135149aaf95224bab69c844", + "element_id": "8ee69286d5f681913dbfdeb60bedc572", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "i , the end location le" + }, + { + "type": "Title", + "element_id": "08238905e7bba7115b7d7d58fef13ec6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "i , the start" + }, + { + "type": "Title", + "element_id": "5b0294965f25f778012e27476e7ec042", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -670,7 +790,7 @@ "text": "Our dataset provides start/end location and time of trips as well as the travel time between any two locations. The location and time information is required to model the MDVSP on a time-space network. The feasible connections and the cost of connections between the trips can be obtained as discussed in [3]. Thus, the dataset has all the information that is required to model the MDVSP on the time-space network (see [2]) as well as the connection-network (see [5]). The benchmark solutions for all the problem instances are presented in [3]." 
}, { - "type": "NarrativeText", + "type": "Title", "element_id": "81db7fab0806640b0cbbac862671704f", "metadata": { "data_source": {}, @@ -681,42 +801,72 @@ }, { "type": "NarrativeText", - "element_id": "d434a0e19d0d34e92936b9566e1ebb45", + "element_id": "8f0264ba00616d29c2648dc51f24b439", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Transparency document associated with this article can be found in the online version at https://doi." + "text": "Transparency document associated with this article can be found in the online version at https://doi. org/10.1016/j.dib.2018.12.055." }, { - "type": "UncategorizedText", - "element_id": "fa783fbedd3cbd108b99d04da7fb7e8b", + "type": "Title", + "element_id": "e56261e0bd30965b8e68ed2abb15b141", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "org/10.1016/j.dib.2018.12.055." + "text": "References" }, { - "type": "Title", - "element_id": "e56261e0bd30965b8e68ed2abb15b141", + "type": "ListItem", + "element_id": "6e1b1affc6fddc7c465dff0416c8a234", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "References" + "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling problem, Networks 19 (5) (1989) 531-548." }, { - "type": "NarrativeText", - "element_id": "ba0af0b44e7cc27de119a1771c07dfc2", + "type": "ListItem", + "element_id": "be401eb5b247632c2f3966e4c37dd8ae", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time-space network based exact optimization model for multi-depot bus scheduling, Eur. J. Oper. Res. 175 (3) (2006) 1616-1627." 
+ }, + { + "type": "ListItem", + "element_id": "dd8920331ab639dbe3fd39605c0d583f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "[3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457-487." + }, + { + "type": "ListItem", + "element_id": "33edf93e6f8900c4bccbff43de487158", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling problem, J. Sched. 12 (1) (2009) 17." + }, + { + "type": "ListItem", + "element_id": "ec1963edde66d2c57c5ff9f05b5829c8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling problem, Networks 19 (5) (1989) 531-548. [2] N. Kliewer, T. Mellouli, L. Suhl, A time-space network based exact optimization model for multi-depot bus scheduling, Eur. J. Oper. Res. 175 (3) (2006) 1616-1627. [3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457-487. [4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling problem, J. Sched. 12 (1) (2009) 17. [5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1) (1994) 41-52." + "text": "[5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1) (1994) 41-52." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index 24ce361e7b..ea0709f203 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -1,7 +1,7 @@ [ { - "type": "UncategorizedText", - "element_id": "055b9fd1463ee2c4481b4eb9e20d4b0f", + "type": "Header", + "element_id": "cda1ae2f061dbdafb3374e6411d3823e", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -10,8 +10,8 @@ "text": "S32" }, { - "type": "Title", - "element_id": "b8b976f4707d2af116239c70acf8f2be", + "type": "Header", + "element_id": "d7106f2241a37dc4e61314f45da1ff5b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -21,7 +21,7 @@ }, { "type": "NarrativeText", - "element_id": "d16d8a1280ba2acf52f98e9d3c9c2301", + "element_id": "f7573da2765829e5fcbc8eed02057106", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -40,7 +40,7 @@ "text": "Discussion: Our data confirm previous findings on reduced slow wave density in FEP, and expand them to acute subjects, before any treatment is prescribed. This is in line with available data on diffuse abnormalities of cortico-cortical and cortico-thalamic networks in these patients. Interestingly, our data also offer preliminary evidence that this deficit is specific for SCZ, as it appears to differentiate patients who developed SCZ from those with other diagnoses at follow-up. Given the traveling properties of slow waves, future research should establish their potential as markers of connectivity in SCZ." 
}, { - "type": "Title", + "type": "NarrativeText", "element_id": "c02ccab64d2a356a96f5394a2b92fa0b", "metadata": { "data_source": {}, @@ -91,7 +91,7 @@ }, { "type": "NarrativeText", - "element_id": "6164e852cb79f9408e833e350240ac5c", + "element_id": "9e7cc386b1093b082bccf936861747aa", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -100,7 +100,7 @@ "text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. 
Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design." }, { - "type": "Title", + "type": "NarrativeText", "element_id": "80abb04ec613b1d325ce6b8d0bb3349d", "metadata": { "data_source": {}, @@ -120,8 +120,8 @@ "text": "Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2," }, { - "type": "Title", - "element_id": "3aa954bd1e29835edef83b7cd04e9769", + "type": "NarrativeText", + "element_id": "117f7774fd093a60d964cc5b461f3e22", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -131,7 +131,7 @@ }, { "type": "Title", - "element_id": "574d62523bf0c0a56967c26c82840550", + "element_id": "44b59a545030365cd1ad225ed05ff22d", "metadata": { "data_source": {}, "filetype": "application/pdf", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 99b11a3a14..a05bf96dd0 100644 --- 
a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -30,27 +30,27 @@ "text": "1 2 0 2" }, { - "type": "Title", - "element_id": "2e26dc2c4d8d6e4e53865d5697d3a983", + "type": "Header", + "element_id": "f03c6d91abe08ae952f1122ce62bb508", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "n u J" + "text": "2103.15348v2 [cs.CV] 21 Jun" }, { "type": "UncategorizedText", - "element_id": "f71998fe363b9c29116c80b5eecf33a2", + "element_id": "2bc84f0cc92df12c750ef7cc180fa144", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "1 2" + "text": "2 v 8 4 3 5 1 . 3 0 1 2 : v i X r a" }, { - "type": "ListItem", + "type": "NarrativeText", "element_id": "4fcc5b6364213b1efa9272bdce4f9fcd", "metadata": { "data_source": {}, @@ -60,154 +60,154 @@ "text": "1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca" }, { - "type": "UncategorizedText", - "element_id": "cfae0d4248f7142f7b17f826cd7a5192", + "type": "NarrativeText", + "element_id": "be90d2640470e975e3402d19ba2c66cf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "]" + "text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. 
Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io." }, { - "type": "Title", - "element_id": "19d05c4115a6b94b3b470e7c10e29698", + "type": "NarrativeText", + "element_id": "e66a3d2b6c9a872c53e226d8e0cc0a0e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "V C . s c [" + "text": "Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit." }, { - "type": "UncategorizedText", - "element_id": "2bc84f0cc92df12c750ef7cc180fa144", + "type": "Title", + "element_id": "3fa53fc0dab8ef96d05d8fd4c7e41b49", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "2 v 8 4 3 5 1 . 
3 0 1 2 : v i X r a" + "text": "Introduction" }, { "type": "NarrativeText", - "element_id": "be90d2640470e975e3402d19ba2c66cf", + "element_id": "bca638b88125eed8a8003e46a6055618", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io." 
+ "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11," }, { - "type": "NarrativeText", - "element_id": "e66a3d2b6c9a872c53e226d8e0cc0a0e", + "type": "Title", + "element_id": "0119810584ee0b01e4d14dfd8c250bf2", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit." + "text": "2 Z. Shen et al." }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "NarrativeText", + "element_id": "82d5520be5fd847464727f56151d316c", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "1" + "text": "37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual specification of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and benefit a broad spectrum of large-scale document digitization projects." }, { - "type": "Title", - "element_id": "3fa53fc0dab8ef96d05d8fd4c7e41b49", + "type": "NarrativeText", + "element_id": "836e6ef5cecc9a73356c0d5bee181829", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Introduction" + "text": "LayoutParser provides a unified toolkit to support DL-based document image analysis and processing. 
To address the aforementioned challenges, LayoutParser is built with the following components:" }, { "type": "NarrativeText", - "element_id": "bca638b88125eed8a8003e46a6055618", + "element_id": "c1f1ba1630bc19bd24c1dfbc1548f2d8", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11," + "text": "However, there are several practical difficulties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would benefit the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-fledged infrastructure for easily curating the target document image datasets and fine-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the final outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it difficult for research teams to learn about how full pipelines are implemented and leads them to invest significant resources in reinventing the DIA wheel." 
}, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "NarrativeText", + "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "2" + "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." }, { - "type": "Title", - "element_id": "22364b7a1d2b35282b360d61ae08e2b9", + "type": "ListItem", + "element_id": "18b1855acfb386ae6e6a253da566e93b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Z. Shen et al." + "text": "4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)" }, { - "type": "NarrativeText", - "element_id": "82d5520be5fd847464727f56151d316c", + "type": "ListItem", + "element_id": "22b127e6d05ce12ad9b9170909c64bbc", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual specification of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and benefit a broad spectrum of large-scale document digitization projects." + "text": "1. 
An off-the-shelf toolkit for applying DL models for recognition, and other DIA tasks (Section Bp ayout det ection, character" }, { - "type": "NarrativeText", - "element_id": "c1f1ba1630bc19bd24c1dfbc1548f2d8", + "type": "ListItem", + "element_id": "569ce8891b02bc38f50a0cde0039e951", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "However, there are several practical difficulties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would benefit the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-fledged infrastructure for easily curating the target document image datasets and fine-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the final outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it difficult for research teams to learn about how full pipelines are implemented and leads them to invest significant resources in reinventing the DIA wheel." + "text": "2. 
A rich repository of pre-trained neural network models (Model Zoo) that" }, { - "type": "NarrativeText", - "element_id": "836e6ef5cecc9a73356c0d5bee181829", + "type": "ListItem", + "element_id": "18dcbc2839f9783d2c91cbce75d3e685", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "LayoutParser provides a unified toolkit to support DL-based document image analysis and processing. To address the aforementioned challenges, LayoutParser is built with the following components:" + "text": "3. Comprehensive tools for efficient document image data annotation and model" }, { "type": "ListItem", - "element_id": "dc2c331204369d29f5bdcd8dc88a8174", + "element_id": "e4b1d076c9e9c84a45bd11fcf816bddf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "1. An off-the-shelf toolkit for applying DL models for recognition, and other DIA tasks (Section Bp ayout det ection, character 2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage 3. Comprehensive tools for efficient document image tuning to support different levels of customization 4. A DL model hub and community platform for t tion, and discussion of DIA models and pipeline: reproducibility, and extensibility (Section [4) ne easy S. ata annotation and model haring, distribu- s, to promote reusability," + "text": "Comprehensive tools for efficient document image tuning to support different levels of customization ata annotation and model" }, { - "type": "NarrativeText", - "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", + "type": "ListItem", + "element_id": "90deab7b4ea81483c3431cebb1621c61", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. 
Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." + "text": "A rich repository of pre-trained neural network models (Model Zoo) underlies the off-the-shelf usage" }, { "type": "NarrativeText", @@ -220,8 +220,8 @@ "text": "LayoutParser is well aligned with recent efforts for improving DL model reusability in other disciplines like natural language processing [8, 34] and com- puter vision [35], but with a focus on unique challenges in DIA. We show LayoutParser can be applied in sophisticated and large-scale digitization projects" }, { - "type": "Title", - "element_id": "69c327f77af9a7259f0febf0dffa7e1a", + "type": "Header", + "element_id": "4c2478cf439baab6ace34761eda527d9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -229,16 +229,6 @@ }, "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" }, - { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "3" - }, { "type": "NarrativeText", "element_id": "74a7758f83612467af8eea9d20e4a6f7", @@ -300,8 +290,8 @@ "text": "Recent years have also seen numerous efforts to create libraries for promoting reproducibility and reusability in the field of DL. Libraries like Dectectron2 [35]," }, { - "type": "ListItem", - "element_id": "bbde5bc98ffe50bc4557c848cb1a0473", + "type": "NarrativeText", + "element_id": "77ddbbd89513c49479cd4dad3261d07d", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -310,34 +300,24 @@ "text": "6 The number shown is obtained by specifying the search type as ‘code’. 
7 https://ocr-d.de/en/about 8 https://github.com/BobLd/DocumentLayoutAnalysis 9 https://github.com/leonlulu/DeepLayout 10 https://github.com/hpanwar08/detectron2 11 https://github.com/JaidedAI/EasyOCR 12 https://github.com/PaddlePaddle/PaddleOCR" }, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "4" - }, - { - "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "type": "ListItem", + "element_id": "90b6d90b1496cbc35cb08e310e03d063", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Z. Shen et al." + "text": "Shen et al. ~ N n" }, { - "type": "FigureCaption", + "type": "Image", "element_id": "812dcaaec927a84d57af36e20adb5ded", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY" + "text": " Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY " }, { "type": "NarrativeText", @@ -390,7 +370,7 @@ "text": "At the core of LayoutParser is an off-the-shelf toolkit that streamlines DL- based document image analysis. Five components support a simple interface with comprehensive functionalities: 1) The layout detection models enable using pre-trained or self-trained DL models for layout detection with just four lines of code. 
2) The detected layout information is stored in carefully engineered" }, { - "type": "NarrativeText", + "type": "Header", "element_id": "4c2478cf439baab6ace34761eda527d9", "metadata": { "data_source": {}, @@ -400,34 +380,34 @@ "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "FigureCaption", + "element_id": "b51f99cb953082a922ba43c09d4492b3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "5" + "text": "Table 1: Current layout detection models in the LayoutParser model zoo" }, { "type": "NarrativeText", - "element_id": "b51f99cb953082a922ba43c09d4492b3", + "element_id": "ec22445e13875ab6bbce602dd7f07c99", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "Table 1: Current layout detection models in the LayoutParser model zoo" + "text": "PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]" }, { - "type": "NarrativeText", - "element_id": "ec22445e13875ab6bbce602dd7f07c99", + "type": "Title", + "element_id": "4411e525721e7dd801755882fd2361b2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]" + "text": "Dataset" }, { "type": "Table", @@ -436,20 +416,10 @@ "data_source": {}, "filetype": "application/pdf", "page_number": 5, - "text_as_html": "
    Dataset| Base Model'|Large Model| Notes
    PubLayNet B8]|F/MMLayouts of modern scientific documents
    M-Layouts of scanned modern magazines and scientific reports
    F-Layouts of scanned US newspapers from the 20th century
    TableBankFFnd business document. Table region on modern scientific
    HJDatasetF/M-Layouts of history Japanese documents
    " + "text_as_html": "
    Dataset| Base Model'| Large Model| Notes
    PubLayNet B8]|F/MMLayouts of modern scientific documents
    PRImAM-Layouts of scanned modern magazines and scientific reports
    NewspaperF-Layouts of scanned US newspapers from the 20th century
    TableBankFFTable region on modern scientific and business document
    HJDatasetF/M-Layouts of history Japanese documents
    " }, "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents" }, - { - "type": "Title", - "element_id": "4411e525721e7dd801755882fd2361b2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Dataset" - }, { "type": "Title", "element_id": "e204034a86be67f09ca103677799d7af", @@ -491,7 +461,7 @@ "text": "Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents" }, { - "type": "NarrativeText", + "type": "Footer", "element_id": "c24bcb2cf98d6226bd805b6f99d3b61a", "metadata": { "data_source": {}, @@ -512,26 +482,26 @@ }, { "type": "NarrativeText", - "element_id": "11dff8778699e76422be6b86c9eaa62a", + "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. 
LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:" + "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." }, { "type": "NarrativeText", - "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9", + "element_id": "11dff8778699e76422be6b86c9eaa62a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." + "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. 
LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:" }, { - "type": "ListItem", + "type": "NarrativeText", "element_id": "e416e69991bf6a4b338df18ebdb6e712", "metadata": { "data_source": {}, @@ -550,25 +520,15 @@ }, "text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering different languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are significantly different from the training dataset. As document structures and layouts vary greatly in different domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///." }, - { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "6" - }, { "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "element_id": "5c44994a44f74b706d8a5e74cd753a8b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "Z. Shen et al." + "text": "6 Z. Shen et al." }, { "type": "Image", @@ -590,16 +550,6 @@ }, "text": "3.2 Layout Data Structures" }, - { - "type": "NarrativeText", - "element_id": "cafae07120d714f0822e89865adf62da", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. 
They all support the same set of transformation and operation APIs for maximum flexibility." - }, { "type": "NarrativeText", "element_id": "7461d30ee7c51c91bca8003792d43bfe", @@ -621,24 +571,24 @@ "text": "A critical feature of LayoutParser is the implementation of a series of data structures and operations that can be used to efficiently process and manipulate the layout elements. In document image analysis pipelines, various post-processing on the layout analysis model outputs is usually required to obtain the final outputs. Traditionally, this requires exporting DL model outputs and then loading the results into other pipelines. All model outputs from LayoutParser will be stored in carefully engineered data types optimized for further processing, which makes it possible to build an end-to-end document digitization pipeline within LayoutParser. There are three key components in the data structure, namely the Coordinate system, the TextBlock, and the Layout. They provide different levels of abstraction for the layout data, and a set of APIs are supported for transformations or operations on these classes." }, { - "type": "Title", - "element_id": "69c327f77af9a7259f0febf0dffa7e1a", + "type": "FigureCaption", + "element_id": "cafae07120d714f0822e89865adf62da", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 7 + "page_number": 6 }, - "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" + "text": "Fig. 2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility." 
}, { - "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "type": "Title", + "element_id": "4c2478cf439baab6ace34761eda527d9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "7" + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" }, { "type": "Title", @@ -652,36 +602,36 @@ }, { "type": "NarrativeText", - "element_id": "e284bd66511cfa064681253e7ac57a9a", + "element_id": "f2a3e5fbb983d9132dddecc381ed6e0b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:" + "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be specified and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. 
Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13." }, { "type": "NarrativeText", - "element_id": "eec800eef6e395c21feacd729868dd18", + "element_id": "e284bd66511cfa064681253e7ac57a9a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort." + "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. 
It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:" }, { "type": "NarrativeText", - "element_id": "f2a3e5fbb983d9132dddecc381ed6e0b", + "element_id": "eec800eef6e395c21feacd729868dd18", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be specified and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13." + "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. 
They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort." }, { - "type": "ListItem", + "type": "NarrativeText", "element_id": "55ab2654fa8c2c01de322b52f4fad508", "metadata": { "data_source": {}, @@ -712,7 +662,7 @@ }, { "type": "NarrativeText", - "element_id": "8bcb4c948fda07d2fdbf7d582983b93e", + "element_id": "9a44827ec5ebbf51ad441ff9927c6e83", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -721,84 +671,84 @@ "text": "13 This is also available in the LayoutParser documentation pages." }, { - "type": "UncategorizedText", - "element_id": "2c624232cdd221771294dfbb310aca00", + "type": "ListItem", + "element_id": "3993b330c2b3b86513c3edbcd33afc91", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "8" + "text": "Z. Shen et al." }, { "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "element_id": "6727ba436ddf5e47087d005ded6c049f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Z. Shen et al." + "text": "Table 2: All operations supported by the layout elements. The same APIs are supported across different layout element classes including Coordinate types, TextBlock and Layout." }, { - "type": "NarrativeText", - "element_id": "6727ba436ddf5e47087d005ded6c049f", + "type": "Title", + "element_id": "abf4059c5c98ff5bbd0dde9f8c2b7c75", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Table 2: All operations supported by the layout elements. The same APIs are supported across different layout element classes including Coordinate types, TextBlock and Layout." 
+ "text": "Operation Name" }, { "type": "Title", - "element_id": "abf4059c5c98ff5bbd0dde9f8c2b7c75", + "element_id": "526e0087cc3f254d9f86f6c7d8e23d95", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "Operation Name" + "text": "Description" }, { "type": "Title", - "element_id": "505791f52a5741b58f5dd02836da7b31", + "element_id": "8dcb74f5ee2eabd0d8e966d46bcdf3be", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block1.union(block2)" + "text": "block.scale(fx, fy)" }, { "type": "Title", - "element_id": "acfa5090fbb8986000a92d84d41d8140", + "element_id": "1c1464d6a8f85d78202f67293ee7ac42", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block1.is in(block2)" + "text": "block.shift(dx, dy)" }, { "type": "Title", - "element_id": "8dcb74f5ee2eabd0d8e966d46bcdf3be", + "element_id": "acfa5090fbb8986000a92d84d41d8140", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block.scale(fx, fy)" + "text": "block1.is in(block2)" }, { "type": "Title", - "element_id": "1c1464d6a8f85d78202f67293ee7ac42", + "element_id": "505791f52a5741b58f5dd02836da7b31", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8 }, - "text": "block.shift(dx, dy)" + "text": "block1.union(block2)" }, { "type": "Title", @@ -832,14 +782,14 @@ }, { "type": "Table", - "element_id": "f81d4915b54758e0d4d52af3566bb813", + "element_id": "f73e2a20abbf1180916a4b29b15e3b32", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, - "text_as_html": "
    Operation NameDescription
    block.pad(top, bottom,right,left)| Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio ; in x and y direction
    . block.shift(dx, dy)Move the current block with the shift : : a distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    . block1. intersect (block2)Return the intersection region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
    . block1.union(block2)Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
    . block1.relative_to(block2)Convert the absolute coordinates of block to ' ' relative coordinates to block2
    . block1.condition_on(block2)Calculate the absolute coordinates of blockl given . the canvas block2’s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    " + "text_as_html": "
    block.pad(top, bottom,right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs.
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs.
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2) block. crop_image (image)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates Obtain the image segments in the block region
    " }, - "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. crop_image (image) Obtain the image segments in the block region" + "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. 
(image) Obtain the in the block" }, { "type": "NarrativeText", @@ -851,16 +801,6 @@ }, "text": "block.pad(top, bottom, right, left) Enlarge the current block according to the input" }, - { - "type": "Title", - "element_id": "526e0087cc3f254d9f86f6c7d8e23d95", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Description" - }, { "type": "NarrativeText", "element_id": "6bd7ba22b5bc477ef4c291a10f4745bc", @@ -1002,7 +942,7 @@ "text": "14 https://altoxml.github.io" }, { - "type": "NarrativeText", + "type": "ListItem", "element_id": "4c2478cf439baab6ace34761eda527d9", "metadata": { "data_source": {}, @@ -1011,16 +951,6 @@ }, "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" }, - { - "type": "UncategorizedText", - "element_id": "19581e27de7ced00ff1ce50b2047e7a5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "9" - }, { "type": "Image", "element_id": "6df6057f894a166cf24fd34f64267f09", @@ -1072,24 +1002,14 @@ "text": "After the training dataset is curated, LayoutParser supports different modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are significantly different and a large training set is available. However, as suggested in Studer et al.’s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally different domains, can still boost model performance. Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets." 
}, { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "10" - }, - { - "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "type": "ListItem", + "element_id": "9bf176adca2cfa747e7f0255bfc3594a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "Z. Shen et al." + "text": "10 Z. Shen et al." }, { "type": "Image", @@ -1163,7 +1083,7 @@ }, { "type": "Title", - "element_id": "69c327f77af9a7259f0febf0dffa7e1a", + "element_id": "4c2478cf439baab6ace34761eda527d9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1171,16 +1091,6 @@ }, "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" }, - { - "type": "UncategorizedText", - "element_id": "4fc82b26aecb47d2868c4efbe3581732", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 11 - }, - "text": "11" - }, { "type": "NarrativeText", "element_id": "5cdbcea58a81d8f7de9a4fa841107be1", @@ -1203,7 +1113,7 @@ }, { "type": "NarrativeText", - "element_id": "59e46c1089fd1f2c58bba66545420ad6", + "element_id": "fa19ab2536cbbb48c09de29fdebd52bd", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1233,7 +1143,7 @@ }, { "type": "NarrativeText", - "element_id": "39ed00ce33ad04a4542357a1f912aed8", + "element_id": "3cbd8234ac0c6d29feb24e6202144aa8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1252,28 +1162,18 @@ "text": "& document page consists of eight rows like this. For simplicity we skip the row segmentation discussion and refer readers to the source code when available." 
}, { - "type": "UncategorizedText", - "element_id": "6b51d431df5d7f141cbececcf79edf3d", + "type": "Title", + "element_id": "de2a222ad7b9cf1e5e5432f53c15996d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "12" - }, - { - "type": "Title", - "element_id": "22364b7a1d2b35282b360d61ae08e2b9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 12 - }, - "text": "Z. Shen et al." + "text": "12 Z. Shen et al." }, { "type": "NarrativeText", - "element_id": "164904dc2ff256763b3e64f1b56a784e", + "element_id": "7174760d4c8d9b7b13da3918015312dc", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1321,35 +1221,45 @@ }, "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR." }, + { + "type": "NarrativeText", + "element_id": "d11adbfd88959ce24fbfdc7f8155e777", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 12 + }, + "text": "16 This measures the overlap between the detected and ground-truth characters, and" + }, { "type": "ListItem", - "element_id": "122f0a4bde97c6e10e95c6e54479e34e", + "element_id": "e67f07837a2a4c207b21a168c4f0aa6c", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "16 This measures the overlap between the detected and ground-truth characters, and the maximum is 1. '7 This measures the number of edits from the ground-truth text to the predicted text, and lower is better." + "text": "This measures the overlap between the detected and ground-truth characters, the maximum is 1." 
}, { - "type": "Title", - "element_id": "69c327f77af9a7259f0febf0dffa7e1a", + "type": "ListItem", + "element_id": "f06c47bb49334c82c636ac2d1fe9ec4e", "metadata": { "data_source": {}, "filetype": "application/pdf", - "page_number": 13 + "page_number": 12 }, - "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" + "text": "'7 This measures the number of edits from the ground-truth text to the predicted text, and lower is better." }, { - "type": "UncategorizedText", - "element_id": "3fdba35f04dc8c462986c992bcf87554", + "type": "ListItem", + "element_id": "4c2478cf439baab6ace34761eda527d9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 13 }, - "text": "13" + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" }, { "type": "Image", @@ -1362,7 +1272,7 @@ "text": "(2) Partial table atthe bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line" }, { - "type": "NarrativeText", + "type": "FigureCaption", "element_id": "1a2b9e59d53ac38ee6affb3ffcda6b8c", "metadata": { "data_source": {}, @@ -1373,7 +1283,7 @@ }, { "type": "Title", - "element_id": "555b11646d1541685d37f9a18773dd74", + "element_id": "76c98240da7b06b4b3fcf8109edbbaba", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1412,28 +1322,18 @@ "text": "18 https://github.com/atlanhq/camelot, https://github.com/tabulapdf/tabula" }, { - "type": "UncategorizedText", - "element_id": "8527a891e224136950ff32ca212b45bc", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "14" - }, - { - "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "type": "ListItem", + "element_id": "91e724833d5794abbd5fd6ad6c54aa9f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "Z. Shen et al." + "text": "14 Z. Shen et al." 
}, { "type": "Title", - "element_id": "35f7d23fd70cfc85a80573db030804ad", + "element_id": "a2a71736439cbc5e1445bddd40712b9b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1473,17 +1373,77 @@ }, { "type": "ListItem", - "element_id": "af2a971baba0e022d1e53fc0e44b1d94", + "element_id": "f7e8d95a8f2b84a4461e037b0a7b9704", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, ot G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Mané, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Viégas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), software available from tensorflow.org Alberti, M., Pondenkandath, V., Wiirsch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423-428. IEEE (2018) Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296-300. IEEE (2009) Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365-9374 (2019) Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale Hierarchical Image Database. In: CVPRO9 (2009) Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. 
In: International Conference on Machine Learning. pp. 980-989. PMLR (2017) Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180-1189. PMLR (2015)" + "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man´e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensorflow.org" }, { - "type": "Title", - "element_id": "69c327f77af9a7259f0febf0dffa7e1a", + "type": "ListItem", + "element_id": "24862433f743a0910da62ec3fb4f537c", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 14 + }, + "text": "[2] Alberti, M., Pondenkandath, V., W¨ursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423–428. IEEE (2018)" + }, + { + "type": "ListItem", + "element_id": "79a1f55a3945eb6304697ec72847ed35", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 14 + }, + "text": "[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296–300. 
IEEE (2009)" + }, + { + "type": "ListItem", + "element_id": "cafb24e03d3f74ce81ba82312af7bfc2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 14 + }, + "text": "[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365–9374 (2019)" + }, + { + "type": "ListItem", + "element_id": "49df59253e226989981b7fc9628ecd40", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 14 + }, + "text": "ot Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale Hierarchical Image Database. In: CVPRO9 (2009)" + }, + { + "type": "ListItem", + "element_id": "b000578a41ffcc554faac04609d2f4e1", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 14 + }, + "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)" + }, + { + "type": "ListItem", + "element_id": "c6e835fe03323406543926cc0f5a94de", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 14 + }, + "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. 
PMLR (2015)" + }, + { + "type": "ListItem", + "element_id": "4c2478cf439baab6ace34761eda527d9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1492,53 +1452,313 @@ "text": "LayoutParser: A Unified Toolkit for DL-Based DIA" }, { - "type": "UncategorizedText", - "element_id": "e629fa6598d732768f7c726b4b621285", + "type": "NarrativeText", + "element_id": "068bf90a7743f50c4a00d4827035e42f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the" + }, + { + "type": "ListItem", + "element_id": "c8f5863d94cc9b9d77f153c6d1b0015a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)" + }, + { + "type": "ListItem", + "element_id": "60fbf9d2525b5a22588082da96a41ff8", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 
2961-2969 (2017)" + }, + { + "type": "NarrativeText", + "element_id": "2f103adde52e35a8853cbb476720a6ef", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)" + }, + { + "type": "ListItem", + "element_id": "7ceaba2290e3f9c5f3754032ce4d5663", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)" + }, + { + "type": "ListItem", + "element_id": "a772a029ff3b22f4dca5f7df3fe1897b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "15" + "text": "Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J. 2007(159), 2 (Jul 2007)" }, { "type": "ListItem", - "element_id": "ab02ce354f7464ee1d53d58faa93745f", + "element_id": "1f1a0fac1bae95f076ea34c955551632", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "17 18 19 20 Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. 
arXiv preprint arXiv:1803.07640 (2018) Lukasz Garncarek, Powalski, R., Stanistawek, T., Topolski, B., Halama, P., Graliriski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020) Graves, A., Fernandez, $., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369-376 (2006) Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991-995. IEEE (2015) He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 2961-2969 (2017) He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770-778 (2016) Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J. 2007(159), 2 (Jul 2007) Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42-47. IEEE (2011) Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120-122. UIST 20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https: //doi.org/10.1145/3379350.3416143 Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055-3062. 
Association for Computing Machinery, New York, NY, USA (2020), Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019) Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740-755. Springer (2014) Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431-3440 (2015) Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, $., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161-168 (2011) Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7-12. IEEE (2018)" + "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)" + }, + { + "type": "ListItem", + "element_id": "0aabfb2a8e358618179ec2e1d322e519", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. 
lib.washington.edu/10.1145/3379350.3416143" + }, + { + "type": "ListItem", + "element_id": "df18427a8013b4df36e8ac4e2ee5da3a", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767" + }, + { + "type": "ListItem", + "element_id": "257e7b8aef89c41e03bf837ea517885e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)" + }, + { + "type": "ListItem", + "element_id": "00c7abdd98fedd1746994d16ca44d45f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)" + }, + { + "type": "ListItem", + "element_id": "7a0afd734c99f6b076dc58b2e57cfec6", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 
3431–3440 (2015)" + }, + { + "type": "ListItem", + "element_id": "00d6ff1b3fb21f8a608f3b6269df56be", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)" + }, + { + "type": "ListItem", + "element_id": "deecdfacbce71dd1425fd54010b2fad1", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 15 + }, + "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)" + }, + { + "type": "ListItem", + "element_id": "5c44994a44f74b706d8a5e74cd753a8b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "6 Z. Shen et al." + }, + { + "type": "ListItem", + "element_id": "c9d8f6434425015c72f94fb212bba28f", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)" + }, + { + "type": "ListItem", + "element_id": "9c3e13a0e9738b846289bff06952da3b", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 
91–99 (2015)" + }, + { + "type": "ListItem", + "element_id": "bd680d8baa57cc15337de2e0c299d121", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)" }, { "type": "UncategorizedText", - "element_id": "b17ef6d19c7a5b1ee83b907c595526dc", + "element_id": "b66713d3f2d1689f9174e1cb87429eed", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "16" + "text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning" + }, + { + "type": "UncategorizedText", + "element_id": "10a3ff59f6157f21733e659a41031f83", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of" }, { "type": "NarrativeText", - "element_id": "3993b330c2b3b86513c3edbcd33afc91", + "element_id": "219033258f3fff3de33bed379610c8f3", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "Z. Shen et al." + "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. 
IEEE (2010)" + }, + { + "type": "ListItem", + "element_id": "21d151e4c182a1f441c3486d2f79afc0", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572–573 (2020)" + }, + { + "type": "NarrativeText", + "element_id": "a18dcb504d62cb9f8ed4641014b6eeb2", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)" + }, + { + "type": "ListItem", + "element_id": "4c8ddc159ec208bb7f454603fcd7c4bd", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. 
IEEE (2019)" + }, + { + "type": "NarrativeText", + "element_id": "385c241b43ef196663b8d30a6b8768ed", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) [35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https://" + }, + { + "type": "ListItem", + "element_id": "6c94dd219ce339c358163833e20d099e", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)" + }, + { + "type": "ListItem", + "element_id": "94ce48002d0ae80dc04f26a5dd2e8f11", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of text and layout for document image understanding (2019)" + }, + { + "type": "ListItem", + "element_id": "5657166191992144b2b06f2bd05ffabf", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "github. com/facebookresearch/detectron2) (2019) Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2." + }, + { + "type": "ListItem", + "element_id": "c1780f7a01a76540c5eb5cecf1a2270d", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning based layout annotation. 
arXiv preprint arXiv:2010.01762 (2020)" + }, + { + "type": "Title", + "element_id": "21d399ba787aabbf69a8ca861cbcc4a3", + "metadata": { + "data_source": {}, + "filetype": "application/pdf", + "page_number": 16 + }, + "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:" }, { "type": "ListItem", - "element_id": "993f472d953f5d0e4054f1d4ad6fc4f0", + "element_id": "435e423f8ca655521a6fe38e8e0a3e1d", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "23 github. com/facebookresearch/detectron2) (2019) Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257-260. IEEE (2010) Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572-573 (2020) Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142-147. IEEE (2019) Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91-99 (2015) Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. 
IEEE transactions on neural networks 20(1), 61-80 (2008) Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017) Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548-549 (2020) Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning based layout annotation. arXiv preprint arXiv:2010.01762 (2020) Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720-725. IEEE (2019) Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020) Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of text and layout for document image understanding (2019) Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (Sep 2019). 
https: //doi.org/10.1109/ICDAR.2019.00166" + "text": "Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (Sep 2019). https: //doi.org/10.1109/ICDAR.2019.00166" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json b/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json new file mode 100644 index 0000000000..cffc024be5 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "94efbf7307081f8f45b11a183ad99254", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Mission, Vision, Values" + }, + { + "type": "NarrativeText", + "element_id": "f116dc480f737022b3eef55d2095d808", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: A company mission provides direction and purpose, aligning actions and decisions towards a common goal. It also helps attract like-minded individuals who share the same values and vision for the company." 
+ } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index a8cc14e267..6fd331f40f 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -36,7 +36,7 @@ "text": "WORLD ECONOMIC OUTLOOK UPDATE Inflation Peaking amid Low Growth" }, { - "type": "UncategorizedText", + "type": "Title", "element_id": "98e636ffa4ea25e037f659685a56f41d", "metadata": { "data_source": { @@ -91,7 +91,7 @@ }, { "type": "ListItem", - "element_id": "4f0cdff19ccd9010b64eff87ced8e0b7", + "element_id": "9fe27138e05d3a42d1e5cc57bc1fbc54", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -105,11 +105,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "© Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000-19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way Jor a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017-19) levels of about 3.5 percent. © = The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. 
On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress. © In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." + "text": " Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000–19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way for a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017–19) levels of about 3.5 percent." 
}, { - "type": "Title", - "element_id": "0953470500eb215048fd49263b8829a4", + "type": "ListItem", + "element_id": "56b3c7e61958b8308bb1ab927b6cdc2c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -123,11 +123,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Forces Shaping the Outlook" + "text": "© = The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress." }, { - "type": "NarrativeText", - "element_id": "bb50ad035681bfb501e33a52abe173ad", + "type": "ListItem", + "element_id": "cdf520693b6ec6dc4877bc4aedea746c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -141,11 +141,29 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "The global fight against inflation, Russia’s war in Ukraine, and a resurgence of COVID-19 in China weighed on global economic activity in 2022, and the first two factors will continue to do so in 2023." + "text": "tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. 
Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." + }, + { + "type": "Title", + "element_id": "0953470500eb215048fd49263b8829a4", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", + "version": 265756457651539296174748931590365722430, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" + }, + "date_modified": "2023-02-14T07:31:28" + }, + "filetype": "application/pdf", + "page_number": 2 + }, + "text": "Forces Shaping the Outlook" }, { "type": "NarrativeText", - "element_id": "041668dbcf5b0c4114acae7ef393f5cd", + "element_id": "bb50ad035681bfb501e33a52abe173ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -159,11 +177,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Despite these headwinds, real GDP was surprisingly strong in the third quarter of 2022 in numerous economies, including the United States, the euro area, and major emerging market and developing economies. The sources of these surprises were in many cases domestic: stronger-than-expected private consumption and investment amid tight labor markets and greater-than-anticipated fiscal support. Households spent more to satisfy pent-up demand, particularly on services, partly by drawing down their stock of savings as economies reopened. Business investment rose to meet demand. On the supply side, easing bottlenecks and declining transportation costs reduced pressures on input prices and allowed for a rebound in previously constrained sectors, such as motor vehicles. 
Energy markets have adjusted faster than expected to the shock from Russia’s invasion of Ukraine." + "text": "The global fight against inflation, Russia’s war in Ukraine, and a resurgence of COVID-19 in China weighed on global economic activity in 2022, and the first two factors will continue to do so in 2023." }, { "type": "NarrativeText", - "element_id": "42213af1ed4e31e1ce00eba6ce07ee5e", + "element_id": "041668dbcf5b0c4114acae7ef393f5cd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -177,11 +195,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "In the fourth quarter of 2022, however, this uptick is estimated to have faded in most—though not all––major economies. US growth remains stronger than expected, with consumers continuing to spend from their stock of savings (the personal saving rate is at its lowest in more than 60 years, except for July 2005), unemployment near historic lows, and plentiful job opportunities. But elsewhere, high-frequency activity indicators (such as business and consumer sentiment, purchasing manager surveys, and mobility indicators) generally point to a slowdown." + "text": "Despite these headwinds, real GDP was surprisingly strong in the third quarter of 2022 in numerous economies, including the United States, the euro area, and major emerging market and developing economies. The sources of these surprises were in many cases domestic: stronger-than-expected private consumption and investment amid tight labor markets and greater-than-anticipated fiscal support. Households spent more to satisfy pent-up demand, particularly on services, partly by drawing down their stock of savings as economies reopened. Business investment rose to meet demand. On the supply side, easing bottlenecks and declining transportation costs reduced pressures on input prices and allowed for a rebound in previously constrained sectors, such as motor vehicles. 
Energy markets have adjusted faster than expected to the shock from Russia’s invasion of Ukraine." }, { - "type": "Title", - "element_id": "b3080428cb4e8896623bf36c001e868a", + "type": "NarrativeText", + "element_id": "42213af1ed4e31e1ce00eba6ce07ee5e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -195,11 +213,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "International Monetary Fund | January 2023" + "text": "In the fourth quarter of 2022, however, this uptick is estimated to have faded in most—though not all––major economies. US growth remains stronger than expected, with consumers continuing to spend from their stock of savings (the personal saving rate is at its lowest in more than 60 years, except for July 2005), unemployment near historic lows, and plentiful job opportunities. But elsewhere, high-frequency activity indicators (such as business and consumer sentiment, purchasing manager surveys, and mobility indicators) generally point to a slowdown." 
}, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "ListItem", + "element_id": "c99869e52743869e29fd645e9e0df6fb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -213,7 +231,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "1" + "text": "International Monetary Fund | January 2023 1" }, { "type": "Title", @@ -289,7 +307,7 @@ }, { "type": "UncategorizedText", - "element_id": "28a5aa3897d66de6c31caba99a4c337e", + "element_id": "c2c7be4534a60790d1d18451c91dc138", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -303,11 +321,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "–2" + "text": "16 14 12 10 8 6 4 2 0" }, { "type": "UncategorizedText", - "element_id": "c2c7be4534a60790d1d18451c91dc138", + "element_id": "28a5aa3897d66de6c31caba99a4c337e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -321,11 +339,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "16 14 12 10 8 6 4 2 0" + "text": "–2" }, { - "type": "UncategorizedText", - "element_id": "c7c72889cb49cf43d9bd1f892db1be2c", + "type": "Title", + "element_id": "323d79e74460eda1fb0f8d55a2e0ff42", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -339,11 +357,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Jan. 2019" + "text": "Median country Brazil" }, { - "type": "UncategorizedText", - "element_id": "c7c72889cb49cf43d9bd1f892db1be2c", + "type": "Title", + "element_id": "49dca65f362fee401292ed7ada96f962", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -357,11 +375,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Jan. 
2019" + "text": "United States" }, { - "type": "ListItem", - "element_id": "63e35649dd179389ecc7251e1503489a", + "type": "Title", + "element_id": "007b2203e9e86a49c3108e9ffd16fbbc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -375,11 +393,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "1. Headline Inflation" + "text": "Euro area" }, { - "type": "ListItem", - "element_id": "b790ab5fcad28bbedb50b568b3adeca2", + "type": "Title", + "element_id": "cc874418b59b7ecb37a2c938783fb5ce", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -393,11 +411,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "2. Core Inflation" + "text": "Nov. 22" }, { "type": "Title", - "element_id": "323d79e74460eda1fb0f8d55a2e0ff42", + "element_id": "cc874418b59b7ecb37a2c938783fb5ce", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -411,11 +429,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Median country Brazil" + "text": "Nov. 22" }, { - "type": "Title", - "element_id": "646612b0a62b59fd13be769b4590a9ac", + "type": "NarrativeText", + "element_id": "6814df88a59d11e9fcf76a7ed0f5fdfc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -429,11 +447,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Jul. 19" + "text": "Winter comes to Europe. European economic growth in 2022 was more resilient than expected in the face of the large negative terms-of-trade shock from the war in Ukraine. 
This resilience––which is" }, { - "type": "Title", - "element_id": "646612b0a62b59fd13be769b4590a9ac", + "type": "ListItem", + "element_id": "3a162049bc9ee88b56d4d4bf5897368f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -447,11 +465,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Jul. 19" + "text": "2 International Monetary Fund | January 2023" }, { "type": "Title", - "element_id": "7a4f82ed474f82c26a8b867becaf89ba", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -463,13 +481,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jan. 20" + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { - "type": "Title", - "element_id": "7a4f82ed474f82c26a8b867becaf89ba", + "type": "NarrativeText", + "element_id": "83ce77349b07c275543d551c2c016370", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -481,13 +499,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jan. 20" + "text": "visible in consumption and investment data for the third quarter––partly reflects government support of about 1.2 percent of European Union GDP (net budgetary cost) to households and firms hit by the energy crisis, as well as dynamism from economies reopening. Gas prices have declined by more than expected amid higher non-Russian pipeline and liquefied natural gas flows, compression of demand for gas, and a warmer-than-usual winter. However, the boost from reopening appears to be fading. High-frequency indicators for the fourth quarter suggest that the manufacturing and services sectors are contracting. Consumer confidence and business sentiment have worsened. 
With inflation at about 10 percent or above in several euro area countries and the United Kingdom, household budgets remain stretched. The accelerated pace of rate increases by the Bank of England and the European Central Bank is tightening financial conditions and cooling demand in the housing sector and beyond." }, { "type": "Title", - "element_id": "6d2f5e3c057e12c92023d5501c3fd075", + "element_id": "26a20452d058d66ad402559f659cec7c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -499,13 +517,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jul. 20" + "text": "The Forecast" }, { "type": "Title", - "element_id": "6d2f5e3c057e12c92023d5501c3fd075", + "element_id": "5779b9b7d25794d3b4ed1fe4e61f6617", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -517,13 +535,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jul. 20" + "text": "Growth Bottoming Out" }, { - "type": "Title", - "element_id": "49dca65f362fee401292ed7ada96f962", + "type": "NarrativeText", + "element_id": "22011dc596eec73711d7dac8d99b41b6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -535,13 +553,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "United States" + "text": "Global growth, estimated at 3.4 percent in 2022, is projected to fall to 2.9 percent in 2023 before rising to 3.1 percent in 2024 (Table 1). Compared with the October forecast, the estimate for 2022 and the forecast for 2023 are both higher by about 0.2 percentage point, reflecting positive surprises and greater-than-expected resilience in numerous economies. 
Negative growth in global GDP or global GDP per capita—which often happens when there is a global recession—is not expected. Nevertheless, global growth projected for 2023 and 2024 is below the historical (2000–19) annual average of 3.8 percent." }, { - "type": "Title", - "element_id": "f4a93992a1b09b3fa6200542fd6fde5a", + "type": "NarrativeText", + "element_id": "97e04ee873fea0151df00f7b1fb4ca42", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -553,13 +571,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jan. 21" + "text": "The forecast of low growth in 2023 reflects the rise in central bank rates to fight inflation–– especially in advanced economies––as well as the war in Ukraine. The decline in growth in 2023 from 2022 is driven by advanced economies; in emerging market and developing economies, growth is estimated to have bottomed out in 2022. Growth is expected to pick up in China with the full reopening in 2023. The expected pickup in 2024 in both groups of economies reflects gradual recovery from the effects of the war in Ukraine and subsiding inflation. Following the path of global demand, world trade growth is expected to decline in 2023 to 2.4 percent, despite an easing of supply bottlenecks, before rising to 3.4 percent in 2024." }, { - "type": "Title", - "element_id": "f4a93992a1b09b3fa6200542fd6fde5a", + "type": "NarrativeText", + "element_id": "e08dfaba8a8dc7496a44cb172319d4ba", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -571,13 +589,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jan. 
21" + "text": "These forecasts are based on a number of assumptions, including on fuel and nonfuel commodity prices, which have generally been revised down since October, and on interest rates, which have been revised up. In 2023, oil prices are projected to fall by about 16 percent, while nonfuel commodity prices are expected to fall by, on average, 6.3 percent. Global interest rate assumptions are revised up, reflecting intensified actual and signaled policy tightening by major central banks since October." }, { - "type": "Title", - "element_id": "81db94f58819ee2fd6c05ddef2082ccc", + "type": "NarrativeText", + "element_id": "73a39336fb540e7d57ec85dfa8e92799", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -589,13 +607,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jul. 21" + "text": "For advanced economies, growth is projected to decline sharply from 2.7 percent in 2022 to 1.2 percent in 2023 before rising to 1.4 percent in 2024, with a downward revision of 0.2 percentage point for 2024. About 90 percent of advanced economies are projected to see a decline in growth in 2023." }, { - "type": "Title", - "element_id": "81db94f58819ee2fd6c05ddef2082ccc", + "type": "ListItem", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -607,13 +625,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jul. 
21" + "text": "" }, { - "type": "Title", - "element_id": "babfe67b3ecc6b32db9adb9da08274bf", + "type": "ListItem", + "element_id": "e84075ae46df9d9ad37d947011c05a7f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -625,13 +643,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Jan. 22" + "text": "In the United States, growth is projected to fall from 2.0 percent in 2022 to 1.4 percent in 2023 and 1.0 percent in 2024. With growth rebounding in the second half of 2024, growth in 2024 will be faster than in 2023 on a fourth-quarter-over-fourth-quarter basis, as in most advanced" }, { - "type": "Title", - "element_id": "007b2203e9e86a49c3108e9ffd16fbbc", + "type": "ListItem", + "element_id": "ab9d11a9dd37cfd5e1876f40777a4480", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -643,13 +661,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 4 }, - "text": "Euro area" + "text": "International Monetary Fund | January 2023 3" }, { "type": "Title", - "element_id": "babfe67b3ecc6b32db9adb9da08274bf", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -661,13 +679,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "Jan. 
22" + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { - "type": "Title", - "element_id": "82debf5a182b9b394ad3a9d584a870ef", + "type": "NarrativeText", + "element_id": "67f04acf5353c625d003fd003acb56f3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -679,13 +697,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "Jul. 22" + "text": "economies. There is a 0.4 percentage point upward revision for annual growth in 2023, reflecting carryover effects from domestic demand resilience in 2022, but a 0.2 percentage point downward revision of growth in 2024 due to the steeper path of Federal Reserve rate hikes, to a peak of about 5.1 percent in 2023." }, { - "type": "Title", - "element_id": "82debf5a182b9b394ad3a9d584a870ef", + "type": "ListItem", + "element_id": "075ec12daaf7e03f8ce608829f7ecdda", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -697,13 +715,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "Jul. 22" + "text": "Growth in the ero area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers." 
}, { - "type": "Title", - "element_id": "cc874418b59b7ecb37a2c938783fb5ce", + "type": "ListItem", + "element_id": "531e21ce379680ba6ae82ebe340e897d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -715,13 +733,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "Nov. 22" + "text": "Growth in the United Kingdom is projected to be —0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets." }, { - "type": "Title", - "element_id": "cc874418b59b7ecb37a2c938783fb5ce", + "type": "ListItem", + "element_id": "968cc16a6f05e1f4c40da05632df9609", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -733,13 +751,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "Nov. 22" + "text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." }, { "type": "NarrativeText", - "element_id": "6814df88a59d11e9fcf76a7ed0f5fdfc", + "element_id": "497b28af5c258708a114b8a6766662ce", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -751,13 +769,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "Winter comes to Europe. 
European economic growth in 2022 was more resilient than expected in the face of the large negative terms-of-trade shock from the war in Ukraine. This resilience––which is" + "text": "For emerging market and developing economies, growth is projected to rise modestly, from 3.9 percent in 2022 to 4.0 percent in 2023 and 4.2 percent in 2024, with an upward revision of 0.3 percentage point for 2023 and a downward revision of 0.1 percentage point for 2024. About half of emerging market and developing economies have lower growth in 2023 than in 2022." }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "ListItem", + "element_id": "74af5288c060a6b7bc028cc0efcf59ea", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -769,13 +787,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "2" + "text": "percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024." 
}, { - "type": "Title", - "element_id": "b3080428cb4e8896623bf36c001e868a", + "type": "ListItem", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -787,13 +805,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 5 }, - "text": "International Monetary Fund | January 2023" + "text": "" }, { - "type": "Title", - "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "type": "ListItem", + "element_id": "afde979c99a73646915fe253c85c5a9c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -805,13 +823,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 5 }, - "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. 
The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" }, { - "type": "NarrativeText", - "element_id": "83ce77349b07c275543d551c2c016370", + "type": "ListItem", + "element_id": "25072141a0ed1c9474256def9a721513", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -823,13 +841,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 5 }, - "text": "visible in consumption and investment data for the third quarter––partly reflects government support of about 1.2 percent of European Union GDP (net budgetary cost) to households and firms hit by the energy crisis, as well as dynamism from economies reopening. Gas prices have declined by more than expected amid higher non-Russian pipeline and liquefied natural gas flows, compression of demand for gas, and a warmer-than-usual winter. However, the boost from reopening appears to be fading. High-frequency indicators for the fourth quarter suggest that the manufacturing and services sectors are contracting. Consumer confidence and business sentiment have worsened. With inflation at about 10 percent or above in several euro area countries and the United Kingdom, household budgets remain stretched. The accelerated pace of rate increases by the Bank of England and the European Central Bank is tightening financial conditions and cooling demand in the housing sector and beyond." 
+ "text": "4 International Monetary Fund | January 2023" }, { "type": "Title", - "element_id": "26a20452d058d66ad402559f659cec7c", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -841,13 +859,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "The Forecast" + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { - "type": "Title", - "element_id": "5779b9b7d25794d3b4ed1fe4e61f6617", + "type": "NarrativeText", + "element_id": "c9b8a2f221ce7ec3213fcf4d9ce8879c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -859,13 +877,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "Growth Bottoming Out" + "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth." }, { - "type": "NarrativeText", - "element_id": "22011dc596eec73711d7dac8d99b41b6", + "type": "ListItem", + "element_id": "25e2f1dc031b5421b8a234945098e58b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -877,13 +895,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "Global growth, estimated at 3.4 percent in 2022, is projected to fall to 2.9 percent in 2023 before rising to 3.1 percent in 2024 (Table 1). 
Compared with the October forecast, the estimate for 2022 and the forecast for 2023 are both higher by about 0.2 percentage point, reflecting positive surprises and greater-than-expected resilience in numerous economies. Negative growth in global GDP or global GDP per capita—which often happens when there is a global recession—is not expected. Nevertheless, global growth projected for 2023 and 2024 is below the historical (2000–19) annual average of 3.8 percent." + "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." 
}, { - "type": "NarrativeText", - "element_id": "97e04ee873fea0151df00f7b1fb4ca42", + "type": "Title", + "element_id": "3dfc45d3333ae253d78008c8cde2d752", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -895,13 +913,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "The forecast of low growth in 2023 reflects the rise in central bank rates to fight inflation–– especially in advanced economies––as well as the war in Ukraine. The decline in growth in 2023 from 2022 is driven by advanced economies; in emerging market and developing economies, growth is estimated to have bottomed out in 2022. Growth is expected to pick up in China with the full reopening in 2023. The expected pickup in 2024 in both groups of economies reflects gradual recovery from the effects of the war in Ukraine and subsiding inflation. Following the path of global demand, world trade growth is expected to decline in 2023 to 2.4 percent, despite an easing of supply bottlenecks, before rising to 3.4 percent in 2024." + "text": "Inflation Peaking" }, { "type": "NarrativeText", - "element_id": "e08dfaba8a8dc7496a44cb172319d4ba", + "element_id": "d24af8f44bd419665bb4ab6efef34fed", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -913,13 +931,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "These forecasts are based on a number of assumptions, including on fuel and nonfuel commodity prices, which have generally been revised down since October, and on interest rates, which have been revised up. In 2023, oil prices are projected to fall by about 16 percent, while nonfuel commodity prices are expected to fall by, on average, 6.3 percent. 
Global interest rate assumptions are revised up, reflecting intensified actual and signaled policy tightening by major central banks since October." + "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024––above pre-pandemic (2017–19) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies." }, { "type": "NarrativeText", - "element_id": "73a39336fb540e7d57ec85dfa8e92799", + "element_id": "72d289ea524eebcd8f195a8afda1c223", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -931,13 +949,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "For advanced economies, growth is projected to decline sharply from 2.7 percent in 2022 to 1.2 percent in 2023 before rising to 1.4 percent in 2024, with a downward revision of 0.2 percentage point for 2024. About 90 percent of advanced economies are projected to see a decline in growth in 2023." + "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024––above target in several cases. 
In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017–19) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024––still high, but close to the pre-pandemic average." }, { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "type": "Title", + "element_id": "11ebd9f4c9a7cdbac41f8f7399d3950e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -949,13 +967,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "" + "text": "Risks to the Outlook" }, { "type": "NarrativeText", - "element_id": "e84075ae46df9d9ad37d947011c05a7f", + "element_id": "818b1bd0fa9714f9ce4623897ba422a8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -967,13 +985,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "In the United States, growth is projected to fall from 2.0 percent in 2022 to 1.4 percent in 2023 and 1.0 percent in 2024. With growth rebounding in the second half of 2024, growth in 2024 will be faster than in 2023 on a fourth-quarter-over-fourth-quarter basis, as in most advanced" + "text": "The balance of risks to the global outlook remains tilted to the downside, with scope for lower growth and higher inflation, but adverse risks have moderated since the October 2022 World Economic Outlook." 
}, { - "type": "Title", - "element_id": "b3080428cb4e8896623bf36c001e868a", + "type": "ListItem", + "element_id": "30c61ae1849c6b38dd09c21d3d4f5951", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -985,13 +1003,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "International Monetary Fund | January 2023" + "text": "International Monetary Fund | January 2023. 5" }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "Title", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1003,13 +1021,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 7 }, - "text": "3" + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { "type": "Title", - "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "element_id": "1ad611b76683e54171ae0b1fddd827ca", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1021,13 +1039,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + "text": "Table 1. Overview of the World Economic Outlook Projections (Percent change, unless noted otherwise)" }, { - "type": "ListItem", - "element_id": "becf96ae2fa1045c14996c3de7a05bb8", + "type": "Title", + "element_id": "d11a1c04bd3a9891350b4bd94104df58", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1039,13 +1057,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "economies. 
There is a 0.4 percentage point upward revision for annual growth in 2023, reflecting carryover effects from domestic demand resilience in 2022, but a 0.2 percentage point downward revision of growth in 2024 due to the steeper path of Federal Reserve rate hikes, to a peak of about 5.1 percent in 2023. Growth in the ero area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers. Growth in the United Kingdom is projected to be —0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets. Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." 
+ "text": "Year over Year" }, { - "type": "NarrativeText", - "element_id": "497b28af5c258708a114b8a6766662ce", + "type": "Title", + "element_id": "aa22eb2e58c7cf45c528550d68e15c51", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1057,13 +1075,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "For emerging market and developing economies, growth is projected to rise modestly, from 3.9 percent in 2022 to 4.0 percent in 2023 and 4.2 percent in 2024, with an upward revision of 0.3 percentage point for 2023 and a downward revision of 0.1 percentage point for 2024. About half of emerging market and developing economies have lower growth in 2023 than in 2022." + "text": "Difference from October 2022" }, { - "type": "ListItem", - "element_id": "2ba41350ae3c684802f0e2b785c2d11b", + "type": "Title", + "element_id": "8c327a62ae0e925498f5c68b819b32b4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1075,13 +1093,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2" + "text": "Q4 over Q4 2/" }, { - "type": "ListItem", - "element_id": "bba948699d4f21aaf5001520bb796e17", + "type": "Title", + "element_id": "fcadc00fe663ee0e7818b0ffc5c46948", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1093,13 +1111,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2 percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent 
attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to tise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024. Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Rwssia in 2022 (estimated at —2.2 percent compared with a predicted —3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. 
The forecast revision reflects upgtades of 0.2 percentage point for Brazi/ and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" + "text": "World Output" }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "0c76bc4e35219e2a31b09428cd47d009", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1111,13 +1129,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "4" + "text": "World Trade Volume (goods and services) 6/ Advanced Economies Emerging Market and Developing Economies" }, { - "type": "Title", - "element_id": "b3080428cb4e8896623bf36c001e868a", + "type": "NarrativeText", + "element_id": "3c0578f4d944258ffa4ffac7615f1ff9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1129,13 +1147,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "International Monetary Fund | January 2023" + "text": "Commodity Prices Oil 7/ Nonfuel (average based on world commodity import weights)" }, { "type": "Title", - "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "element_id": "b2800ff802361713acee893ebae272f6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1147,13 +1165,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + "text": "Saudi Arabia Sub-Saharan Africa" }, { - "type": "ListItem", - "element_id": "e0fc62fcfa1add3cf912fbaf3e0c9ba1", + "type": "Title", + "element_id": "6185fd66a4e106814e65c047c15dfb1f", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1165,13 +1183,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": " Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." 
+ "text": "Advanced Economies United States Euro Area" }, { "type": "Title", - "element_id": "3dfc45d3333ae253d78008c8cde2d752", + "element_id": "24af2841400373443d80b6c91180918b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1183,13 +1201,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "Inflation Peaking" + "text": "Middle East and Central Asia" }, { - "type": "NarrativeText", - "element_id": "d24af8f44bd419665bb4ab6efef34fed", + "type": "Title", + "element_id": "7559320d044a32fbb21a7a8da25e9045", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1201,13 +1219,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "About 84 percent of countries are expected to have lower headline (consumer price index) inflation in 2023 than in 2022. Global inflation is set to fall from 8.8 percent in 2022 (annual average) to 6.6 percent in 2023 and 4.3 percent in 2024––above pre-pandemic (2017–19) levels of about 3.5 percent. The projected disinflation partly reflects declining international fuel and nonfuel commodity prices due to weaker global demand. It also reflects the cooling effects of monetary policy tightening on underlying (core) inflation, which globally is expected to decline from 6.9 percent in the fourth quarter of 2022 (year over year) to 4.5 percent by the fourth quarter of 2023. Still, disinflation will take time: by 2024, projected annual average headline and core inflation will, respectively, still be above pre-pandemic levels in 82 percent and 86 percent of economies." 
+ "text": "Japan United Kingdom Canada Other Advanced Economies 3/" }, { - "type": "NarrativeText", - "element_id": "72d289ea524eebcd8f195a8afda1c223", + "type": "Title", + "element_id": "ad1094978303f5aa32665083ee1ed934", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1219,13 +1237,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "In advanced economies, annual average inflation is projected to decline from 7.3 percent in 2022 to 4.6 percent in 2023 and 2.6 percent in 2024––above target in several cases. In emerging market and developing economies, projected annual inflation declines from 9.9 percent in 2022 to 8.1 percent in 2023 and 5.5 percent in 2024, above the 4.9 percent pre-pandemic (2017–19) average. In low-income developing countries, inflation is projected to moderate from 14.2 percent in 2022 to 8.6 percent in 2024––still high, but close to the pre-pandemic average." 
+ "text": "Latin America and the Caribbean" }, { "type": "Title", - "element_id": "11ebd9f4c9a7cdbac41f8f7399d3950e", + "element_id": "8325885b8155742cebc672e0d7072a7d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1237,13 +1255,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "Risks to the Outlook" + "text": "Emerging and Developing Europe" }, { - "type": "NarrativeText", - "element_id": "818b1bd0fa9714f9ce4623897ba422a8", + "type": "Title", + "element_id": "a4ca51cd6c74adf51f6e9ce60165d047", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1255,13 +1273,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "The balance of risks to the global outlook remains tilted to the downside, with scope for lower growth and higher inflation, but adverse risks have moderated since the October 2022 World Economic Outlook." + "text": "Emerging Market and Developing Economies Emerging and Developing Asia" }, { - "type": "Title", - "element_id": "8ae18586f23aa212e66aeb12a5638609", + "type": "UncategorizedText", + "element_id": "9e5246f529e197f84af65bbcd8e0d2a4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1273,13 +1291,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "International Monetary Fund | January 2023." 
+ "text": "Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "Title", + "element_id": "e30a554d7d1cbf308651f8c267ad6872", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1291,13 +1309,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 7 }, - "text": "5" + "text": "Brazil Mexico" }, { "type": "Title", - "element_id": "95af4f3feb2d03b2310ce31abc0c435d", + "element_id": "33a3d8ed92b0709ba525369922e51387", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1311,11 +1329,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" + "text": "Russia" }, { - "type": "NarrativeText", - "element_id": "1ad611b76683e54171ae0b1fddd827ca", + "type": "Title", + "element_id": "d5d29f012a1237803ee7e623a134117a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1329,11 +1347,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Table 1. 
Overview of the World Economic Outlook Projections (Percent change, unless noted otherwise)" + "text": "China India 4/" }, { "type": "Title", - "element_id": "d11a1c04bd3a9891350b4bd94104df58", + "element_id": "18231df9f753f2eca887585247231761", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1347,11 +1365,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Year over Year" + "text": "Germany France Italy Spain" }, { "type": "Title", - "element_id": "aa22eb2e58c7cf45c528550d68e15c51", + "element_id": "05704f84f4326b5f53a04d62f7ad62fc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1365,11 +1383,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Difference from October 2022" + "text": "Nigeria South Africa" }, { - "type": "Title", - "element_id": "8c327a62ae0e925498f5c68b819b32b4", + "type": "Table", + "element_id": "af79981b9ad6dea2ab3fa92cb5954958", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1383,11 +1401,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Q4 over Q4 2/" + "text": "over Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 
43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2" }, { - "type": "Title", - "element_id": "fcadc00fe663ee0e7818b0ffc5c46948", + "type": "UncategorizedText", + "element_id": "1bea20e1df19b12013976de2b5e0e3d1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1401,11 +1419,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "World Output" + "text": "2021" }, { "type": "UncategorizedText", - "element_id": "6bb1e757e09d7fa3aba323a375abd047", + "element_id": "b432234c878eb484525dbb0c9be461fe", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1419,11 +1437,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "World Consumer 
Prices 8/ Advanced Economies 9/ Emerging Market and Developing Economies 8/" + "text": "65.8 26.4" }, { "type": "UncategorizedText", - "element_id": "0c76bc4e35219e2a31b09428cd47d009", + "element_id": "e4fe15854d6650b5b102d8b1c11eb0ba", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1437,11 +1455,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "World Trade Volume (goods and services) 6/ Advanced Economies Emerging Market and Developing Economies" + "text": "10.4 9.4 12.1" }, { - "type": "NarrativeText", - "element_id": "3c0578f4d944258ffa4ffac7615f1ff9", + "type": "UncategorizedText", + "element_id": "2a9680555d457b6da4b6748492bb6f3d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1455,11 +1473,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Commodity Prices Oil 7/ Nonfuel (average based on world commodity import weights)" + "text": "5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3" }, { - "type": "Title", - "element_id": "b2800ff802361713acee893ebae272f6", + "type": "UncategorizedText", + "element_id": "a7143daa9de8af6e0c465ca1354d45b6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1473,11 +1491,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Saudi Arabia Sub-Saharan Africa" + "text": "6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9" }, { - "type": "Title", - "element_id": "6185fd66a4e106814e65c047c15dfb1f", + "type": "UncategorizedText", + "element_id": "dbc6d298b0672b8176de90a623844b7f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1491,11 +1509,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Advanced Economies United States Euro Area" + "text": "6.0 5.5 3.8 4.1 7.0 4.1" }, { - "type": "Title", - 
"element_id": "24af2841400373443d80b6c91180918b", + "type": "UncategorizedText", + "element_id": "69dfc187e2e6d907a0546f7e76f8ee3f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1509,11 +1527,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Middle East and Central Asia" + "text": "6.2" }, { "type": "Title", - "element_id": "7559320d044a32fbb21a7a8da25e9045", + "element_id": "b88d850d87e55cb1fd14ae67e5644d57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1527,11 +1545,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Japan United Kingdom Canada Other Advanced Economies 3/" + "text": "Estimate 2022" }, { - "type": "Title", - "element_id": "8325885b8155742cebc672e0d7072a7d", + "type": "UncategorizedText", + "element_id": "53bcbc5ff007dd49a07f6fb79ef96ef9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1545,11 +1563,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Emerging and Developing Europe" + "text": "3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6" }, { - "type": "Title", - "element_id": "ad1094978303f5aa32665083ee1ed934", + "type": "UncategorizedText", + "element_id": "1baf3bebf4d4c9418858185bd491eb8f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1563,11 +1581,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Latin America and the Caribbean" + "text": "39.8 7.0" }, { "type": "UncategorizedText", - "element_id": "9e5246f529e197f84af65bbcd8e0d2a4", + "element_id": "743f3bc42f087068035515a8dec4f85a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1581,11 +1599,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": 
"Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries" + "text": "3.1 3.7 5.2 5.4 3.8 4.9" }, { - "type": "Title", - "element_id": "a4ca51cd6c74adf51f6e9ce60165d047", + "type": "UncategorizedText", + "element_id": "72d73db944cf6d9a5f11d6c073c1dce0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1599,11 +1617,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Emerging Market and Developing Economies Emerging and Developing Asia" + "text": "3.4" }, { - "type": "Title", - "element_id": "33a3d8ed92b0709ba525369922e51387", + "type": "UncategorizedText", + "element_id": "e352203d837b1096ee96e1977f1c3d0b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1617,11 +1635,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Russia" + "text": "5.4 6.6 3.4" }, { - "type": "Title", - "element_id": "e30a554d7d1cbf308651f8c267ad6872", + "type": "UncategorizedText", + "element_id": "6976f35f9f91b539b46743f37d94014a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1635,11 +1653,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Brazil Mexico" + "text": "2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8" }, { - "type": "Title", - "element_id": "d5d29f012a1237803ee7e623a134117a", + "type": "UncategorizedText", + "element_id": "7268a41308c4276447de2a707b5df73c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1653,11 +1671,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "China India 4/" + "text": "–16.2 –6.3" }, { "type": "Title", - "element_id": "18231df9f753f2eca887585247231761", + "element_id": 
"18665f77847d326417463628d8860261", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1671,11 +1689,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Germany France Italy Spain" + "text": "Projections 2023" }, { - "type": "Title", - "element_id": "05704f84f4326b5f53a04d62f7ad62fc", + "type": "UncategorizedText", + "element_id": "d8236eb6a9bab4f3d37735048ab5aeee", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1689,11 +1707,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Nigeria South Africa" + "text": "1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0" }, { - "type": "Table", - "element_id": "63bdc79def2500227001ac95d78727ab", + "type": "UncategorizedText", + "element_id": "1ea8f3c3db2cb6c75f21ebf26acc28a5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1707,11 +1725,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Difference from October 2022 Q4 over Q4 2/ Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 
69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices Oil 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2 World Consumer Prices 8/ 47 88 6.6 43 04 0.2 9.2 5.0 3.5 Advanced Economies 9/ 34 73 46 26 0.2 02 78 31 23 Emerging Market and Developing Economies 8/ 5.9 99 84 5.5 0.0 02 10.4 66 45," + "text": "4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2" }, { "type": "UncategorizedText", - "element_id": "1bea20e1df19b12013976de2b5e0e3d1", + "element_id": "f491e65f8d4b8dbec7621fcedaf1b7a4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1725,11 +1743,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2021" + "text": "2.9" }, { "type": "UncategorizedText", - "element_id": "e4fe15854d6650b5b102d8b1c11eb0ba", + "element_id": "96ccb4fe1ec705d9944d1c1ecf0938ab", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1743,11 
+1761,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "10.4 9.4 12.1" + "text": "2.4 0.7 4.3 3.2 4.0 4.9" }, { "type": "UncategorizedText", - "element_id": "b432234c878eb484525dbb0c9be461fe", + "element_id": "098d858ff74b2740723330ff6e43edf8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1761,11 +1779,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "65.8 26.4" + "text": "2.4 2.3 2.6" }, { "type": "UncategorizedText", - "element_id": "a7143daa9de8af6e0c465ca1354d45b6", + "element_id": "6557739a67283a8de383fc5c0997fbec", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1779,11 +1797,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9" + "text": "2024" }, { "type": "UncategorizedText", - "element_id": "69dfc187e2e6d907a0546f7e76f8ee3f", + "element_id": "cf39ab5ed0773cea3681c2ac35e6b706", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1797,11 +1815,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "6.2" + "text": "–7.1 –0.4" }, { "type": "UncategorizedText", - "element_id": "2a9680555d457b6da4b6748492bb6f3d", + "element_id": "7fdc64e781146808df57eac112860f9b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1815,11 +1833,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3" + "text": "3.4 2.7 4.6" }, { "type": "UncategorizedText", - "element_id": "dbc6d298b0672b8176de90a623844b7f", + "element_id": "9d1bc5abd6f3e9c4c6ccb572ae521387", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1833,11 +1851,11 @@ "filetype": "application/pdf", "page_number": 
7 }, - "text": "6.0 5.5 3.8 4.1 7.0 4.1" + "text": "4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3" }, { "type": "UncategorizedText", - "element_id": "9db439c530ed3425c0a68724de199942", + "element_id": "35efc6ded4e13f29a8d86e4f33294be0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1851,11 +1869,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "4.7 3.1 5.9" + "text": "3.1" }, { - "type": "Title", - "element_id": "b88d850d87e55cb1fd14ae67e5644d57", + "type": "UncategorizedText", + "element_id": "123157612cd26d61b4760a5ecd1f4bfc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1869,11 +1887,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Estimate 2022" + "text": "2.5 1.8 4.7 3.5 4.1 5.6" }, { "type": "UncategorizedText", - "element_id": "1baf3bebf4d4c9418858185bd491eb8f", + "element_id": "777e0063772d428bf1c04383b8ad058e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1887,11 +1905,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "39.8 7.0" + "text": "1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4" }, { - "type": "UncategorizedText", - "element_id": "53bcbc5ff007dd49a07f6fb79ef96ef9", + "type": "Title", + "element_id": "1968c7f7ac8a3b0483f733357bb50b16", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1905,11 +1923,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6" + "text": "WEO Projections 1/" }, { "type": "UncategorizedText", - "element_id": "b7948d6976e997e76e343161b4b5d864", + "element_id": "d398b29d3dbbb9bf201d4c7e1c19ff9d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1923,11 
+1941,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "8.8 7.3 9.9" + "text": "2023" }, { "type": "UncategorizedText", - "element_id": "72d73db944cf6d9a5f11d6c073c1dce0", + "element_id": "e06f96c6cf56b11e98615192247171fa", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1941,11 +1959,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.4" + "text": "0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1" }, { "type": "UncategorizedText", - "element_id": "e352203d837b1096ee96e1977f1c3d0b", + "element_id": "84bc47d0d0703878a250620230630525", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1959,11 +1977,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "5.4 6.6 3.4" + "text": "–3.3 –0.1" }, { "type": "UncategorizedText", - "element_id": "743f3bc42f087068035515a8dec4f85a", + "element_id": "d35a737537febb07f01925c873444cbc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1977,11 +1995,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.1 3.7 5.2 5.4 3.8 4.9" + "text": "–0.1 0.0 –0.3" }, { "type": "UncategorizedText", - "element_id": "6976f35f9f91b539b46743f37d94014a", + "element_id": "effb80722a72ecff482b7a0d4a027e78", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1995,29 +2013,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8" + "text": "0.3 0.0 –0.2 –0.4 0.4 0.0" }, { "type": "UncategorizedText", - "element_id": "7268a41308c4276447de2a707b5df73c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - 
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "–16.2 –6.3" - }, - { - "type": "Title", - "element_id": "18665f77847d326417463628d8860261", + "element_id": "f22875edf393e3502ad60c82e81c5933", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2031,11 +2031,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Projections 2023" + "text": "0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3" }, { "type": "UncategorizedText", - "element_id": "d8236eb6a9bab4f3d37735048ab5aeee", + "element_id": "44896b09365746b5f7167ee4d64988a3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2049,11 +2049,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0" + "text": "0.2" }, { "type": "UncategorizedText", - "element_id": "1ea8f3c3db2cb6c75f21ebf26acc28a5", + "element_id": "6557739a67283a8de383fc5c0997fbec", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2067,11 +2067,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2" + "text": "2024" }, { "type": "UncategorizedText", - "element_id": "098d858ff74b2740723330ff6e43edf8", + "element_id": "4d702c47ea48fa0dca98ce691995cc1b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2085,11 +2085,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.4 2.3 2.6" + "text": "–0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0" }, { "type": "UncategorizedText", - "element_id": "e7ac421147471fe341ae242e7544a44c", + "element_id": "037023840d334f9f357a6c3da2b058ff", 
"metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2103,11 +2103,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "6.6 4.6 8.1" + "text": "–0.1 –0.3 –0.2 0.2 0.0 0.1" }, { "type": "UncategorizedText", - "element_id": "96ccb4fe1ec705d9944d1c1ecf0938ab", + "element_id": "4e6611d25d5013d40f58a6f82e3aecdf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2121,11 +2121,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.4 0.7 4.3 3.2 4.0 4.9" + "text": "–0.1" }, { "type": "UncategorizedText", - "element_id": "f491e65f8d4b8dbec7621fcedaf1b7a4", + "element_id": "2f6f72296f8ab115fda4292808436b88", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2139,11 +2139,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.9" + "text": "–0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2" }, { "type": "UncategorizedText", - "element_id": "6557739a67283a8de383fc5c0997fbec", + "element_id": "7ac5e2e700f401ccf7d2c4770d3afd44", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2157,11 +2157,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2024" + "text": "–0.3 –0.4 0.0" }, { "type": "UncategorizedText", - "element_id": "cf39ab5ed0773cea3681c2ac35e6b706", + "element_id": "ebb1568088af8b7c7b98878b895decaf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2175,11 +2175,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–7.1 –0.4" + "text": "–0.9 0.3" }, { - "type": "UncategorizedText", - "element_id": "4b48b0469ba9682a3e385ee7fbb6bbed", + "type": "Title", + "element_id": "b88d850d87e55cb1fd14ae67e5644d57", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2193,11 +2193,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "4.3 2.6 5.5" + "text": "Estimate 2022" }, { "type": "UncategorizedText", - "element_id": "777e0063772d428bf1c04383b8ad058e", + "element_id": "3d5c2c97e00e0c5be2a870cf1cbaac06", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2211,11 +2211,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4" + "text": "11.2 –2.0" }, { "type": "UncategorizedText", - "element_id": "35efc6ded4e13f29a8d86e4f33294be0", + "element_id": "d7b26ee43ca5481505ca9eb7c3b29b2c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2229,11 +2229,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.1" + "text": "2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0" }, { "type": "UncategorizedText", - "element_id": "123157612cd26d61b4760a5ecd1f4bfc", + "element_id": "4d5d14d8c932363fe84036564c6c582b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2247,11 +2247,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.5 1.8 4.7 3.5 4.1 5.6" + "text": "1.7 1.8 3.7 . . . 2.5 . . ." 
}, { "type": "UncategorizedText", - "element_id": "7fdc64e781146808df57eac112860f9b", + "element_id": "eae9d4d60a1fe2df23f7b65ae3d76ca8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2265,29 +2265,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "3.4 2.7 4.6" + "text": "1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4" }, { "type": "UncategorizedText", - "element_id": "9d1bc5abd6f3e9c4c6ccb572ae521387", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3" - }, - { - "type": "Title", - "element_id": "1968c7f7ac8a3b0483f733357bb50b16", + "element_id": "708c57a76a5cf81dc197cc1bd612adb2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2301,11 +2283,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "WEO Projections 1/" + "text": ". . . . . . . . ." 
}, { "type": "UncategorizedText", - "element_id": "d398b29d3dbbb9bf201d4c7e1c19ff9d", + "element_id": "eca06fdd26e513a7b8510c8660228504", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2319,11 +2301,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2023" + "text": "1.9" }, { - "type": "UncategorizedText", - "element_id": "84bc47d0d0703878a250620230630525", + "type": "Title", + "element_id": "18665f77847d326417463628d8860261", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2337,11 +2319,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–3.3 –0.1" + "text": "Projections 2023" }, { "type": "UncategorizedText", - "element_id": "effb80722a72ecff482b7a0d4a027e78", + "element_id": "1a009e8c6bb6dada03c326655a15bedf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2355,11 +2337,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "0.3 0.0 –0.2 –0.4 0.4 0.0" + "text": "1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1" }, { "type": "UncategorizedText", - "element_id": "e06f96c6cf56b11e98615192247171fa", + "element_id": "4150b86a3fffd48fc159e81c9b7325db", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2373,11 +2355,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1" + "text": "–9.8 1.4" }, { "type": "UncategorizedText", - "element_id": "d35a737537febb07f01925c873444cbc", + "element_id": "f4e79a2ba19a5b842cff288f8e4eafd0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2391,11 +2373,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.1 0.0 –0.3" + "text": "5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 
2.7 . . . 3.1 0.5" }, { "type": "UncategorizedText", - "element_id": "f22875edf393e3502ad60c82e81c5933", + "element_id": "3135d2d71bff77be4838a7102bbac5b8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2409,11 +2391,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3" + "text": "3.2" }, { "type": "UncategorizedText", - "element_id": "44896b09365746b5f7167ee4d64988a3", + "element_id": "98e45a005510dc136e14094ee7ed7faf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2427,11 +2409,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "0.2" + "text": "2.5 1.2 5.7 . . . 5.0 . . ." }, { "type": "UncategorizedText", - "element_id": "5277334fd8abe869f6a8de2e43942c9d", + "element_id": "708c57a76a5cf81dc197cc1bd612adb2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2445,7 +2427,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "0.1 0.2 0.0" + "text": ". . . . . . . . ." 
}, { "type": "UncategorizedText", @@ -2467,43 +2449,7 @@ }, { "type": "UncategorizedText", - "element_id": "037023840d334f9f357a6c3da2b058ff", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "–0.1 –0.3 –0.2 0.2 0.0 0.1" - }, - { - "type": "UncategorizedText", - "element_id": "7ac5e2e700f401ccf7d2c4770d3afd44", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "–0.3 –0.4 0.0" - }, - { - "type": "UncategorizedText", - "element_id": "ebb1568088af8b7c7b98878b895decaf", + "element_id": "301b9fd38725258f32816ff1a855be3e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2517,11 +2463,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.9 0.3" + "text": "–5.9 –0.2" }, { "type": "UncategorizedText", - "element_id": "4e6611d25d5013d40f58a6f82e3aecdf", + "element_id": "39b99440eae2f9ee75cf98100c285787", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2535,11 +2481,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.1" + "text": "2.5 2.0 4.0 . . . 4.1 . . ." 
}, { "type": "UncategorizedText", - "element_id": "4d702c47ea48fa0dca98ce691995cc1b", + "element_id": "a416ea84421fa7e1351582da48235bac", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2553,11 +2499,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0" + "text": "3.0" }, { "type": "UncategorizedText", - "element_id": "2f6f72296f8ab115fda4292808436b88", + "element_id": "07adb8acdd66b5d2490e542ae0604b71", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2571,29 +2517,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "–0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2" + "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8" }, { "type": "UncategorizedText", - "element_id": "44f0ab7953bb0b3696b9fa3cf0682f35", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "0.2 0.2 0.2" - }, - { - "type": "Title", - "element_id": "b88d850d87e55cb1fd14ae67e5644d57", + "element_id": "708c57a76a5cf81dc197cc1bd612adb2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2607,11 +2535,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Estimate 2022" + "text": ". . . . . . . . ." 
}, { "type": "UncategorizedText", - "element_id": "08e781dd2b6499b1ac8105a47f3520cc", + "element_id": "1776cf91dccdf2cce268fcee416b28f6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2625,11 +2553,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "9.2 7.8 10.4" + "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2" }, { "type": "UncategorizedText", - "element_id": "3d5c2c97e00e0c5be2a870cf1cbaac06", + "element_id": "6bb1e757e09d7fa3aba323a375abd047", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2643,11 +2571,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "11.2 –2.0" + "text": "World Consumer Prices 8/ Advanced Economies 9/ Emerging Market and Developing Economies 8/" }, { "type": "UncategorizedText", - "element_id": "d7b26ee43ca5481505ca9eb7c3b29b2c", + "element_id": "9db439c530ed3425c0a68724de199942", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2661,929 +2589,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 
2.6 3.0" - }, - { - "type": "UncategorizedText", - "element_id": "eca06fdd26e513a7b8510c8660228504", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "1.9" - }, - { - "type": "UncategorizedText", - "element_id": "4d5d14d8c932363fe84036564c6c582b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "1.7 1.8 3.7 . . . 2.5 . . ." - }, - { - "type": "UncategorizedText", - "element_id": "708c57a76a5cf81dc197cc1bd612adb2", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": ". . . . . . . . ." 
- }, - { - "type": "UncategorizedText", - "element_id": "eae9d4d60a1fe2df23f7b65ae3d76ca8", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4" - }, - { - "type": "Title", - "element_id": "18665f77847d326417463628d8860261", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Projections 2023" - }, - { - "type": "UncategorizedText", - "element_id": "4150b86a3fffd48fc159e81c9b7325db", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "–9.8 1.4" - }, - { - "type": "UncategorizedText", - "element_id": "1a009e8c6bb6dada03c326655a15bedf", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": 
"utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1" - }, - { - "type": "UncategorizedText", - "element_id": "e586cf66e92b356a4611ee2ffdf85a16", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "5.0 3.1 6.6" - }, - { - "type": "UncategorizedText", - "element_id": "98e45a005510dc136e14094ee7ed7faf", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "2.5 1.2 5.7 . . . 5.0 . . ." - }, - { - "type": "UncategorizedText", - "element_id": "708c57a76a5cf81dc197cc1bd612adb2", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": ". . . . . . . . ." 
- }, - { - "type": "UncategorizedText", - "element_id": "3135d2d71bff77be4838a7102bbac5b8", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "3.2" - }, - { - "type": "UncategorizedText", - "element_id": "f4e79a2ba19a5b842cff288f8e4eafd0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 
3.1 0.5" - }, - { - "type": "UncategorizedText", - "element_id": "6557739a67283a8de383fc5c0997fbec", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "2024" - }, - { - "type": "UncategorizedText", - "element_id": "301b9fd38725258f32816ff1a855be3e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "–5.9 –0.2" - }, - { - "type": "UncategorizedText", - "element_id": "39b99440eae2f9ee75cf98100c285787", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "2.5 2.0 4.0 . . . 4.1 . . ." 
- }, - { - "type": "UncategorizedText", - "element_id": "708c57a76a5cf81dc197cc1bd612adb2", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": ". . . . . . . . ." - }, - { - "type": "UncategorizedText", - "element_id": "41d85a7cc007a9c34136a786d6e61c15", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "3.5 2.3 4.5" - }, - { - "type": "UncategorizedText", - "element_id": "1776cf91dccdf2cce268fcee416b28f6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2" - }, - { - "type": "UncategorizedText", - "element_id": "07adb8acdd66b5d2490e542ae0604b71", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": 
"utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8" - }, - { - "type": "UncategorizedText", - "element_id": "a416ea84421fa7e1351582da48235bac", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "3.0" - }, - { - "type": "NarrativeText", - "element_id": "dd295fca8aff81058c48312a022b69b2", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 
2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024." - }, - { - "type": "NarrativeText", - "element_id": "df59a495ef85c5f70c5ba5356caf764a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Upside risks—Plausible upside risks include more favorable surprises to domestic spending—as in the third quarter of 2022—which, however, would increase inflation further. 
At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:" - }, - { - "type": "ListItem", - "element_id": "cf20f95904c591b6ac4ccd5d43fa8a98", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Pent-up demand boost: Fueled by the stock of excess private savings from the pandemic fiscal" - }, - { - "type": "ListItem", - "element_id": "000425958dcafe9c9a9c501237d8c4d3", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption—particularly of services, including tourism." 
- }, - { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "6" - }, - { - "type": "Title", - "element_id": "b3080428cb4e8896623bf36c001e868a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "International Monetary Fund | January 2023" - }, - { - "type": "Title", - "element_id": "95af4f3feb2d03b2310ce31abc0c435d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" - }, - { - "type": "ListItem", - "element_id": "79a6a9353dc2a500e2e50e720cf8ab7c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": 
"utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China. e Faster disinflation: An easing in labor market pressures in some advanced economies due to falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening." - }, - { - "type": "NarrativeText", - "element_id": "a2f806b25a06969405637298b4c85139", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Downside risks—Numerous downside risks continue to weigh on the global outlook, lowering growth while, in a number of cases, adding further to inflation:" - }, - { - "type": "ListItem", - "element_id": "e9fbac47e4ed0c2d153022a284a77919", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "© = China’s recovery stalling: Amid still-low population 
immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems. e = =War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing ptice spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase. e Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. 
e = Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy. e = Sudden financial market repricing: A prematute easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy. © Geopolitical fragmentation: The wat in Ukraine and the related international sanctions aimed at e pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute." - }, - { - "type": "Title", - "element_id": "8ae18586f23aa212e66aeb12a5638609", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "International Monetary Fund | January 2023." 
- }, - { - "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "7" - }, - { - "type": "Title", - "element_id": "95af4f3feb2d03b2310ce31abc0c435d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" - }, - { - "type": "NarrativeText", - "element_id": "6684fee3e3cd949ec59e7444a0c3fd0c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "Fragmentation could intensify—with more restrictions on cross-border movements of capital, workers, and international payments—and could hamper multilateral cooperation on providing global public goods.1 The costs of such fragmentation are especially high in the short term, as replacing disrupted cross-border flows takes time." 
- }, - { - "type": "Title", - "element_id": "a81cc4e3ca23fd16254e2b858cdcb00a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "Policy Priorities" - }, - { - "type": "NarrativeText", - "element_id": "1c464362698203e7245bdaf33c388a80", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "Securing global disinflation: For most economies, the priority remains achieving a sustained reduction in inflation toward target levels. Raising real policy rates and keeping them above their neutral levels until underlying inflation is clearly declining would ward off risks of inflation expectations de- anchoring. Clear central bank communication and appropriate reactions to shifts in the data will help keep inflation expectations anchored and lessen wage and price pressures. Central banks’ balance sheets will need to be unwound carefully, amid market liquidity risks. Gradual and steady fiscal tightening would contribute to cooling demand and limit the burden on monetary policy in the fight against inflation. In countries where output remains below potential and inflation is in check, maintaining monetary and fiscal accommodation may be appropriate." 
- }, - { - "type": "NarrativeText", - "element_id": "d6138134f71f953a9da2083154e2629e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "Containing the reemergence of COVID-19: Addressing the ongoing pandemic requires coordinated efforts to boost vaccination and medicine access in countries where coverage remains low as well as the deployment of pandemic preparedness measures—including a global push toward sequencing and sharing data. In China, focusing vaccination efforts on vulnerable groups and maintaining sufficiently high coverage of boosters and antiviral medicines would minimize the risks of severe health outcomes and safeguard the recovery, with favorable cross-border spillovers." - }, - { - "type": "NarrativeText", - "element_id": "2457fbbf5aa862b5a8b45d070f9114cb", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "Ensuring financial stability: Depending on country circumstances, macroprudential tools can be used to tackle pockets of elevated financial sector vulnerabilities. Monitoring housing sector developments and conducting stress tests in economies where house prices have increased significantly over the past few years are warranted. 
In China, central government action to resolve the property crisis and reduce the risk of spillovers to financial stability and growth is a priority, including by strengthening temporary mechanisms to protect presale homebuyers from the risk of non-delivery and by restructuring troubled developers. Globally, financial sector regulations introduced after the global financial crisis have contributed to the resilience of banking sectors throughout the pandemic, but there is a need to address data and supervisory gaps in the less-regulated nonbank financial sector, where risks may have built up inconspicuously. Recent turmoil in the crypto space also highlights the urgent need to introduce common standards and reinforce oversight of crypto assets." - }, - { - "type": "NarrativeText", - "element_id": "bcef6ce9e3d4c015db21955dc4f6ce42", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "Restoring debt sustainability: Lower growth and higher borrowing costs have raised public debt ratios in several economies. Where debt is unsustainable, implementing restructuring or reprofiling early on as part of a package of reforms (including fiscal consolidation and growth-enhancing supply-side reforms) can avert the need for more disruptive adjustment later." 
- }, - { - "type": "NarrativeText", - "element_id": "defb87cb8f10236768732a1e5fe9519f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "Supporting the vulnerable: The surge in global energy and food prices triggered a cost-of-living crisis. Governments acted swiftly with support to households and firms, which helped cushion effects on growth and at times limited the pass-through from energy prices to headline inflation through price" - }, - { - "type": "UncategorizedText", - "element_id": "40430ee7d1dc6b176a60b88df18a66c9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "1 See “Geo-Economic Fragmentation and the Future of Multilateralism,” IMF Staff Discussion Note 2023/001." 
- }, - { - "type": "UncategorizedText", - "element_id": "2c624232cdd221771294dfbb310aca00", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "8" - }, - { - "type": "Title", - "element_id": "b3080428cb4e8896623bf36c001e868a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 9 - }, - "text": "International Monetary Fund | January 2023" - }, - { - "type": "Title", - "element_id": "95af4f3feb2d03b2310ce31abc0c435d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" - }, - { - "type": "NarrativeText", - "element_id": "2e9a0eaddd75095d1bbb4fda6f2c4feb", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": 
"utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "controls. The temporary and broad-based measures are becoming increasingly costly and should be withdrawn and replaced by targeted approaches. Preserving the energy price signal will encourage a reduction in energy consumption and limit the risks of shortages. Targeting can be achieved through social safety nets such as cash transfers to eligible households based on income or demographics or by transfers through electricity companies based on past energy consumption. Subsidies should be temporary and offset by revenue-generating measures, including one-time solidarity taxes on high- income households and companies, where appropriate." - }, - { - "type": "NarrativeText", - "element_id": "da0ef04b13917f67583290e9ba57e375", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Reinforcing supply: Supply-side policies could address the key structural factors impeding growth— including market power, rent seeking, rigid regulation and planning, and inefficient education—and could help build resilience, reduce bottlenecks, and alleviate price pressures. A concerted push for investment along the supply chain of green energy technologies would bolster energy security and help advance progress on the green transition." 
- }, - { - "type": "NarrativeText", - "element_id": "c64f29a38dae74989484539db014364f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Strengthening multilateral cooperation—Urgent action is needed to limit the risks stemming from geopolitical fragmentation and to ensure cooperation on fundamental areas of common interest:" - }, - { - "type": "ListItem", - "element_id": "8dbc8ad2da37799a3719a01d44d2e506", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "e = Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. e = Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. 
Non— Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes. e — Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system. e Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks. e Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." 
- }, - { - "type": "Title", - "element_id": "b3080428cb4e8896623bf36c001e868a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "International Monetary Fund | January 2023" - }, - { - "type": "UncategorizedText", - "element_id": "19581e27de7ced00ff1ce50b2047e7a5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "9" - }, - { - "type": "Image", - "element_id": "0e1f5e74082ed333d383fa20680f0909", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 11 - }, - "text": "BOX 1. 
GLOBAL FINANCIAL STABILITY UPDATE" + "text": "4.7 3.1 5.9" }, { - "type": "NarrativeText", - "element_id": "8b350f34fe437a1447f2722c30d1e418", + "type": "UncategorizedText", + "element_id": "b7948d6976e997e76e343161b4b5d864", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3595,13 +2605,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "—— — other" + "text": "8.8 7.3 9.9" }, { - "type": "NarrativeText", - "element_id": "a2fa3a13e51ab7dd0859ee2c869b70e5", + "type": "UncategorizedText", + "element_id": "e7ac421147471fe341ae242e7544a44c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3613,13 +2623,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies." 
+ "text": "6.6 4.6 8.1" }, { "type": "UncategorizedText", - "element_id": "a43f5d32a34c9b54fe96097c3d491389", + "element_id": "4b48b0469ba9682a3e385ee7fbb6bbed", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3631,13 +2641,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "–3" + "text": "4.3 2.6 5.5" }, { "type": "UncategorizedText", - "element_id": "28a5aa3897d66de6c31caba99a4c337e", + "element_id": "5277334fd8abe869f6a8de2e43942c9d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3649,13 +2659,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "–2" + "text": "0.1 0.2 0.0" }, { "type": "UncategorizedText", - "element_id": "467792e5d9b6bec26f556875e9ccab10", + "element_id": "44f0ab7953bb0b3696b9fa3cf0682f35", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3667,13 +2677,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "–1" + "text": "0.2 0.2 0.2" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "08e781dd2b6499b1ac8105a47f3520cc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3685,13 +2695,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "1" + "text": "9.2 7.8 10.4" }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "e586cf66e92b356a4611ee2ffdf85a16", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ 
-3703,13 +2713,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "2" + "text": "5.0 3.1 6.6" }, { "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "41d85a7cc007a9c34136a786d6e61c15", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3721,13 +2731,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "3" + "text": "3.5 2.3 4.5" }, { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "type": "NarrativeText", + "element_id": "46c8e0c55b163d73d3d2766be8d1bf8d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3739,13 +2749,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "0" + "text": "Note: Real effective exchange rates are assumed to remain constant at the levels prevailing during October 26, 2022--November 23, 2022. Economies are listed on the basis of economic size. The aggregated quarterly data are seasonally adjusted. WEO = World Economic Outlook. 1/ Difference based on rounded figures for the current and October 2022 WEO forecasts. Countries whose forecasts have been updated relative to October 2022 WEO forecasts account for approximately 90 percent of world GDP measured at purchasing-power-parity weights. 2/ For World Output (Emerging Market and Developing Economies), the quarterly estimates and projections account for approximately 90 percent (80 percent) of annual world (emerging market and developing economies') output at purchasing-power-parity weights. 3/ Excludes the Group of Seven (Canada, France, Germany, Italy, Japan, United Kingdom, United States) and euro area countries. 
4/ For India, data and projections are presented on a fiscal year basis, with FY 2022/23 (starting in April 2022) shown in the 2022 column. India's growth projections are 5.4 percent in 2023 and 6.8 percent in 2024 based on calendar year. 5/ Indonesia, Malaysia, Philippines, Singapore, Thailand. 6/ Simple average of growth rates for export and import volumes (goods and services). 7/ Simple average of prices of UK Brent, Dubai Fateh, and West Texas Intermediate crude oil. The average assumed price of oil in US dollars a barrel, based on futures markets (as of November 29, 2022), is $81.13 in 2023 and $75.36 in 2024. 8/ Excludes Venezuela. 9/ The inflation rate for the euro area is 5.7% in 2023 and 3.3% in 2024, that for Japan is 2.8% in 2023 and 2.0% in 2024, and that for the United States is 4.0% in 2023 and 2.2% in 2024." }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "NarrativeText", + "element_id": "df59a495ef85c5f70c5ba5356caf764a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3757,13 +2767,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "5" + "text": "Upside risks—Plausible upside risks include more favorable surprises to domestic spending—as in the third quarter of 2022—which, however, would increase inflation further. 
At the same time, there is room for an upside scenario with lower-than-expected inflation and less monetary tightening:" }, { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "type": "ListItem", + "element_id": "000425958dcafe9c9a9c501237d8c4d3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3775,13 +2785,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "6" + "text": "support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption—particularly of services, including tourism." 
}, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "type": "ListItem", + "element_id": "f7d988c7d799cc7eec1527f363785a8c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3793,13 +2803,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 7 }, - "text": "4" + "text": "6 International Monetary Fund | January 2023" }, { - "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "type": "Title", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3811,13 +2821,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "7" + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { - "type": "UncategorizedText", - "element_id": "4108466a9a52ce87e39eb1836a42f6f2", + "type": "NarrativeText", + "element_id": "a6e6e147daf229e8267d85c3e49f7250", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3829,13 +2839,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "2006 08 08" + "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China." 
}, { - "type": "Title", - "element_id": "57de33ba9eaa9e5980d4cf6da83abf46", + "type": "ListItem", + "element_id": "2bbe57e6c291db638d3fcddca9e0199a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3847,13 +2857,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "Figure 1.1. Global Financial Conditions: Selected Regions (Standard deviations from mean)" + "text": "Faster disinflation: An easing in labor market pressures in some advanced economies due to" }, { - "type": "NarrativeText", - "element_id": "1ac9d411aa1266cb68aba2a8a9b70379", + "type": "ListItem", + "element_id": "668cd3ea4f48a2f080b7b764c04ab011", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3865,13 +2875,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report." + "text": "Faster disinflation: An easing in labor market pressures in some advanced economies due to falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening." 
}, { - "type": "UncategorizedText", - "element_id": "aacd834b5cdc64a329e27649143406dd", + "type": "NarrativeText", + "element_id": "ab2ac0c0c558600b645acb6349ccf2df", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3883,13 +2893,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "06" + "text": "Downside risks—Numerous downside risks continue to weigh on the global outlook, lowering growth while, in a number of cases, adding further to inflation:" }, { - "type": "UncategorizedText", - "element_id": "785329d8f1c63e8d0cdeedba9e6bc2ea", + "type": "ListItem", + "element_id": "1bbcee85386321e6e8235a64d4c34d73", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3901,13 +2911,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "10 10" + "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." }, { - "type": "UncategorizedText", - "element_id": "1e46bf7c5134da75e3a2aae852d7bddf", + "type": "ListItem", + "element_id": "4e2bc46d4988ddde43a4f295d1d458c2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3919,13 +2929,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "12 12" + "text": "vulnerability, particularly for Europe and lower-income countries. 
Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase." }, { - "type": "Title", - "element_id": "4255f2d53f6408c450b02b249d53c220", + "type": "ListItem", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3937,13 +2947,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "United States Euro area China Other AEs Other EMs" + "text": "" }, { - "type": "UncategorizedText", - "element_id": "c81a1234a265c680bbc9e96e73073acd", + "type": "ListItem", + "element_id": "2d14934d52ff357c52e9ae1c38f7390e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3955,13 +2965,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "14 16 14" + "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. 
The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." }, { - "type": "UncategorizedText", - "element_id": "b17ef6d19c7a5b1ee83b907c595526dc", + "type": "ListItem", + "element_id": "4ce40bcfac131ab024e535bf860f9495", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3973,13 +2983,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "16" + "text": " Sudden financial market repricing: A premature easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy." 
}, { - "type": "UncategorizedText", - "element_id": "99cb7a0185216a0acb0ed918e7058868", + "type": "ListItem", + "element_id": "75bd22ee0ba778cc3a616ed0a9b42292", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3991,13 +3001,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "18 18" + "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at  pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing" }, { - "type": "UncategorizedText", - "element_id": "0c5e98c11d7bb005adbaf731ebfbbb2c", + "type": "ListItem", + "element_id": "d1c38e022e1b399f4203ee41c6dc4e43", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4009,13 +3019,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "20 22 22" + "text": "pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute." }, { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "type": "ListItem", + "element_id": "7250b07d7951c2b7b39c79195f4e69e7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4027,13 +3037,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 8 }, - "text": "20" + "text": "International Monetary Fund | January 2023. 
7" }, { "type": "Title", - "element_id": "53d79cec96694df67ce3baff95d8a2e3", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4045,13 +3055,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "October 2022 GFSR" + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { "type": "NarrativeText", - "element_id": "e118be83abfed92b8969eca98bb4d53b", + "element_id": "6684fee3e3cd949ec59e7444a0c3fd0c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4063,13 +3073,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." 
+ "text": "Fragmentation could intensify—with more restrictions on cross-border movements of capital, workers, and international payments—and could hamper multilateral cooperation on providing global public goods.1 The costs of such fragmentation are especially high in the short term, as replacing disrupted cross-border flows takes time." }, { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "type": "Title", + "element_id": "a81cc4e3ca23fd16254e2b858cdcb00a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4081,13 +3091,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "6" + "text": "Policy Priorities" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "NarrativeText", + "element_id": "1c464362698203e7245bdaf33c388a80", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4099,13 +3109,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "5" + "text": "Securing global disinflation: For most economies, the priority remains achieving a sustained reduction in inflation toward target levels. Raising real policy rates and keeping them above their neutral levels until underlying inflation is clearly declining would ward off risks of inflation expectations de- anchoring. Clear central bank communication and appropriate reactions to shifts in the data will help keep inflation expectations anchored and lessen wage and price pressures. Central banks’ balance sheets will need to be unwound carefully, amid market liquidity risks. Gradual and steady fiscal tightening would contribute to cooling demand and limit the burden on monetary policy in the fight against inflation. 
In countries where output remains below potential and inflation is in check, maintaining monetary and fiscal accommodation may be appropriate." }, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "type": "NarrativeText", + "element_id": "d6138134f71f953a9da2083154e2629e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4117,13 +3127,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "4" + "text": "Containing the reemergence of COVID-19: Addressing the ongoing pandemic requires coordinated efforts to boost vaccination and medicine access in countries where coverage remains low as well as the deployment of pandemic preparedness measures—including a global push toward sequencing and sharing data. In China, focusing vaccination efforts on vulnerable groups and maintaining sufficiently high coverage of boosters and antiviral medicines would minimize the risks of severe health outcomes and safeguard the recovery, with favorable cross-border spillovers." }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "NarrativeText", + "element_id": "2457fbbf5aa862b5a8b45d070f9114cb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4135,13 +3145,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "3" + "text": "Ensuring financial stability: Depending on country circumstances, macroprudential tools can be used to tackle pockets of elevated financial sector vulnerabilities. Monitoring housing sector developments and conducting stress tests in economies where house prices have increased significantly over the past few years are warranted. 
In China, central government action to resolve the property crisis and reduce the risk of spillovers to financial stability and growth is a priority, including by strengthening temporary mechanisms to protect presale homebuyers from the risk of non-delivery and by restructuring troubled developers. Globally, financial sector regulations introduced after the global financial crisis have contributed to the resilience of banking sectors throughout the pandemic, but there is a need to address data and supervisory gaps in the less-regulated nonbank financial sector, where risks may have built up inconspicuously. Recent turmoil in the crypto space also highlights the urgent need to introduce common standards and reinforce oversight of crypto assets." }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "NarrativeText", + "element_id": "bcef6ce9e3d4c015db21955dc4f6ce42", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4153,13 +3163,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "2" + "text": "Restoring debt sustainability: Lower growth and higher borrowing costs have raised public debt ratios in several economies. Where debt is unsustainable, implementing restructuring or reprofiling early on as part of a package of reforms (including fiscal consolidation and growth-enhancing supply-side reforms) can avert the need for more disruptive adjustment later." 
}, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "NarrativeText", + "element_id": "defb87cb8f10236768732a1e5fe9519f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4171,13 +3181,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "1" + "text": "Supporting the vulnerable: The surge in global energy and food prices triggered a cost-of-living crisis. Governments acted swiftly with support to households and firms, which helped cushion effects on growth and at times limited the pass-through from energy prices to headline inflation through price" }, { - "type": "Title", - "element_id": "6ef230728534d871e5126e2a55e12b26", + "type": "NarrativeText", + "element_id": "bda037ffd6adfee8afa08544ca03a391", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4189,13 +3199,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" + "text": "1 See “Geo-Economic Fragmentation and the Future of Multilateralism,” IMF Staff Discussion Note 2023/001." 
}, { - "type": "UncategorizedText", - "element_id": "3e48114b7946f4dd7a12ae0b2c1121af", + "type": "Footer", + "element_id": "a9811a5a7bebc1f7a97bf6ca7ca5c890", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4207,13 +3217,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 9 }, - "text": "© ——" + "text": "8 International Monetary Fund | January 2023" }, { - "type": "ListItem", - "element_id": "7d4f55875c970d850a152ba1d5ba02a5", + "type": "Title", + "element_id": "95af4f3feb2d03b2310ce31abc0c435d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4225,13 +3235,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "1. United States" + "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { - "type": "Title", - "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", + "type": "NarrativeText", + "element_id": "2e9a0eaddd75095d1bbb4fda6f2c4feb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4243,13 +3253,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "Latest" + "text": "controls. The temporary and broad-based measures are becoming increasingly costly and should be withdrawn and replaced by targeted approaches. Preserving the energy price signal will encourage a reduction in energy consumption and limit the risks of shortages. Targeting can be achieved through social safety nets such as cash transfers to eligible households based on income or demographics or by transfers through electricity companies based on past energy consumption. 
Subsidies should be temporary and offset by revenue-generating measures, including one-time solidarity taxes on high- income households and companies, where appropriate." }, { - "type": "Title", - "element_id": "53d79cec96694df67ce3baff95d8a2e3", + "type": "NarrativeText", + "element_id": "da0ef04b13917f67583290e9ba57e375", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4261,13 +3271,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "October 2022 GFSR" + "text": "Reinforcing supply: Supply-side policies could address the key structural factors impeding growth— including market power, rent seeking, rigid regulation and planning, and inefficient education—and could help build resilience, reduce bottlenecks, and alleviate price pressures. A concerted push for investment along the supply chain of green energy technologies would bolster energy security and help advance progress on the green transition." }, { - "type": "ListItem", - "element_id": "8e655408cf212df5f74df13e05cdf02c", + "type": "NarrativeText", + "element_id": "9b451c78081780087a0e1e67cc0eaa1d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4279,13 +3289,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "2. 
Euro area" + "text": "Strengthening multilateral cooperation—Urgent action is needed to limit the risks stemming from geopolitical fragmentation and to ensure cooperation on fundamental areas of common interest:" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "ListItem", + "element_id": "bd2ec14b604696a7f47651e97a351d31", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4297,13 +3307,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "5" + "text": "e = Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential." }, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "type": "NarrativeText", + "element_id": "add6f9f296b6a99cf0ef86162b3c9cfc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4315,13 +3325,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "4" + "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. 
Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "ListItem", + "element_id": "e0ee0812ef9249e53d6425e299200f5c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4333,13 +3343,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "3" + "text": "e — Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "ListItem", + "element_id": "a5751b5964fbbc37b14db4811aeb37f4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4351,13 +3361,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "2" + "text": " Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks." 
}, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "ListItem", + "element_id": "0a4c2d76937c64308220b20382ea68c6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4369,13 +3379,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "1" + "text": "e Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "ListItem", + "element_id": "cbb9553ae9412cc864f9f254b47c3efc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4387,13 +3397,13 @@ "date_modified": "2023-02-14T07:31:28" }, "filetype": "application/pdf", - "page_number": 11 + "page_number": 10 }, - "text": "Oct. 22" + "text": "International Monetary Fund | January 2023 9" }, { - "type": "Title", - "element_id": "24a234895630131d612fc1b4605a256e", + "type": "Image", + "element_id": "0e1f5e74082ed333d383fa20680f0909", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4407,11 +3417,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 23" + "text": "BOX 1. 
GLOBAL FINANCIAL STABILITY UPDATE" }, { - "type": "Title", - "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", + "type": "NarrativeText", + "element_id": "a2fa3a13e51ab7dd0859ee2c869b70e5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4425,11 +3435,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 23" + "text": "Overall, financial stability risks remain elevated as investors reassess their inflation and monetary policy outlook. Global financial conditions have eased somewhat since the October 2022 Global Financial Stability Report, driven largely by changing market expectations regarding the interest rate cycle (Figure 1.1). While the expected peak in policy rates—the terminal rate—has risen, markets now also expect the subsequent fall in rates will be significantly faster, and further, than what was forecast in October (Figure 1.2). As a result, global bond yields have recently declined, corporate spreads have tightened, and equity markets have rebounded. That said, central banks are likely to continue to tighten monetary policy to fight inflation, and concerns that this restrictive stance could tip the economy into a recession have increased in major advanced economies." }, { - "type": "Title", - "element_id": "d8478f45b9790d52201238244d0e9698", + "type": "NarrativeText", + "element_id": "f79a09409db68af141e82d9ac113ded8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4443,11 +3453,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 24" + "text": "Figure 1.1. 
Global Financial Conditions: Selected Regions (Standard deviations from mean)" }, { - "type": "Title", - "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", + "type": "Image", + "element_id": "cdd008e3fd865bb8022a5facb083484d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4461,11 +3471,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 26" + "text": " 7 United States Qclober 6 Euro area 2022 : —— China GFSR — other AEs 4 other EMs 3 2 1 0 " }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "FigureCaption", + "element_id": "d78f392a386b26aa260548d71936abff", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4479,11 +3489,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "Sources: Bloomberg Finance L.P.; Haver Analytics; national data sources; and IMF staff calculations. Note: AEs = advanced economies; EMs = emerging markets. GFSR = Global Financial Stability Report." }, { - "type": "Title", - "element_id": "24a234895630131d612fc1b4605a256e", + "type": "NarrativeText", + "element_id": "e118be83abfed92b8969eca98bb4d53b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4497,11 +3507,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 23" + "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. 
Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." }, { "type": "Title", - "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", + "element_id": "6ef230728534d871e5126e2a55e12b26", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4515,11 +3525,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 23" + "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" }, { - "type": "Title", - "element_id": "d8478f45b9790d52201238244d0e9698", + "type": "Image", + "element_id": "9a335b9a7fd0ccd069211c60419252fc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4533,11 +3543,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 24" + "text": " Latest © —— October 2022 GFSR 6 1. United States 2. Euro area 5 1 1 Oct. Apr. Oct. Dec. Dec. Oct. Apr. Oct. Dec. Dec. 22 23 23 24 26 22 2B 2B 24 2 " }, { - "type": "Title", - "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", + "type": "NarrativeText", + "element_id": "da431b9817da923cc48a538c4b3b8ade", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4551,11 +3561,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 26" + "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. 
While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess their outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." }, { "type": "NarrativeText", - "element_id": "2dd1b91ebd6543b4902626a579552919", + "element_id": "d073e054fbe8931eb0e200b268710187", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4569,7 +3579,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess their outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." + "text": "Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report." }, { "type": "NarrativeText", @@ -4590,7 +3600,7 @@ "text": "Financial market volatility is expected to remain elevated and could be exacerbated by poor market liquidity. 
For some asset classes (such as US Treasuries), liquidity has deteriorated to the March 2020 lows of the COVID-19 pandemic. With the process of central bank balance sheet reduction (quantitative tightening) underway, market liquidity is expected to remain challenging." }, { - "type": "Title", + "type": "ListItem", "element_id": "bab943d841e99d44807adb96ef9ef925", "metadata": { "data_source": { @@ -4608,8 +3618,8 @@ "text": "10 — International Monetary Fund | January 2023" }, { - "type": "UncategorizedText", - "element_id": "09b3166aab28edac8872d46b3b34ab02", + "type": "NarrativeText", + "element_id": "06d12185958a014c0c9d6afeab7426c2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json index fc40f88495..095afab414 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -1,7 +1,7 @@ [ { "type": "Title", - "element_id": "57eef8242d3675c93268fde018dc9df3", + "element_id": "14547603bad3329c14c74b8c4e2ff8d9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -15,11 +15,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "WORLD NUCLEAR //s88ciation" + "text": "//s88ciation" }, { "type": "Title", - "element_id": "9f8388cf868cb29d273fdd7328642ff8", + "element_id": "80f1cd7f1c8e281093a32842b1e5bbce", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -33,11 +33,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "The Silent Giant" + "text": "WORLD NUCLEAR" }, { "type": "Title", - "element_id": 
"f439367da08e61523302e29f153007e0", + "element_id": "51174df4a3a78fe261885b1818b66876", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -51,11 +51,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "The need for nuclear in a clean energy system" + "text": "The Silent Giant" }, { - "type": "Title", - "element_id": "53d548aa01fc3eb72da15a5be7f235e2", + "type": "NarrativeText", + "element_id": "e2b1006b190b699d597fdb0f1d73f8f9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -67,13 +67,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 3 + "page_number": 1 }, - "text": "Executive Summary" + "text": "The need for nuclear in a clean energy system" }, { - "type": "NarrativeText", - "element_id": "1f4925fb064910ee923ccc1f6b20715b", + "type": "Title", + "element_id": "2fa985d0a50e61b09ec22c447cc4b2c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -87,11 +87,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "In a world centred on short-term fixes, many of the traits that make nuclear energy a key player in the transition to a sustainable world are not properly valued and often taken for granted. Reflecting on the popular discourse in the world of energy politics it would seem that renewables, and renewables alone, will be responsible for, and capable of, delivering a zero-carbon energy system – and that it is just a matter of time." 
+ "text": "Executive Summary" }, { "type": "NarrativeText", - "element_id": "46385c950e7da4d8e588686a541335c2", + "element_id": "1f4925fb064910ee923ccc1f6b20715b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -105,7 +105,7 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "The reality today is that both global carbon dioxide emissions and fossil fuel use are still on the rise. This does not only make the battle against climate change much harder, but also results in hundreds of thousands of pollution deaths every year." + "text": "In a world centred on short-term fixes, many of the traits that make nuclear energy a key player in the transition to a sustainable world are not properly valued and often taken for granted. Reflecting on the popular discourse in the world of energy politics it would seem that renewables, and renewables alone, will be responsible for, and capable of, delivering a zero-carbon energy system – and that it is just a matter of time." }, { "type": "NarrativeText", @@ -127,7 +127,7 @@ }, { "type": "NarrativeText", - "element_id": "ae77460bce2d3a52d823954ccb9c708f", + "element_id": "46385c950e7da4d8e588686a541335c2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -141,11 +141,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Energy is the essential agent for promoting human development, and global demand is projected to increase significantly in the coming decades. Securing access to modern and affordable energy is essential for lifting people out of poverty, and for promoting energy independence and economic growth." + "text": "The reality today is that both global carbon dioxide emissions and fossil fuel use are still on the rise. This does not only make the battle against climate change much harder, but also results in hundreds of thousands of pollution deaths every year." 
}, { "type": "NarrativeText", - "element_id": "c6d2fa859e6df9845dee4044d05ddbc5", + "element_id": "ae77460bce2d3a52d823954ccb9c708f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -159,11 +159,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Nuclear energy has shown – be it in France or Sweden – that it has the potential to be the catalyst for delivering sustainable energy transitions, long before climate change was on the agenda. The use of nuclear energy is the fast track to a high-powered and clean energy system, which not only delivers a healthier environment and an affordable supply of electricity, but also strengthens energy security and helps mitigate climate change." + "text": "Energy is the essential agent for promoting human development, and global demand is projected to increase significantly in the coming decades. Securing access to modern and affordable energy is essential for lifting people out of poverty, and for promoting energy independence and economic growth." }, { "type": "NarrativeText", - "element_id": "e055395659c9e1aa4d5c0afb188e4a9e", + "element_id": "c6d2fa859e6df9845dee4044d05ddbc5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -177,11 +177,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "The global nuclear industry, led by World Nuclear Association, is ready to take on the challenge. As part of the Harmony Programme, we have set a target to build an additional 1000GWe of reactors across the world before 2050, bringing the global share of electricity production of nuclear to 25%." + "text": "Nuclear energy has shown – be it in France or Sweden – that it has the potential to be the catalyst for delivering sustainable energy transitions, long before climate change was on the agenda. 
The use of nuclear energy is the fast track to a high-powered and clean energy system, which not only delivers a healthier environment and an affordable supply of electricity, but also strengthens energy security and helps mitigate climate change." }, { "type": "NarrativeText", - "element_id": "33a2aba13d6b228d8d6f792f16caa684", + "element_id": "e055395659c9e1aa4d5c0afb188e4a9e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -195,11 +195,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "In order to realise the full potential of nuclear energy we have identified three key areas where actions are required:" + "text": "The global nuclear industry, led by World Nuclear Association, is ready to take on the challenge. As part of the Harmony Programme, we have set a target to build an additional 1000GWe of reactors across the world before 2050, bringing the global share of electricity production of nuclear to 25%." }, { - "type": "ListItem", - "element_id": "e18242a460d9d495ea7cffee38c1e647", + "type": "NarrativeText", + "element_id": "33a2aba13d6b228d8d6f792f16caa684", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -213,11 +213,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "° The need to create a level playing field that values reliability and energy security ° The need for harmony in the nuclear regulatory environment ° The need for a holistic safety paradigm for the whole electricity system." 
+ "text": "In order to realise the full potential of nuclear energy we have identified three key areas where actions are required:" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "ListItem", + "element_id": "9209d9a3c8ea19bed487dff9476428ee", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -231,11 +231,11 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "1" + "text": "• The need to create a level playing field that values reliability and energy security" }, { - "type": "Title", - "element_id": "257fa04b9d79fc46da551d720411595a", + "type": "ListItem", + "element_id": "ae74ee3ddcecd2ffb75672d469c80a0e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -247,13 +247,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 3 }, - "text": "The drivers for a clean energy system" + "text": "• The need for harmony in the nuclear regulatory environment" }, { - "type": "NarrativeText", - "element_id": "ca18f74506ddc1bca89179259f3ff4cb", + "type": "ListItem", + "element_id": "93e7dedc9d334470067ad2de1f9ee788", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -265,13 +265,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 3 }, - "text": "Electricity is central to modern life – it powers our daily lives, as well as our dreams and ambitions. Demand has grown steadily for more than 100 years, and will continue to do so as many parts of the world continue to develop, and electrification takes a central role in efforts to decarbonize (Figure 1). 
With nearly a billion people around the world still living in the dark, without access to electricity, humanity has a responsibility to learn from the past - everyone has the right to enjoy a modern lifestyle in a way that does not cause harm to people or the planet." + "text": "The need for a holistic safety paradigm for the whole electricity system." }, { - "type": "UncategorizedText", - "element_id": "b4af08fb653ae7dea99f3a48c2ff7f5d", + "type": "Footer", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -283,13 +283,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 3 }, - "text": "45,000" + "text": "1" }, { "type": "Title", - "element_id": "e29786b8cc565a047639f24f7171c30f", + "element_id": "257fa04b9d79fc46da551d720411595a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -303,11 +303,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Marine" + "text": "The drivers for a clean energy system" }, { - "type": "UncategorizedText", - "element_id": "9925953f1faef050547e5f7b811c3f7d", + "type": "NarrativeText", + "element_id": "ca18f74506ddc1bca89179259f3ff4cb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -321,11 +321,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "40,000" + "text": "Electricity is central to modern life – it powers our daily lives, as well as our dreams and ambitions. Demand has grown steadily for more than 100 years, and will continue to do so as many parts of the world continue to develop, and electrification takes a central role in efforts to decarbonize (Figure 1). 
With nearly a billion people around the world still living in the dark, without access to electricity, humanity has a responsibility to learn from the past - everyone has the right to enjoy a modern lifestyle in a way that does not cause harm to people or the planet." }, { - "type": "Title", - "element_id": "d04999bf99ea28fc8a6b20318caac58c", + "type": "UncategorizedText", + "element_id": "b4af08fb653ae7dea99f3a48c2ff7f5d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -339,7 +339,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " CSP" + "text": "45,000" }, { "type": "Title", @@ -360,8 +360,8 @@ "text": "h W T" }, { - "type": "UncategorizedText", - "element_id": "4ebe55cc1aee6dd892d7182d797d105a", + "type": "Image", + "element_id": "d5aedf7912dfff3c661af8cd17426bac", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -375,11 +375,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "35,000" + "text": "45,000 © Marine 40,000 M™@ csp 35,000 zz Solar PV Geothermal 30,000 ~ Mi Wind 25,000 — Il Bioenergy 20,000 = BB Hydro Nuclear 15,000 — Gas 10,000 — oi 5,000 __ Coal 2000 2010 2020 2030 2040" }, { "type": "UncategorizedText", - "element_id": "422f240e43a3226f329ba4a0236f587c", + "element_id": "81a83544cf93c245178cbc1620030f11", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -393,11 +393,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "30,000" + "text": "2000" }, { "type": "UncategorizedText", - "element_id": "c7e6673590d2426f635c9be70bd8f057", + "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -411,11 +411,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "25,000" + "text": "2010" }, { "type": "UncategorizedText", - 
"element_id": "b6b53b7d4224992f9aa86411bbc3f74b", + "element_id": "73a2af8864fc500fa49048bf3003776c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -429,11 +429,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "20,000" + "text": "2020" }, { "type": "UncategorizedText", - "element_id": "b2ee3509c1fa4f9741f894e592bda9ac", + "element_id": "8e1f192fe25ad49be764c3f55c68beb3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -447,11 +447,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "15,000" + "text": "2030" }, { "type": "UncategorizedText", - "element_id": "28ec039832f5bc96c2be0eaee016dafe", + "element_id": "df34d853f2f2f1f14b92359f695426dc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -465,11 +465,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "10,000" + "text": "2040" }, { - "type": "UncategorizedText", - "element_id": "b2008c37ee3a7cf7ba87f5ad50dd9e11", + "type": "FigureCaption", + "element_id": "578e73d091a9463a76ea7502a6a92503", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -483,11 +483,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "5,000" + "text": "Figure 1. 
IEA projected electricity production and sources to 2040 i" }, { - "type": "Title", - "element_id": "4a60bf7d4bc1e485744cf7e8d0860524", + "type": "NarrativeText", + "element_id": "427b54db6e4b434f92954bc67db93473", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -501,11 +501,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "zz" + "text": "The challenge before us, however, goes far beyond just electricity – we will need to find ways to decarbonize all parts of the economy, and we need solutions that are sustainable in the long-term. That means changing the way we heat our homes and power our industrial processes, as well as ensuring that the way we travel, export our products and ship our food moves away from fossil fuels." }, { - "type": "UncategorizedText", - "element_id": "7ace431cb61584cb9b8dc7ec08cf38ac", + "type": "NarrativeText", + "element_id": "92f6fd6a561b87154049d083b93b611d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -519,11 +519,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "~" + "text": "Despite the very considerable efforts to decarbonize the economy and the countless billions spent, our world remains heavily addicted to fossil fuels. The trend is clear – instead of reducing our dependence on fossil fuels, we are increasing it (Figure 2). As a direct result, greenhouse gas emissions continue to rise when they need to drastically fall." 
}, { - "type": "UncategorizedText", - "element_id": "bda050585a00f0f6cb502350559d7553", + "type": "Title", + "element_id": "a5d60fc4dbbd484074d8389c35703cf7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -537,11 +537,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "—" + "text": "h W G" }, { - "type": "UncategorizedText", - "element_id": "380918b946a526640a40df5dced65167", + "type": "Image", + "element_id": "81fe4504e383e98273c4a560382d82ee", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -555,11 +555,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "=" + "text": "30,000,000 |_| High-carbon HE Low-carbon 25,000,000 20,000,000 15,000,000 10,000,000 5,000,000 1990 1995 2000 2005 2010 2015" }, { "type": "UncategorizedText", - "element_id": "bda050585a00f0f6cb502350559d7553", + "element_id": "a7be8e1fe282a37cd666e0632b17d933", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -573,11 +573,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "—" + "text": "1990" }, { "type": "UncategorizedText", - "element_id": "bda050585a00f0f6cb502350559d7553", + "element_id": "e78f27ab3ef177a9926e6b90e572b985", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -591,65 +591,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "—" + "text": "1995" }, { "type": "UncategorizedText", - "element_id": "9911f4d2b18457c4726664d309385072", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": 
"application/pdf", - "page_number": 4 - }, - "text": "__" - }, - { - "type": "Title", - "element_id": "8af26217282646d0f64d3e3211f47512", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": " Solar PV" - }, - { - "type": "Title", - "element_id": "6e28663850f2b50ee6af2d4477b410be", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": " Geothermal" - }, - { - "type": "Title", - "element_id": "7e2f430d44cfb03dca12ffde615c36ec", + "element_id": "81a83544cf93c245178cbc1620030f11", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -663,11 +609,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Wind" + "text": "2000" }, { - "type": "Title", - "element_id": "bde9df80639b681edb85ace46b4d4600", + "type": "UncategorizedText", + "element_id": "a20a2b7bb0842d5cf8a0c06c626421fd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -681,11 +627,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Bioenergy" + "text": "2005" }, { - "type": "Title", - "element_id": "b449cd843dc44ab907e1e9ed9c30d92e", + "type": "UncategorizedText", + "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -699,11 +645,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Hydro" + "text": "2010" }, { - "type": "Title", - "element_id": "f35457739b3bd74c61625c986c844726", + "type": "UncategorizedText", + "element_id": "a85e9db4851f7cd3efb8db7bf69a07cf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -717,11 +663,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Nuclear" + "text": "2015" }, { - "type": "Title", - "element_id": "0f3341ae76e0d4d7816d3620bd915110", + "type": "FigureCaption", + "element_id": "aa04bda99d06997f39a4b613c2c62be5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -735,11 +681,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Gas" + "text": "Figure 2. Worldwide electricity generation by fuel (1990-2016)ii" }, { - "type": "Title", - "element_id": "b001a2374d44e3085e712bb40f66270e", + "type": "Header", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -753,11 +699,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Oil" + "text": "2" }, { - "type": "Title", - "element_id": "90ad0c8c14253135efd14645e0156145", + "type": "NarrativeText", + "element_id": "d841776bdfaae69274a3c8b898021653", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -769,13 +715,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 5 }, - "text": " Coal" + "text": "We need to deliver a worldwide transformation that is socially, economically and environmentally sustainable. 
We need a system that is affordable – no one should have to choose between heating their home, and essentials like eating – as well as helping to alleviate poverty, and ensure the realization of human potential globally. We need a power source that can not only help us mitigate the effects of climate change and environmental degradation, but can also help bring the enormous benefits of reliable electricity supply to the corners of the world that do not have access to it." }, { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "type": "NarrativeText", + "element_id": "10a72512425bbe7a4cdd6529b0337d90", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -787,13 +733,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 5 }, - "text": "0" + "text": "Nuclear energy is already making a major contribution. By using nuclear energy rather than fossil fuels, we currently avoid the emission of more than 2500 million tonnes of carbon dioxide every year. To put that into perspective, it is the equivalent of removing about 400 million cars from the world’s roads." }, { - "type": "UncategorizedText", - "element_id": "81a83544cf93c245178cbc1620030f11", + "type": "NarrativeText", + "element_id": "030d3154a592248139651c5f8fbef1d5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -805,13 +751,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 5 }, - "text": "2000" + "text": "Modern society is dependent on the steady supply of electricity, every day of the year – regardless of weather, season or time of day – and nuclear energy is particularly well-suited to providing this service. 
Given that the majority of baseload supply is fossil-based, an increase in the use of nuclear energy would result in a rapid decarbonization of the electricity system. The International Energy Agency’s (IEA) recent report III on nuclear energy highlighted the importance of dependable baseload electricity generators and the need to properly value and compensate them for the electricity security and reliability services they provide." }, { - "type": "UncategorizedText", - "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37", + "type": "Footer", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -823,13 +769,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 5 }, - "text": "2010" + "text": "3" }, { - "type": "UncategorizedText", - "element_id": "73a2af8864fc500fa49048bf3003776c", + "type": "NarrativeText", + "element_id": "a53cecd93ffb9ec731b7974f1805e924", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -841,13 +787,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "2020" + "text": "Despite impressive recent growth, the stark reality is that renewables alone will not be able to resolve our dependence on fossil fuels. Clearly, the sun does not always shine, and the wind does not always blow, and this is compounded by the fact that many times these periods coincide with when electricity demand is at its highest, but renewables can be complementary to nuclear energy. Storage solutions, such as batteries, will not be able to power our societies for days or weeks when the weather is not favourable. 
Natural gas is currently the most used solution for the intermittency problem, which only serves to reinforce our economy’s dependence of fossil fuels, and severely undermines the apparently ‘green credentials’ of many renewables." }, { - "type": "UncategorizedText", - "element_id": "8e1f192fe25ad49be764c3f55c68beb3", + "type": "Title", + "element_id": "899a2c517ba69726f3808d66f442e439", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -859,13 +805,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "2030" + "text": "Moving to a sustainable future" }, { - "type": "UncategorizedText", - "element_id": "df34d853f2f2f1f14b92359f695426dc", + "type": "NarrativeText", + "element_id": "a8c17b6aa3cad915f2f7e0126706c2f5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -877,13 +823,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "2040" + "text": "The Intergovernmental Panel on Climate Change (IPCC) special report on Global Warming of 1.5°C iv examined a large number of different scenarios for limiting global warming to 1.5°C. Of those scenarios which would achieve the 1.5°C target, the mean increase in nuclear energy’s contribution to electricity production was 2.5 times higher compared to today. However, the ‘middle-of-the-road’ scenario – in which social, economic, and technological trends follow current patterns and would not require major changes to, for example, diet and travel habits – sees the need for nuclear increase by five times globally by 2050." 
}, { "type": "NarrativeText", - "element_id": "578e73d091a9463a76ea7502a6a92503", + "element_id": "7562e707e991f1fb634fff41f2cae0e4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -895,13 +841,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "Figure 1. IEA projected electricity production and sources to 2040 i" + "text": "The IEA has concluded that without an expanded contribution from nuclear energy, the already huge challenge of achieving emissions reductions will become drastically harder and more costly. In their latest report on nuclear energy v, published in 2019, they also conclude that not using nuclear would have negative implications for energy security and result in higher costs for the consumers. The IEA recommends policy reforms to ‘… ensure competition on a level playing field’ and that the ‘… focus should be on designing electricity markets in a way that values the clean energy and energy security attributes of low-carbon technologies, including nuclear power.’ Such reforms should also ensure that reliability of electricity production is properly valued and compensated." }, { "type": "NarrativeText", - "element_id": "427b54db6e4b434f92954bc67db93473", + "element_id": "1cde21cc10aa769a17ca11aa1e10823e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -913,13 +859,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "The challenge before us, however, goes far beyond just electricity – we will need to find ways to decarbonize all parts of the economy, and we need solutions that are sustainable in the long-term. 
That means changing the way we heat our homes and power our industrial processes, as well as ensuring that the way we travel, export our products and ship our food moves away from fossil fuels." + "text": "As part of the Harmony Programme, the world’s nuclear industry has identified three key policy areas for action to unlock the true potential of nuclear energy - the need for a level playing field, the harmonization of regulations and the establishment of an effective safety paradigm." }, { "type": "NarrativeText", - "element_id": "92f6fd6a561b87154049d083b93b611d", + "element_id": "af2424b7ec665072a2ee0bdcd901e244", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -931,13 +877,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "Despite the very considerable efforts to decarbonize the economy and the countless billions spent, our world remains heavily addicted to fossil fuels. The trend is clear – instead of reducing our dependence on fossil fuels, we are increasing it (Figure 2). As a direct result, greenhouse gas emissions continue to rise when they need to drastically fall." + "text": "In regard to the need for a level playing field, we see that many of the world’s electricity markets operate in an unsustainable fashion, dominated by short-term thinking. Electricity supply which is affordable, reliable and available 24/7 generates broad societal benefits, and as seen in Figure 3, nuclear is one of the most affordable electricity sources." 
}, { - "type": "UncategorizedText", - "element_id": "ebc18f485dc347b842b3d248d011ce6c", + "type": "Footer", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -949,1669 +895,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 4 + "page_number": 6 }, - "text": "30,000,000" + "text": "4" }, { "type": "Title", - "element_id": "a5d60fc4dbbd484074d8389c35703cf7", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "h W G" - }, - { - "type": "UncategorizedText", - "element_id": "dcdc1a65c75197a553fdd90554060414", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "25,000,000" - }, - { - "type": "UncategorizedText", - "element_id": "1476fd07ef61145d484f5a2e0b4e8e7d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "20,000,000" - }, - { - "type": "UncategorizedText", - 
"element_id": "a63634f2c80c7bcc81bc6faad5d53e16", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "15,000,000" - }, - { - "type": "UncategorizedText", - "element_id": "8582d26affb6928525e4f027c2cb8c08", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "10,000,000" - }, - { - "type": "UncategorizedText", - "element_id": "265e4d619f6b21971816b0e4274faf92", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "5,000,000" - }, - { - "type": "Title", - "element_id": "e3cf3e34001852adb7a17cf424bda9fc", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": " High-carbon  
Low-carbon" - }, - { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "0" - }, - { - "type": "UncategorizedText", - "element_id": "a7be8e1fe282a37cd666e0632b17d933", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "1990" - }, - { - "type": "UncategorizedText", - "element_id": "e78f27ab3ef177a9926e6b90e572b985", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "1995" - }, - { - "type": "UncategorizedText", - "element_id": "81a83544cf93c245178cbc1620030f11", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - 
"page_number": 4 - }, - "text": "2000" - }, - { - "type": "UncategorizedText", - "element_id": "a20a2b7bb0842d5cf8a0c06c626421fd", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "2005" - }, - { - "type": "UncategorizedText", - "element_id": "7d12ba56e9f8b3dc64f77c87318c4f37", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "2010" - }, - { - "type": "UncategorizedText", - "element_id": "a85e9db4851f7cd3efb8db7bf69a07cf", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "2015" - }, - { - "type": "NarrativeText", - "element_id": "aa04bda99d06997f39a4b613c2c62be5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": 
"application/pdf", - "page_number": 4 - }, - "text": "Figure 2. Worldwide electricity generation by fuel (1990-2016)ii" - }, - { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "2" - }, - { - "type": "NarrativeText", - "element_id": "d841776bdfaae69274a3c8b898021653", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "We need to deliver a worldwide transformation that is socially, economically and environmentally sustainable. We need a system that is affordable – no one should have to choose between heating their home, and essentials like eating – as well as helping to alleviate poverty, and ensure the realization of human potential globally. We need a power source that can not only help us mitigate the effects of climate change and environmental degradation, but can also help bring the enormous benefits of reliable electricity supply to the corners of the world that do not have access to it." 
- }, - { - "type": "NarrativeText", - "element_id": "10a72512425bbe7a4cdd6529b0337d90", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Nuclear energy is already making a major contribution. By using nuclear energy rather than fossil fuels, we currently avoid the emission of more than 2500 million tonnes of carbon dioxide every year. To put that into perspective, it is the equivalent of removing about 400 million cars from the world’s roads." - }, - { - "type": "NarrativeText", - "element_id": "030d3154a592248139651c5f8fbef1d5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Modern society is dependent on the steady supply of electricity, every day of the year – regardless of weather, season or time of day – and nuclear energy is particularly well-suited to providing this service. Given that the majority of baseload supply is fossil-based, an increase in the use of nuclear energy would result in a rapid decarbonization of the electricity system. The International Energy Agency’s (IEA) recent report III on nuclear energy highlighted the importance of dependable baseload electricity generators and the need to properly value and compensate them for the electricity security and reliability services they provide." 
- }, - { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "3" - }, - { - "type": "NarrativeText", - "element_id": "a53cecd93ffb9ec731b7974f1805e924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "Despite impressive recent growth, the stark reality is that renewables alone will not be able to resolve our dependence on fossil fuels. Clearly, the sun does not always shine, and the wind does not always blow, and this is compounded by the fact that many times these periods coincide with when electricity demand is at its highest, but renewables can be complementary to nuclear energy. Storage solutions, such as batteries, will not be able to power our societies for days or weeks when the weather is not favourable. Natural gas is currently the most used solution for the intermittency problem, which only serves to reinforce our economy’s dependence of fossil fuels, and severely undermines the apparently ‘green credentials’ of many renewables." 
- }, - { - "type": "Title", - "element_id": "899a2c517ba69726f3808d66f442e439", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "Moving to a sustainable future" - }, - { - "type": "NarrativeText", - "element_id": "a8c17b6aa3cad915f2f7e0126706c2f5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "The Intergovernmental Panel on Climate Change (IPCC) special report on Global Warming of 1.5°C iv examined a large number of different scenarios for limiting global warming to 1.5°C. Of those scenarios which would achieve the 1.5°C target, the mean increase in nuclear energy’s contribution to electricity production was 2.5 times higher compared to today. However, the ‘middle-of-the-road’ scenario – in which social, economic, and technological trends follow current patterns and would not require major changes to, for example, diet and travel habits – sees the need for nuclear increase by five times globally by 2050." 
- }, - { - "type": "NarrativeText", - "element_id": "7562e707e991f1fb634fff41f2cae0e4", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "The IEA has concluded that without an expanded contribution from nuclear energy, the already huge challenge of achieving emissions reductions will become drastically harder and more costly. In their latest report on nuclear energy v, published in 2019, they also conclude that not using nuclear would have negative implications for energy security and result in higher costs for the consumers. The IEA recommends policy reforms to ‘… ensure competition on a level playing field’ and that the ‘… focus should be on designing electricity markets in a way that values the clean energy and energy security attributes of low-carbon technologies, including nuclear power.’ Such reforms should also ensure that reliability of electricity production is properly valued and compensated." 
- }, - { - "type": "NarrativeText", - "element_id": "1cde21cc10aa769a17ca11aa1e10823e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "As part of the Harmony Programme, the world’s nuclear industry has identified three key policy areas for action to unlock the true potential of nuclear energy - the need for a level playing field, the harmonization of regulations and the establishment of an effective safety paradigm." - }, - { - "type": "NarrativeText", - "element_id": "af2424b7ec665072a2ee0bdcd901e244", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "In regard to the need for a level playing field, we see that many of the world’s electricity markets operate in an unsustainable fashion, dominated by short-term thinking. Electricity supply which is affordable, reliable and available 24/7 generates broad societal benefits, and as seen in Figure 3, nuclear is one of the most affordable electricity sources." 
- }, - { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "4" - }, - { - "type": "UncategorizedText", - "element_id": "983bd614bb5afece5ab3b6023f71147c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "300" - }, - { - "type": "UncategorizedText", - "element_id": "1e472b39b105d349bcd069c4a711b44a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "250" - }, - { - "type": "UncategorizedText", - "element_id": "27badc983df1780b60c2b3fa9d3a19a0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - 
"text": "200" - }, - { - "type": "Title", - "element_id": "e8dbac2cdc67e714e99baa9c0f6a54b9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "h W M / $" - }, - { - "type": "UncategorizedText", - "element_id": "9ae2bdd7beedc2e766c6b76585530e16", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "150" - }, - { - "type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "100" - }, - { - "type": "UncategorizedText", - "element_id": "1a6562590ef19d1045d06c4055742d38", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - 
"page_number": 7 - }, - "text": "50" - }, - { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "0" - }, - { - "type": "NarrativeText", - "element_id": "4b5ebf5890b9c61b43c5daf4c40cbab0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "C o m" - }, - { - "type": "Title", - "element_id": "8fd5206adbbb7a132889e4161057d4cf", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "m ercial Photovoltaic" - }, - { - "type": "Title", - "element_id": "8e2f99a9826b1b316f7690290f32b31f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": 
"application/pdf", - "page_number": 7 - }, - "text": "O nshore Wind" - }, - { - "type": "Title", - "element_id": "53209d7cc67427ba22ec6d878fc8d421", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Offshore Wind" - }, - { - "type": "Title", - "element_id": "0e6fac6a3ad129a64c2b9d6eaf6680e4", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "SS" - }, - { - "type": "Title", - "element_id": "6dc76d1e1c35d4253537250288157d0c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "N uclear" - }, - { - "type": "Title", - "element_id": "079c085d3cb9f52f2392addf619382be", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": 
"application/pdf", - "page_number": 7 - }, - "text": "C C G T" - }, - { - "type": "Title", - "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "C oal" - }, - { - "type": "NarrativeText", - "element_id": "a5846cd18e790db780cc03f9e5f63278", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Figure 3. Comparative cost projections for main electricity generators vi" - }, - { - "type": "NarrativeText", - "element_id": "9ad4cf48d0b9d0bbfd257214f3d050dd", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "However, markets fail to give due credit to electricity generators, such as nuclear energy, that are able to meet these societal demands. This has resulted in situations where nuclear energy has struggled to compete with energy sources that have been subsidized, do not pay the hidden costs brought on by their intermittency (e.g. 
costly backup provisions and investments in the grid), or do not have to take responsibility for using our common atmosphere as a dumping ground." - }, - { - "type": "NarrativeText", - "element_id": "4b3dad9b769c100e89b2c082e7d9e13e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "In regard to the need to harmonize regulations, multiple regulatory barriers stemming from diverse national licensing processes and safety requirements currently limit global nuclear trade and investment. A lack of international standardization places unnecessary regulatory burdens on nuclear activities and causes delays in the licensing of new designs, hindering innovation." - }, - { - "type": "NarrativeText", - "element_id": "13ff2375260e277c2dfbc8826aa50a65", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Additionally, electricity markets fail to recognize the relative costs of different forms of electricity generation. Whilst the nuclear industry takes responsibility for its lifecycle costs (including decommissioning and waste management), other electricity generators do not. 
Fossil fuel generators are rarely required to pay the price in line with the environmental and health damage that their emissions cause, whilst the cost of wind and solar does not include the disposal of the sometimes toxic materials at the end of their life." - }, - { - "type": "NarrativeText", - "element_id": "0ce74aa5e786157de72d5ae801d86cc4", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "The International Atomic Energy Agency (IAEA) has highlighted the importance of addressing this issue, concluding that the lack of regulatory harmony ‘…causes many drawbacks for the entire nuclear industry, including developers, vendors, operators and even regulators themselves…This results in increased costs and reduced predictability in project execution’. vii It is therefore crucial that we harmonize the regulatory process to address these weaknesses, and avoid unnecessary duplication and inconsistencies." 
- }, - { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "5" - }, - { - "type": "NarrativeText", - "element_id": "2cf9c478a20b21f5792941a179d926e9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "In regard to the need for a holistic safety paradigm for the whole electricity system, we need to consider safety from a societal perspective, something the current energy system fails to do. The health, environmental and safety benefits of nuclear energy are not sufficiently understood and valued when compared with other electricity sources. Nuclear energy remains the safest form of electricity generation (Figure 4). Additionally, the use of nuclear consistently prevents many tens of thousands of deaths (mainly resulting from air pollution) every year by avoiding the use of coal - lifesaving measures which must be better recognised and valued." 
- }, - { - "type": "UncategorizedText", - "element_id": "dbae772db29058a88f9bd830e957c695", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "140" - }, - { - "type": "NarrativeText", - "element_id": "e11247712b3df61756970b45f019ad68", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "r a e y" - }, - { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "e" - }, - { - "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": 
"120" - }, - { - "type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "100" - }, - { - "type": "UncategorizedText", - "element_id": "5bddd069fd77ec5699d9ab00c00f47c4", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "1 :" - }, - { - "type": "UncategorizedText", - "element_id": "2abaca4911e68fa9bfbf3482ee797fd5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "120" - }, - { - "type": "UncategorizedText", - "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 
8 - }, - "text": ":" - }, - { - "type": "UncategorizedText", - "element_id": "b725d20650649a5221675144bab5946e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "99.5" - }, - { - "type": "Title", - "element_id": "f83714d89302473e0e4f5399bd50e7a9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "W T" - }, - { - "type": "UncategorizedText", - "element_id": "380918b946a526640a40df5dced65167", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "=" - }, - { - "type": "NarrativeText", - "element_id": "f9bb49945b60897227abdd75b5f8d39b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 
- }, - "text": "r e p s e i t i l" - }, - { - "type": "UncategorizedText", - "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "80" - }, - { - "type": "UncategorizedText", - "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "60" - }, - { - "type": "UncategorizedText", - "element_id": "ce3201efc2e495241a85e4fc84575f50", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "71.9" - }, - { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": 
"application/pdf", - "page_number": 8 - }, - "text": "1" - }, - { - "type": "Title", - "element_id": "1b16b1df538ba12dc3f97edbb85caa70", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "n" - }, - { - "type": "UncategorizedText", - "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "." 
- }, - { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "1" - }, - { - "type": "Title", - "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "a t a F" - }, - { - "type": "UncategorizedText", - "element_id": "d59eced1ded07f84c145592f65bdf854", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "40" - }, - { - "type": "UncategorizedText", - "element_id": "e7ac0786668e0ff0f02b62bd04f45ff6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": 
":" - }, - { - "type": "UncategorizedText", - "element_id": "911bc18af1665a604b4fa4a97d47f477", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "“99 :" - }, - { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "20" - }, - { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "0" - }, - { - "type": "Title", - "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - 
"text": "C oal" - }, - { - "type": "Title", - "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Oil" - }, - { - "type": "Title", - "element_id": "4fabb98454d019811a732c4a09f31bf0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "N atural gas" - }, - { - "type": "Title", - "element_id": "593cbe414f10662e62c0da03ce3302b8", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "fe)" - }, - { - "type": "Title", - "element_id": "77cf83b127020f3a465005abc747e63f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Offshore 
wind" - }, - { - "type": "UncategorizedText", - "element_id": "77e43ef38dbfcec0511535d9c7dbee5c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "(U K)" - }, - { - "type": "UncategorizedText", - "element_id": "cc6f2aa507f6a1f7de2db7e09ddef042", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "8.5" - }, - { - "type": "NarrativeText", - "element_id": "50a78acc78a3c5b4acc8c439af743a0a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "O nshore wind (G erm any)" - }, - { - "type": "UncategorizedText", - "element_id": "5d48c7c6dce082d397fecf99b8f1ac7f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": 
"application/pdf", - "page_number": 8 - }, - "text": "1.78" - }, - { - "type": "Title", - "element_id": "bbf2011ddebee240452a3ab98416afb4", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "S olar P V" - }, - { - "type": "UncategorizedText", - "element_id": "f1ced6d8a7d437fd3748f56bb2358f9a", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "0.245" - }, - { - "type": "Title", - "element_id": "f280c2a253ebd5a7389dd0790fcbd56c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "N uclear*" - }, - { - "type": "UncategorizedText", - "element_id": "efc293f64a092b9bfe153be9357f9580", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - 
"filetype": "application/pdf", - "page_number": 8 - }, - "text": "<0.01" - }, - { - "type": "NarrativeText", - "element_id": "445676822969fb5177c0081d07449a70", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Figure 4. Comparison of number of fatalities due to electricity generation viii" - }, - { - "type": "Title", - "element_id": "98d83a387e3ac2261daaf8d936bf3e27", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Nuclear for a sustainable tomorrow" - }, - { - "type": "NarrativeText", - "element_id": "1119369ba9a68924c64155762de72d8e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Nuclear energy is already making a significant contribution to providing the world with clean and abundant electricity, and has a proven track record of being a reliable workhorse around the world. 
Countries like France, Sweden and Switzerland have proven that it is possible to divorce economic growth from an increase in damaging emissions and over the timescales required to effectively challenge climate change and environmental degradation (Figures 5 and 6). Nuclear can ensure that fast-growing populations achieve rising standards of living – without having to sacrifice the planet or their own well-being." - }, - { - "type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "100" - }, - { - "type": "Title", - "element_id": "90ad0c8c14253135efd14645e0156145", + "element_id": "e8dbac2cdc67e714e99baa9c0f6a54b9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2623,13 +913,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": " Coal" + "text": "h W M / $" }, { - "type": "UncategorizedText", - "element_id": "69f59c273b6e669ac32a6dd5e1b2cb63", + "type": "Image", + "element_id": "5b5f659ab2c445e9ed688dd79280a53e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2641,13 +931,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": "90" + "text": " a ro) 0 » ec $ Se SW SS is é e » Pe US X? 
oe fe)" }, { - "type": "Title", - "element_id": "3fd264839410c464bf2640d6dbf3ed86", + "type": "FigureCaption", + "element_id": "a5846cd18e790db780cc03f9e5f63278", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2659,13 +949,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": " Gas/Oil" + "text": "Figure 3. Comparative cost projections for main electricity generators vi" }, { - "type": "UncategorizedText", - "element_id": "48449a14a4ff7d79bb7a1b6f3d488eba", + "type": "NarrativeText", + "element_id": "13ff2375260e277c2dfbc8826aa50a65", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2677,13 +967,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": "80" + "text": "Additionally, electricity markets fail to recognize the relative costs of different forms of electricity generation. Whilst the nuclear industry takes responsibility for its lifecycle costs (including decommissioning and waste management), other electricity generators do not. Fossil fuel generators are rarely required to pay the price in line with the environmental and health damage that their emissions cause, whilst the cost of wind and solar does not include the disposal of the sometimes toxic materials at the end of their life." 
}, { - "type": "Title", - "element_id": "9a1f49cd39fe9698fc556924b6b889da", + "type": "NarrativeText", + "element_id": "9ad4cf48d0b9d0bbfd257214f3d050dd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2695,13 +985,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": " Biofuels/Waste" + "text": "However, markets fail to give due credit to electricity generators, such as nuclear energy, that are able to meet these societal demands. This has resulted in situations where nuclear energy has struggled to compete with energy sources that have been subsidized, do not pay the hidden costs brought on by their intermittency (e.g. costly backup provisions and investments in the grid), or do not have to take responsibility for using our common atmosphere as a dumping ground." }, { - "type": "UncategorizedText", - "element_id": "ff5a1ae012afa5d4c889c50ad427aaf5", + "type": "NarrativeText", + "element_id": "4b3dad9b769c100e89b2c082e7d9e13e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2713,13 +1003,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": "70" + "text": "In regard to the need to harmonize regulations, multiple regulatory barriers stemming from diverse national licensing processes and safety requirements currently limit global nuclear trade and investment. A lack of international standardization places unnecessary regulatory burdens on nuclear activities and causes delays in the licensing of new designs, hindering innovation." 
}, { - "type": "Title", - "element_id": "c4fad0ce9772d241be8c8624896ada86", + "type": "NarrativeText", + "element_id": "0ce74aa5e786157de72d5ae801d86cc4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2731,13 +1021,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": " Wind/Solar" + "text": "The International Atomic Energy Agency (IAEA) has highlighted the importance of addressing this issue, concluding that the lack of regulatory harmony ‘…causes many drawbacks for the entire nuclear industry, including developers, vendors, operators and even regulators themselves…This results in increased costs and reduced predictability in project execution’. vii It is therefore crucial that we harmonize the regulatory process to address these weaknesses, and avoid unnecessary duplication and inconsistencies." }, { - "type": "UncategorizedText", - "element_id": "39fa9ec190eee7b6f4dff1100d6343e1", + "type": "Footer", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2749,13 +1039,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 7 }, - "text": "60" + "text": "5" }, { - "type": "Title", - "element_id": "b449cd843dc44ab907e1e9ed9c30d92e", + "type": "NarrativeText", + "element_id": "2cf9c478a20b21f5792941a179d926e9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2769,11 +1059,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": " Hydro" + "text": "In regard to the need for a holistic safety paradigm for the whole electricity system, we need to consider safety from a societal perspective, something the current energy system fails to do. 
The health, environmental and safety benefits of nuclear energy are not sufficiently understood and valued when compared with other electricity sources. Nuclear energy remains the safest form of electricity generation (Figure 4). Additionally, the use of nuclear consistently prevents many tens of thousands of deaths (mainly resulting from air pollution) every year by avoiding the use of coal - lifesaving measures which must be better recognised and valued." }, { - "type": "Title", - "element_id": "f35457739b3bd74c61625c986c844726", + "type": "UncategorizedText", + "element_id": "dbae772db29058a88f9bd830e957c695", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2787,11 +1077,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": " Nuclear" + "text": "140" }, { - "type": "UncategorizedText", - "element_id": "bbf3f11cb5b43e700273a78d12de55e4", + "type": "NarrativeText", + "element_id": "e11247712b3df61756970b45f019ad68", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2805,11 +1095,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "%" + "text": "r a e y" }, { - "type": "UncategorizedText", - "element_id": "1a6562590ef19d1045d06c4055742d38", + "type": "Title", + "element_id": "3f79bb7b435b05321651daefd374cdc6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2823,11 +1113,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "50" + "text": "e" }, { - "type": "UncategorizedText", - "element_id": "d59eced1ded07f84c145592f65bdf854", + "type": "Title", + "element_id": "f83714d89302473e0e4f5399bd50e7a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2841,11 +1131,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "40" + "text": "W T" }, { "type": "UncategorizedText", - 
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", + "element_id": "380918b946a526640a40df5dced65167", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2859,11 +1149,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "30" + "text": "=" }, { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "type": "NarrativeText", + "element_id": "f9bb49945b60897227abdd75b5f8d39b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2877,11 +1167,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "20" + "text": "r e p s e i t i l" }, { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", + "type": "Title", + "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2895,11 +1185,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "10" + "text": "a t a F" }, { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "type": "Image", + "element_id": "0fece208b80790baa3ae323ace21f818", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2913,11 +1203,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "0" + "text": " 140 120 120 1 : 100 99.5 : 80 71.9 1 n 60 . 1 40 : “99 : 85 7g 0245 <0.01 0 : : : > S & 3} cs s\\ é fos < < Qg eS S ew ee © RS Rs ~a S Se fe) we" }, { - "type": "Title", - "element_id": "7a1ca4ef7515f7276bae7230545829c2", + "type": "FigureCaption", + "element_id": "445676822969fb5177c0081d07449a70", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2931,11 +1221,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "France" + "text": "Figure 4. 
Comparison of number of fatalities due to electricity generation viii" }, { "type": "Title", - "element_id": "853637136575897a73cba3c5fb085e8c", + "element_id": "98d83a387e3ac2261daaf8d936bf3e27", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2949,11 +1239,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Sweden" + "text": "Nuclear for a sustainable tomorrow" }, { - "type": "Title", - "element_id": "2275583196d791405892aaca0d87743c", + "type": "NarrativeText", + "element_id": "1119369ba9a68924c64155762de72d8e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2967,11 +1257,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Switzerland" + "text": "Nuclear energy is already making a significant contribution to providing the world with clean and abundant electricity, and has a proven track record of being a reliable workhorse around the world. Countries like France, Sweden and Switzerland have proven that it is possible to divorce economic growth from an increase in damaging emissions and over the timescales required to effectively challenge climate change and environmental degradation (Figures 5 and 6). Nuclear can ensure that fast-growing populations achieve rising standards of living – without having to sacrifice the planet or their own well-being." }, { - "type": "NarrativeText", - "element_id": "fd1b6d076800203a708efab109d9393a", + "type": "UncategorizedText", + "element_id": "ad57366865126e55649ecb23ae1d4888", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -2985,11 +1275,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Figure 5. 
The importance of nuclear in ensuring clean energy systems in France, Sweden and Switzerland ix" + "text": "100" }, { "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "element_id": "bbf3f11cb5b43e700273a78d12de55e4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3003,11 +1293,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "6" + "text": "%" }, { - "type": "UncategorizedText", - "element_id": "284b7e6d788f363f910f7beb1910473e", + "type": "Image", + "element_id": "e56f1d3df6ddf93348f20c095337d639", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3019,13 +1309,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": "600" + "text": " 100 90 IB Coal i Gas/Oil 80 IB Biofuels/Waste 70 i Wind/Solar @ Hydro 60 @ Nuclear 50 40 30 20 10 0) " }, { "type": "UncategorizedText", - "element_id": "0604cd3138feed202ef293e062da2f47", + "element_id": "5feceb66ffc86f38d952786c6d696c79", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3037,13 +1327,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": "500" + "text": "0" }, { "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", + "element_id": "7a1ca4ef7515f7276bae7230545829c2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3055,13 +1345,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": "i" + "text": "France" }, { - "type": "Title", - "element_id": "baa49be4a9f9fab3b991718e0adb565e", + "type": "FigureCaption", + "element_id": "853637136575897a73cba3c5fb085e8c", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3073,13 +1363,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": " Non-hydro" + "text": "Sweden" }, { - "type": "Title", - "element_id": "293e9366a39d6ed33a894e4dbe0b8700", + "type": "FigureCaption", + "element_id": "2275583196d791405892aaca0d87743c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3091,13 +1381,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": "ren. & waste" + "text": "Switzerland" }, { - "type": "UncategorizedText", - "element_id": "26d228663f13a88592a12d16cf9587ca", + "type": "FigureCaption", + "element_id": "fd1b6d076800203a708efab109d9393a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3109,13 +1399,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": "400" + "text": "Figure 5. 
The importance of nuclear in ensuring clean energy systems in France, Sweden and Switzerland ix" }, { - "type": "Title", - "element_id": "30b160442c1de4494644bbb253d47d62", + "type": "Footer", + "element_id": "06e9d52c1720fca412803e3b07c4b228", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3127,13 +1417,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 8 }, - "text": "z=" + "text": "6" }, { "type": "Title", - "element_id": "f35457739b3bd74c61625c986c844726", + "element_id": "563a2980d46c81119e1d7d952b375a41", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3147,11 +1437,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": " Nuclear" + "text": "h W T" }, { - "type": "Title", - "element_id": "f6e172956a9472fa43f9a895f99c2836", + "type": "Image", + "element_id": "77d8044f595648ff9853b27fadd6ef94", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3165,11 +1455,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": " Natural gas" + "text": " BB Non-hydro 500 i ren. 
& waste 400 z= Nuclear Natural gas 300 y -— EB Hydro i oil 200 —— -— BB Coal 100" }, { - "type": "Title", - "element_id": "563a2980d46c81119e1d7d952b375a41", + "type": "FigureCaption", + "element_id": "ff8db11f410c00860c60393cc143175f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3183,11 +1473,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "h W T" + "text": "1974 1980 1985 1990 1995 2000 2005 2010 2017" }, { - "type": "UncategorizedText", - "element_id": "983bd614bb5afece5ab3b6023f71147c", + "type": "FigureCaption", + "element_id": "3b5b3755bac62d7f53eb84cadc34c528", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3201,11 +1491,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "300" + "text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand x" }, { - "type": "UncategorizedText", - "element_id": "27badc983df1780b60c2b3fa9d3a19a0", + "type": "NarrativeText", + "element_id": "4f5cc927b953f3c49c562a22c88f863f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3219,11 +1509,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "200" + "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times." 
}, { - "type": "UncategorizedText", - "element_id": "0b06ee5051e3d7dd686665a41ae1f2d9", + "type": "Image", + "element_id": "36ca9b7cdbbcba729a46487cf86c07eb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3237,11 +1527,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "y ——" + "text": "One fuel pellet contains as much energy as a tonne of coal" }, { - "type": "ListItem", - "element_id": "bda050585a00f0f6cb502350559d7553", + "type": "NarrativeText", + "element_id": "0e28734a89e6f2473c6bbd5c1bdaf50e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3255,11 +1545,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "—" + "text": "Unlike other power sources, nuclear energy helps us reduce our total footprint, going beyond just the environment. When accounting for factors such as cost (e.g. fuel and construction costs), carbon (lifecycle greenhouse gas emissions), water and land footprints, nuclear is far ahead of all other energy generators." }, { - "type": "Title", - "element_id": "b449cd843dc44ab907e1e9ed9c30d92e", + "type": "NarrativeText", + "element_id": "81a65c45b597c6647c9f984f7b2e3554", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3273,11 +1563,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": " Hydro" + "text": "Nuclear energy offers a multitude of services beyond just electricity. With nuclear, we can decarbonize the way we heat our homes, provide process heat for industry, and ensure access to clean water. As freshwater supplies come under increasing pressure worldwide, nuclear reactors can provide desalination, ensuring a reliable flow of fresh drinking water in areas where it is scarce." 
}, { - "type": "Title", - "element_id": "b001a2374d44e3085e712bb40f66270e", + "type": "Header", + "element_id": "7902699be42c8a8e46fbbb4501726517", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3291,11 +1581,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": " Oil" + "text": "7" }, { - "type": "Title", - "element_id": "90ad0c8c14253135efd14645e0156145", + "type": "NarrativeText", + "element_id": "4c23c5c4e459d5f3f6f62cc6a06a816a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3307,13 +1597,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": " Coal" + "text": "Nuclear energy can be relied upon to power the new mobility revolution taking place. Every day, we use almost 20 million barrels of oil to power our vehicles. By swapping to an electric or hydrogen-powered transport fleet – all powered by the atom – we are able to address one of the key challenges to a sustainable economy." }, { - "type": "UncategorizedText", - "element_id": "ad57366865126e55649ecb23ae1d4888", + "type": "NarrativeText", + "element_id": "cd055b546424c5003939bb047a56abf0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3325,13 +1615,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "100" + "text": "We cannot afford to wait – we need every part of the puzzle to contribute towards solving some of the greatest challenges faced by humankind in a very long time. The impacts of climate change will hit the poorest and most vulnerable first, and failing to act will have significant humanitarian consequences." 
}, { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "type": "NarrativeText", + "element_id": "a654080ea22f70c397bca52fee82b82f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3343,13 +1633,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "0" + "text": "Nuclear power is the silent giant of today’s energy system – it runs quietly in the background, capable of delivering immense amounts of power, regardless of weather or season, allowing us to focus on everything else in life. It is a technology that is available now, and can be expanded quickly across the world to help us solve some of the most defining challenges we face. Nuclear energy holds the potential to herald a new, cleaner and truly sustainable world – enabling us to pass on a cleaner planet to our children." }, { - "type": "UncategorizedText", - "element_id": "ec54e99514663edb97adef400fbf34a7", + "type": "Title", + "element_id": "e56261e0bd30965b8e68ed2abb15b141", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3361,13 +1651,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "1974" + "text": "References" }, { - "type": "UncategorizedText", - "element_id": "a2c54f65d066210267b404e8386a7f4c", + "type": "Title", + "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3379,13 +1669,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "1980 1985 1990 1995 2000 2005 2010" + "text": "i" }, { - "type": "UncategorizedText", - "element_id": "46e67c525617663b392a53c0e94ba79e", + "type": "Title", + "element_id": 
"5d7f49449ab22deac22d767b89549c55", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3397,13 +1687,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "2017" + "text": "ii" }, { - "type": "NarrativeText", - "element_id": "0ad07326f56e66781da5dbb9488eaa67", + "type": "Title", + "element_id": "f5557d4fcf727a981a3c315aca733eef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3415,13 +1705,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand”" + "text": "iii" }, { - "type": "NarrativeText", - "element_id": "4f5cc927b953f3c49c562a22c88f863f", + "type": "Title", + "element_id": "0ab306823035661bb8dba21cc2535231", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3433,13 +1723,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times." 
+ "text": "iv" }, { - "type": "Image", - "element_id": "36ca9b7cdbbcba729a46487cf86c07eb", + "type": "Title", + "element_id": "d3fc2842ddfad4c8d3859f84d4439bfd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3451,13 +1741,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "One fuel pellet contains as much energy as a tonne of coal" + "text": "Vv" }, { - "type": "NarrativeText", - "element_id": "0e28734a89e6f2473c6bbd5c1bdaf50e", + "type": "Title", + "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3469,13 +1759,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "Unlike other power sources, nuclear energy helps us reduce our total footprint, going beyond just the environment. When accounting for factors such as cost (e.g. fuel and construction costs), carbon (lifecycle greenhouse gas emissions), water and land footprints, nuclear is far ahead of all other energy generators." + "text": "v" }, { - "type": "NarrativeText", - "element_id": "81a65c45b597c6647c9f984f7b2e3554", + "type": "Title", + "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3487,13 +1777,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "Nuclear energy offers a multitude of services beyond just electricity. With nuclear, we can decarbonize the way we heat our homes, provide process heat for industry, and ensure access to clean water. 
As freshwater supplies come under increasing pressure worldwide, nuclear reactors can provide desalination, ensuring a reliable flow of fresh drinking water in areas where it is scarce." + "text": "vi" }, { - "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "type": "Title", + "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3505,13 +1795,13 @@ "date_modified": "2023-02-12T10:10:36" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "7" + "text": "vi" }, { - "type": "NarrativeText", - "element_id": "4c23c5c4e459d5f3f6f62cc6a06a816a", + "type": "Title", + "element_id": "c1d2906220d1eef1b17422b7132872a8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3525,11 +1815,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Nuclear energy can be relied upon to power the new mobility revolution taking place. Every day, we use almost 20 million barrels of oil to power our vehicles. By swapping to an electric or hydrogen-powered transport fleet – all powered by the atom – we are able to address one of the key challenges to a sustainable economy." + "text": "vii" }, { "type": "NarrativeText", - "element_id": "cd055b546424c5003939bb047a56abf0", + "element_id": "de72de35f0092bdd3107011f3be18dc0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3543,11 +1833,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "We cannot afford to wait – we need every part of the puzzle to contribute towards solving some of the greatest challenges faced by humankind in a very long time. The impacts of climate change will hit the poorest and most vulnerable first, and failing to act will have significant humanitarian consequences." 
+ "text": "International Energy Agency (2018), World Energy Outlook 2018. Data accessed from https://www.iea.org/weo/ – Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the results likely to stem from the implementation of announced policy intentions – with visual modification by World Nuclear Association. International Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=WORLD&year=2016&category=Electricity&indicator=ElecGenByFuel&mode =chart&dataTable=ELECTRICITYANDHEAT – with visual modifications by World Nuclear Association. International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 °C. Accessed from: https://www.ipcc.ch/sr15/ International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ International Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs of generating Electricity – 2015 Edition. Accessed from: https://www.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf International Atomic Energy Agency (2015), Technical challenges in the application and licensing of digital instrumentation and control systems in nuclear power plants. 
Accessed from: https://www-pub.iaea.org/MTCD/ Publications/PDF/P1695_web.pdf" }, { - "type": "NarrativeText", - "element_id": "a654080ea22f70c397bca52fee82b82f", + "type": "Title", + "element_id": "ed171375d0bf81eaa5512140c3a29b8f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3561,11 +1851,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Nuclear power is the silent giant of today’s energy system – it runs quietly in the background, capable of delivering immense amounts of power, regardless of weather or season, allowing us to focus on everything else in life. It is a technology that is available now, and can be expanded quickly across the world to help us solve some of the most defining challenges we face. Nuclear energy holds the potential to herald a new, cleaner and truly sustainable world – enabling us to pass on a cleaner planet to our children." + "text": "ix" }, { - "type": "Title", - "element_id": "69824d3b0e70ca6aaa0da1613b65fd91", + "type": "ListItem", + "element_id": "c5693c397679aaeed0a80ac0c6b6dd20", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3579,11 +1869,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "References" + "text": "x bid." }, { - "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", + "type": "ListItem", + "element_id": "9ec2f70cbe42f5dc5073a88246db2b7a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3597,11 +1887,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "i" + "text": "and NRC SOARCA study 2015 Paul-Scherrer Institute. 
Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)" }, { - "type": "ListItem", - "element_id": "ffc47b19bb43cce8c23421b5c78b17b4", + "type": "UncategorizedText", + "element_id": "5897aff759a5cc8d94710101c73af296", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3615,7 +1905,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "i nternational Energy Agency (20 results Nuclear Association. ii nternational iii nternational Energy Agency (20 publications/nuclear/ 8), World Energy Outloo! Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=>WORLD&year=20 =chart&dataTable=ELECTRICITYANDHEAT - with visual modifications by World Nuclear Association. 9), Nuclear Power in a CI 2018. Data accessed from https://www.iea.org/weo/ — Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the ikely to stem from the implementation of announced policy intentions — with visual modification by World 6&category=Electricity&indicator=ElecGenByFuel&mode lean Energy System. Accessed from: https://www.iea.org/ iv Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 °C. Accessed from: https:/Awww.ipce.ch/sr15/ Vv nternational Energy Agency (20 publications/nuclear/ vi nternational vii International Publications/PDF/P1695_web.pdf and NRC SOARCA study 2015 ix nternational x bid. 9), Nuclear Power in a CI Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs o 2015 Edition. Accessed from: https:/Awww.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf Atomic Energy Agency (2015), Technical challenges in the application and instrumentation and control systems in nuclear power plants. 
Accessed from: https://www-pub.iaea.org/MTCD/ Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview lean Energy System. Accessed from: https://www.iea.org/ generating Electricity — icensing of digital Paul-Scherrer Institute. Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)" + "text": "and NRC SOARCA study 2015 International Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview Ibid." }, { "type": "NarrativeText", @@ -3636,8 +1926,8 @@ "text": "Photo credits: Front cover: Mike Baird; page 2: Vattenfall; page 4: Getty Images; page 5: Adobe Stock; page 6: Rosatom; page 8: Dean Calma, IAEA; page 10: Kazatomprom; page 11: EDF." }, { - "type": "UncategorizedText", - "element_id": "2c624232cdd221771294dfbb310aca00", + "type": "Footer", + "element_id": "aa67a169b0bba217aa0aa88a65346920", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3654,8 +1944,8 @@ "text": "8" }, { - "type": "UncategorizedText", - "element_id": "481e5a54650b0a4ac7bc2568ddad436d", + "type": "NarrativeText", + "element_id": "c48603fd38d3449d3afcd2dc18903083", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3673,7 +1963,7 @@ }, { "type": "NarrativeText", - "element_id": "cff66c7267104eeade830b3dc8675acd", + "element_id": "de49f1c955d7c8a4d1d6d261c1cf21ba", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3690,8 +1980,8 @@ "text": "The Silent Giant © 2019 World Nuclear Association. 
Registered in England and Wales, company number 01215741" }, { - "type": "Title", - "element_id": "2ef1a5c0752085d3a6935132ad9e597c", + "type": "NarrativeText", + "element_id": "821daa4396c0087d9d5ee9240bc5c85c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3709,7 +1999,7 @@ }, { "type": "NarrativeText", - "element_id": "20ef77d9aa66e60f1443750cdbaa9014", + "element_id": "705da4db5e220010ddfd03d9452855e4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json index 715b8fc617..6b44b1a62a 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -18,8 +18,8 @@ "text": "WORLD ASSOCIATION" }, { - "type": "NarrativeText", - "element_id": "1536456ece03fdb7bdbb6b848116dfde", + "type": "Title", + "element_id": "d72f07e2c764ae90417305db928ebce1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -37,7 +37,7 @@ }, { "type": "NarrativeText", - "element_id": "38ae4eaf24988f8ff8a9f5b2eaab7449", + "element_id": "c875f7e098e5ea1b337a189c28e80ac3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -54,8 +54,8 @@ "text": "Putting nuclear risk in context and perspective" }, { - "type": "Title", - "element_id": "e2371e8e756ef68aaf76eb397e9e8f32", + "type": "NarrativeText", + "element_id": "327be60d66a34747047e1365e6bab727", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -91,7 +91,7 @@ }, 
{ "type": "Title", - "element_id": "53d548aa01fc3eb72da15a5be7f235e2", + "element_id": "2fa985d0a50e61b09ec22c447cc4b2c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -145,7 +145,7 @@ }, { "type": "NarrativeText", - "element_id": "ee4ac543bf2035b86b6818e06e3a0a90", + "element_id": "f62c49fcf0a7960d0b509e37507d76d3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -159,7 +159,7 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Expanding the use of nuclear energy is essential for solving some of the biggest challenges facing humanity. Nuclear power has already played a major role in avoiding the emission of air pollutants and greenhouse gases, a role that will have to be greatly expanded in the future to ensure global energy supplies are decarbonized by 2050. Nuclear energy will also play a major part in ensuring that the transition to a low-carbon future is done in an equitable fashion, providing people across the world with a high-powered and sustainable future." + "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." 
}, { "type": "NarrativeText", @@ -181,7 +181,7 @@ }, { "type": "NarrativeText", - "element_id": "f62c49fcf0a7960d0b509e37507d76d3", + "element_id": "ee4ac543bf2035b86b6818e06e3a0a90", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -195,10 +195,10 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." + "text": "Expanding the use of nuclear energy is essential for solving some of the biggest challenges facing humanity. Nuclear power has already played a major role in avoiding the emission of air pollutants and greenhouse gases, a role that will have to be greatly expanded in the future to ensure global energy supplies are decarbonized by 2050. Nuclear energy will also play a major part in ensuring that the transition to a low-carbon future is done in an equitable fashion, providing people across the world with a high-powered and sustainable future." 
}, { - "type": "UncategorizedText", + "type": "Footer", "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { @@ -216,8 +216,8 @@ "text": "1" }, { - "type": "NarrativeText", - "element_id": "f193ae2dc90e6bc6856125ad88fdab12", + "type": "Title", + "element_id": "6b3149c1769f5cd200ec2a0017b936dc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -235,7 +235,7 @@ }, { "type": "NarrativeText", - "element_id": "3cf0a9c5ad0cacc724f90abbe99664d9", + "element_id": "ce5bcf6b4fe24d62bd24d156d5bc965e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -249,11 +249,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "In reality, radiation is a natural part of life; indeed, we are all exposed to radiation every day, on average receiving 2-3 millisieverts (mSv) per year. Most of this radiation is naturally occurring, with radon gas from the ground being the main source of exposure. The nuclear industry is responsible for a very small part of radiation exposure to the public, as seen in Figure 2. To put this into perspective, eating 10 bananas or two Brazil nuts results in the same radiation dose as living nearby a nuclear power plant for a year. Humans are also naturally radioactive, and the radiation dose from sleeping next to someone else each night for a year is ten times higher than the exposure from living nearby a nuclear power plant for the same time span." + "text": "It is widely accepted that humans have skewed perceptions of risks, and the way we respond to them is shaped by these perceptions, rather than the actual threats posed. Approximately 1.35 millioni people die every year because of traffic accidents, in comparison with 257 aviation fatalities in 2019ii, yet more people are nervous about flying, fearing a rare deadly crash, than being in a fatal traffic accident. 
These numbers tell a powerful and well-established story: evaluations of risk are largely the result of emotions, rather than logic or facts. Although it is hard to recognize and accept that our perceptions may mislead us and curtail effective decision making, this is a well-established characteristic of humanity." }, { "type": "NarrativeText", - "element_id": "ce5bcf6b4fe24d62bd24d156d5bc965e", + "element_id": "45e9c81bf6ccdc498a6ac5640d786736", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -267,11 +267,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "It is widely accepted that humans have skewed perceptions of risks, and the way we respond to them is shaped by these perceptions, rather than the actual threats posed. Approximately 1.35 millioni people die every year because of traffic accidents, in comparison with 257 aviation fatalities in 2019ii, yet more people are nervous about flying, fearing a rare deadly crash, than being in a fatal traffic accident. These numbers tell a powerful and well-established story: evaluations of risk are largely the result of emotions, rather than logic or facts. Although it is hard to recognize and accept that our perceptions may mislead us and curtail effective decision making, this is a well-established characteristic of humanity." + "text": "Nuclear energy and the risk of radiation is one of the most extreme cases in which perceived and actual risks have diverged. The fear of radiation, whilst pre- dating the Second World War, was firmly established by the debate on the potential impacts of low-dose radiation from the fallout from nuclear weapons testing in the early years of the Cold War. Radiation in many ways became linked with the mental imagery of nuclear war, playing an important role in increasing public concern about radiation and its health effects. 
There is a well-established discrepancy between fact-based risk assessments and public perception of different risks. This is very much the case with nuclear power, and this is clearly highlighted in Figure 1, with laypersons ranking nuclear power as the highest risk out of 30 activities and technologies, with experts ranking nuclear as 20th. In many ways, popular culture’s depiction of radiation has played a role in ensuring that this discrepancy has remained, be it Godzilla, The Incredible Hulk, or The Simpsons, which regularly plays on the notion of radiation from nuclear power plants causing three-eyed fish, something that has been firmly rejected as unscientific." }, { "type": "NarrativeText", - "element_id": "45e9c81bf6ccdc498a6ac5640d786736", + "element_id": "3cf0a9c5ad0cacc724f90abbe99664d9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -285,7 +285,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Nuclear energy and the risk of radiation is one of the most extreme cases in which perceived and actual risks have diverged. The fear of radiation, whilst pre- dating the Second World War, was firmly established by the debate on the potential impacts of low-dose radiation from the fallout from nuclear weapons testing in the early years of the Cold War. Radiation in many ways became linked with the mental imagery of nuclear war, playing an important role in increasing public concern about radiation and its health effects. There is a well-established discrepancy between fact-based risk assessments and public perception of different risks. This is very much the case with nuclear power, and this is clearly highlighted in Figure 1, with laypersons ranking nuclear power as the highest risk out of 30 activities and technologies, with experts ranking nuclear as 20th. 
In many ways, popular culture’s depiction of radiation has played a role in ensuring that this discrepancy has remained, be it Godzilla, The Incredible Hulk, or The Simpsons, which regularly plays on the notion of radiation from nuclear power plants causing three-eyed fish, something that has been firmly rejected as unscientific." + "text": "In reality, radiation is a natural part of life; indeed, we are all exposed to radiation every day, on average receiving 2-3 millisieverts (mSv) per year. Most of this radiation is naturally occurring, with radon gas from the ground being the main source of exposure. The nuclear industry is responsible for a very small part of radiation exposure to the public, as seen in Figure 2. To put this into perspective, eating 10 bananas or two Brazil nuts results in the same radiation dose as living nearby a nuclear power plant for a year. Humans are also naturally radioactive, and the radiation dose from sleeping next to someone else each night for a year is ten times higher than the exposure from living nearby a nuclear power plant for the same time span." 
}, { "type": "Title", @@ -305,9 +305,27 @@ }, "text": "Rank Order Laypersons" }, + { + "type": "Table", + "element_id": "07e04cdff751f52e042c08c1b265b6f5", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", + "version": 306475068461766865312866697521104206816, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" + }, + "date_modified": "2023-02-12T10:09:32" + }, + "filetype": "application/pdf", + "page_number": 4 + }, + "text": "_Laypersons Experts 1 2 3 Handguns 4 + Nuclear power 20 Motor vehicles 1 4 Smoking 2 17 Electric power (non-nuclear) 9 1 | + + 22 xrays 7 30 Vaccinations 25" + }, { "type": "UncategorizedText", - "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", + "element_id": "4523540f1504cd17100c4835e85b7eef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -321,11 +339,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "30" + "text": "17" }, { "type": "UncategorizedText", - "element_id": "785f3ec7eb32f30b90cd0fcf3657d388", + "element_id": "624b60c58c9d8bfb6ff1886c2fd605d2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -339,11 +357,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "22" + "text": "30" }, { "type": "UncategorizedText", - "element_id": "4523540f1504cd17100c4835e85b7eef", + "element_id": "785f3ec7eb32f30b90cd0fcf3657d388", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -357,7 +375,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "17" + "text": "22" }, { "type": "UncategorizedText", @@ -559,7 +577,7 @@ }, { "type": "Title", - "element_id": "602d25f25cca4ebb709f8b48f54d99d9", + "element_id": "82a60569029ed9032f1b08891e8524c2", "metadata": { 
"data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -573,11 +591,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Motor vehicles" + "text": "Nuclear power" }, { "type": "Title", - "element_id": "82a60569029ed9032f1b08891e8524c2", + "element_id": "602d25f25cca4ebb709f8b48f54d99d9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -591,7 +609,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "Nuclear power" + "text": "Motor vehicles" }, { "type": "Title", @@ -703,7 +721,7 @@ }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "19581e27de7ced00ff1ce50b2047e7a5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -717,11 +735,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "4" + "text": "9" }, { "type": "UncategorizedText", - "element_id": "19581e27de7ced00ff1ce50b2047e7a5", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -735,11 +753,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "9" + "text": "2" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "7902699be42c8a8e46fbbb4501726517", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -753,11 +771,11 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "1" + "text": "7" }, { "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -771,11 +789,11 @@ "filetype": "application/pdf", 
"page_number": 4 }, - "text": "7" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -789,7 +807,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "2" + "text": "1" }, { "type": "UncategorizedText", @@ -829,7 +847,7 @@ }, { "type": "NarrativeText", - "element_id": "3d8430367bf97300ddf3963de02bb5f4", + "element_id": "0d28f703c3b3aa9fee1f9f08fa688409", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -846,8 +864,8 @@ "text": "1 The original study was published in 1978, but its findings have been confirmed by numerous studies since." }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "Footer", + "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -864,8 +882,8 @@ "text": "2" }, { - "type": "Title", - "element_id": "d6acb6d51cfc574936fc79bc06b8a371", + "type": "Image", + "element_id": "aa493f4c5f573e209dc5e56d5e2a341f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -879,11 +897,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Natural" + "text": "Natural Artificial @ 48% Radon @ 11% Medicine @ 14% Buildings & soil @ 0.4% = Fallout @ 12% Food & water @ 0.4% Miscellaneous @ 10% Cosmic @ 0.2% Occupational @ 4% = Thoron @ 0.04% Nuclear discharges " }, { - "type": "Title", - "element_id": "d6acb6d51cfc574936fc79bc06b8a371", + "type": "FigureCaption", + "element_id": "9b657ab0d2ea482c887c7877ba86598d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -897,11 +915,11 @@ 
"filetype": "application/pdf", "page_number": 5 }, - "text": "Natural" + "text": "Figure 2. Global average exposure from different sources of radiation" }, { - "type": "UncategorizedText", - "element_id": "d4a293a7987bc37f4a826e0da1961aab", + "type": "NarrativeText", + "element_id": "4469b98946c004fbae47ad6285c9bba4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -915,11 +933,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": " 48% Radon  14% Buildings & soil  12% Food & water  10% Cosmic  4% Thoron" + "text": "Fossil fuels – currently accounting for around 81% of total energy supplyiv – cause significant levels of emissions in terms of both greenhouse gases and air pollutants. Despite the serious and ongoing health and environmental harms caused by air pollution, it is often considered to be an inevitable consequence of economic development. Air pollution’s contribution to the burden of disease is profound, with an estimated 8.7 million people dying worldwide prematurely in 2018 alonev,vi. Despite this, it fails to induce the same fears and anxieties in people as nuclear energy does." }, { - "type": "Title", - "element_id": "8c3274ea479fd4a25c0b5611a8e48662", + "type": "NarrativeText", + "element_id": "cbf390f564b0b1197deb5bf3dd999291", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -933,11 +951,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Artificial" + "text": "In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. 
In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi." }, { "type": "UncategorizedText", - "element_id": "0f748653e413fbddbb18262352d56b23", + "element_id": "6a3adc54db5128f797d4a12855193373", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -951,11 +969,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": " 11% Medicine  0.4%  0.4% Miscellaneous  0.2% Occupational  0.04% Nuclear discharges" + "text": "24.6" }, { - "type": "Title", - "element_id": "039bede24e51e7c42ce352c25b6427c0", + "type": "NarrativeText", + "element_id": "e11247712b3df61756970b45f019ad68", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -969,11 +987,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Fallout" + "text": "r a e y" }, { - "type": "NarrativeText", - "element_id": "9b657ab0d2ea482c887c7877ba86598d", + "type": "Title", + "element_id": "3f79bb7b435b05321651daefd374cdc6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -987,11 +1005,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Figure 2. Global average exposure from different sources of radiation" + "text": "e" }, { - "type": "NarrativeText", - "element_id": "4469b98946c004fbae47ad6285c9bba4", + "type": "Title", + "element_id": "f83714d89302473e0e4f5399bd50e7a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1005,11 +1023,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Fossil fuels – currently accounting for around 81% of total energy supplyiv – cause significant levels of emissions in terms of both greenhouse gases and air pollutants. 
Despite the serious and ongoing health and environmental harms caused by air pollution, it is often considered to be an inevitable consequence of economic development. Air pollution’s contribution to the burden of disease is profound, with an estimated 8.7 million people dying worldwide prematurely in 2018 alonev,vi. Despite this, it fails to induce the same fears and anxieties in people as nuclear energy does." + "text": "W T" }, { "type": "NarrativeText", - "element_id": "cbf390f564b0b1197deb5bf3dd999291", + "element_id": "f9bb49945b60897227abdd75b5f8d39b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1023,11 +1041,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi." 
+ "text": "r e p s e i t i l" }, { - "type": "UncategorizedText", - "element_id": "b7a56873cd771f2c446d369b649430b6", + "type": "Title", + "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1041,11 +1059,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "25" + "text": "a t a F" }, { - "type": "UncategorizedText", - "element_id": "6a3adc54db5128f797d4a12855193373", + "type": "Image", + "element_id": "226de27a8eeb930616d6b9c4aa4dc574", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1059,11 +1077,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "24.6" + "text": " 05 24.6 20 18.4 S15 10 46 28 5 || 0.07 0.04 0.02 0.01 > SS I ~— ~— es ° & Se es oe oe & ro se s& e as" }, { - "type": "UncategorizedText", - "element_id": "f5ca38f748a1d6eaf726b8a42fb575c3", + "type": "FigureCaption", + "element_id": "8e44807922e69a38594c4b389cd0be54", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1077,11 +1095,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "20" + "text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3" }, { - "type": "UncategorizedText", - "element_id": "dfb6b8c404e0fa2b32def4ba49e00b3c", + "type": "NarrativeText", + "element_id": "bf88d949b16b32347c420a66fa413d49", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1095,11 +1113,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "18.4" + "text": "Contrary to perceptions, nuclear is an incredibly safe source of energy (see Figure 3 for a comparison). 
What is also clear is that the continued use of alternative energy sources in preference to nuclear energy – in particular fossil fuels – poses a far greater risk to public health by significantly contributing to climate change and air pollution." }, { - "type": "NarrativeText", - "element_id": "e11247712b3df61756970b45f019ad68", + "type": "ListItem", + "element_id": "9f9b01127f5b3b297b3759a8e205ad59", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1113,11 +1131,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "r a e y" + "text": "$ Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). Data for nuclear accidents modified to reflect the 2012 UNSCEAR report and the 2015 US NRC SOARCA study." }, { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", + "type": "NarrativeText", + "element_id": "e450813fe6430d87c4caa64e4792bc74", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1131,11 +1149,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "e" + "text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. 3 Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). 
Data for nuclear accidents modified to reflect the" }, { - "type": "UncategorizedText", - "element_id": "e629fa6598d732768f7c726b4b621285", + "type": "Header", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1149,11 +1167,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "15" + "text": "3" }, { - "type": "UncategorizedText", - "element_id": "dca468ba69cda6650ce03d976c274c66", + "type": "Title", + "element_id": "b6812463b15ddda3f2402dfda95d2c86", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1165,13 +1183,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 6 }, - "text": "S15" + "text": "The low-dose question" }, { - "type": "Title", - "element_id": "f83714d89302473e0e4f5399bd50e7a9", + "type": "NarrativeText", + "element_id": "ec0fb27e2a16f77899bf83591cd2d0de", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1183,13 +1201,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 6 }, - "text": "W T" + "text": "Since the 1950s, the Linear No-Threshold (LNT) theory has been used to inform regulatory decisions, positing that any dose of radiation, regardless of the amount or the duration over which it is received, poses a risk. Assuming that LNT is correct, we should expect to see that people living in areas of the world where background doses are higher (e.g. India, Iran and northern Europe) have a higher incidence of cancer. 
However, despite people living in areas of the world where radiation doses are naturally higher than those that would be received in parts of the evacuation zones around Chernobyl and Fukushima Daiichi, there is no evidence that these populations exhibit any negative health effects. Living nearby a nuclear power plant on average exposes the local population to 0.00009mSv/year, which according to LNT would increase the risk of developing cancer by 0.00000045%. After Chernobyl, the average dose to those evacuated was 30mSv, which would theoretically increase the risk of cancer at some point in their lifetime by 0.15% (on top of the average baseline lifetime risk of cancer, which is 39.5% in the USviii, 50% in the UKix)." }, { "type": "NarrativeText", - "element_id": "f9bb49945b60897227abdd75b5f8d39b", + "element_id": "d6bd9451ceee595c090d110656bb1b2b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1201,13 +1219,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 6 }, - "text": "r e p s e i t i l" + "text": "Since the 1980s, there has been considerable scientific debate as to whether the LNT theory is valid, following scientific breakthroughs within, for example, radiobiology and medicine. Indeed, the Chernobyl accident helped illuminate some of the issues associated with LNT. Multiplication of the low doses after the accident (many far too low to be of any health concern) with large populations – using the assumptions made by LNT – led to a large number of predicted cancer deaths, which have not, and likely will not materialize. This practice has been heavily criticized for being inappropriate in making risk assessments by UNSCEAR, the International Commission on Radiation Protection and a large number of independent scientists." 
}, { - "type": "UncategorizedText", - "element_id": "4a44dc15364204a80fe80e9039455cc1", + "type": "NarrativeText", + "element_id": "d8c68c0317a4a3867de201703e068e2e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1219,13 +1237,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 6 }, - "text": "10" + "text": "Determining the precise risk (or lack thereof) of the extremely small radiation doses associated with the routine operations of nuclear power plants, the disposal of nuclear waste or even extremely rare nuclear accidents is a purely academic exercise, that tries to determine whether the risk is extremely low, too small to detect, or non- existent. The risks of low-level radiation pale in comparison to other societal risks such as obesity, smoking, and air pollution." }, { - "type": "Title", - "element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5", + "type": "NarrativeText", + "element_id": "e5dec03340d86adfd26612d5d06ab5e6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1237,13 +1255,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 6 }, - "text": "a t a F" + "text": "By looking at radiation risks in isolation, we prolong the over-regulation of radiation in nuclear plants, driving up costs, whilst not delivering any additional health benefits, in turn incentivising the use of more harmful energy sources. A recalibration is required, and this can only done by ensuring a holistic approach to risk is taken." 
}, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "Footer", + "element_id": "7de1555df0c2700329e815b93b32c571", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1255,13 +1273,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 6 }, - "text": "5" + "text": "4" }, { - "type": "UncategorizedText", - "element_id": "8bf40d0515e8461bd30866c2eb8ac250", + "type": "Title", + "element_id": "3506b7d2b1626663985ae1a521a60fe1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1273,13 +1291,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "4.6" + "text": "Adopting an all-hazards approach" }, { - "type": "UncategorizedText", - "element_id": "c020bad937ece011339d7447ee0ac9fa", + "type": "NarrativeText", + "element_id": "07ed21008ec3f8801f7cbb1fc670d4db", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1291,13 +1309,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "2.8" + "text": "The overall regulatory philosophy, at least theoretically, used in the nuclear industry is the ALARA (As Low As Reasonably Achievable) principle, where any regulatory action on radiation should account for socio- economic benefits and costs, as opposed to making decisions based on radiation risks alone." 
}, { - "type": "UncategorizedText", - "element_id": "5feceb66ffc86f38d952786c6d696c79", + "type": "NarrativeText", + "element_id": "00548dbd288df8370c39789adb302f50", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1309,13 +1327,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "0" + "text": "Contemporary debates around nuclear energy often reflect the precautionary principle, a problematic concept applied across a range of regulatory and policy issues. A ‘strong’ interpretation of the precautionary principle, or a ‘as low as possible’ approach to risk, dictates that regulation is required whenever there is a potential adverse health risk, even if the evidence is not certain and regardless of the cost of regulation." }, { - "type": "Title", - "element_id": "51229f9593cbcb7c8e25059c004d67b0", + "type": "NarrativeText", + "element_id": "ba80f89ec0449fefee24b33fbb7e29b6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1327,13 +1345,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "|| es" + "text": "However, the regulatory process and the policy debate around nuclear more broadly has long departed from the ALARA principle, no longer weighing cost versus benefits, or considering the overall advantages of nuclear energy, but rather looking at radiation in isolation. This has resulted in a subtle shift towards an ‘as low as possible’ mentality. Attempting to reduce radiation far below de facto safe levels has resulted in an escalation of costs and loss of public confidence, and in some cases has deprived communities of the many benefits nuclear energy provides. In practical terms, this has led to the continued use of more harmful energy sources, such as fossil fuels." 
}, { - "type": "Title", - "element_id": "6c25ebfc9ffd2510c4c41d4bd5cb7ea9", + "type": "NarrativeText", + "element_id": "9e9ed8938e271667a9512898d2ca629b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1345,13 +1363,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "C oal" + "text": "If the potential of nuclear energy is to be fully realized, public health and safety approaches must be recalibrated to consider a wider range of factors when considering radiation, adopting an “all-hazards” approach. Such an approach must ensure that risks are placed within a proper perspective and context, rather than looking at them in isolation. We therefore must not look at the costs – be they economic, environmental, or public health – associated with an individual power plant in isolation, but rather the costs associated with it (and its alternatives) at a societal level (Figure 4). This would entail looking at the potential risks arising from the use of nuclear power and comparing these with the risks associated with not adopting nuclear power." }, { - "type": "Title", - "element_id": "2378bdd2cf4f491cf401e6b215cbb4fd", + "type": "Image", + "element_id": "72b1be8b707acf2f917fef7ea176ec32", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1363,13 +1381,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "Oil" + "text": "ae) Plant-level Social and flea productio Grid-level costs environmental costs of at market pri of the electricity emissions, land-use, system climate change, security of supply, etc. 
" }, { - "type": "Title", - "element_id": "3a21fb0158c2ea04834163deee74a836", + "type": "FigureCaption", + "element_id": "567f470fb4fb5c58b115fbe79a425970", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1381,13 +1399,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "Bio m ass" + "text": "Figure 4. The different levels of cost associated with electricity generationx" }, { - "type": "Title", - "element_id": "4fabb98454d019811a732c4a09f31bf0", + "type": "NarrativeText", + "element_id": "6595e50969f899bd2fa05c0d7a8a682c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1399,13 +1417,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "N atural gas" + "text": "A more holistic regulatory process would be required, in which regulators move away from being siloed, looking at specific risks in isolation, with little regard for the greater picture. The move towards an all-hazard, holistic approach would require greater coordination between regulators, ensuring that the combined risks of a specific nuclear project are weighed against the risks posed by not advancing said project." }, { - "type": "Title", - "element_id": "d151346fe7eea3c6a0865199579ca601", + "type": "NarrativeText", + "element_id": "07958b72a8f6127e362d9ce84be7ea54", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1417,13 +1435,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "Wind" + "text": "Equally, the adoption of an all-hazards approach means regulators should consider declaring when a risk is too low to be a public health concern, in line with what the U.S. 
Nuclear Regulatory Commission attempted to do with its Below Regulatory Concern policy statements in the 1980s and early 1990s. In the context of nuclear power, this means departing from the notion that LNT instils of no safe level of radiation, and adopting a regulatory framework which notes the impossibility of eradicating risks. Failing to do so will result in excessive regulation that continues to limit the full potential of nuclear power in tackling climate change and sees a continued reliance on objectively more harmful energy sources." }, { - "type": "UncategorizedText", - "element_id": "91539d7445b231b3612c4f68bd077160", + "type": "Header", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1435,13 +1453,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 7 }, - "text": "0.07" + "text": "5" }, { - "type": "NarrativeText", - "element_id": "5275a384f63ded9bf8541f52dec2c2cb", + "type": "Title", + "element_id": "75ed57ac08703850c3e6aa55ac4aea97", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1453,13 +1471,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 8 }, - "text": "H ydropo w er" + "text": "Recalibrating the risk conversation" }, { - "type": "UncategorizedText", - "element_id": "a888fe9e2469182b8e3e3bca241d3189", + "type": "NarrativeText", + "element_id": "7cb6cd150bb2cc2a0f10ba8584c285c7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1471,13 +1489,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 8 }, - "text": "0.04" + "text": "By looking at radiation risks in isolation, we have created something akin to a 
“radiation phobia”, that both directly and indirectly harms people around the world. For instance, it is well established that the vast majority of health impacts from Chernobyl and Fukushima Daiichi were not radiological, but rather psychosocial. There has been an observable and dramatic increase in depression, PTSD, substance abuse, and suicides following these events, which can be significantly attributed to the dissonance between the actual and perceived risks of radiation, and the stigmatization they caused." }, { - "type": "Title", - "element_id": "d3d1de6bcd7ebe2351be9f53551f7eb9", + "type": "NarrativeText", + "element_id": "5165336fa7f2d57e7fa5030f6b4f6a24", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1489,31 +1507,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 8 }, - "text": "S olar" - }, - { - "type": "UncategorizedText", - "element_id": "a7e46abf169710b34fe8898b950d57ec", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "0.02" + "text": "Similarly, many of the tremendous challenges the global community faces are significantly driven by this “radiation phobia”. Indeed, several of these issues have been considerably exacerbated by the fact that certain risks are given a disproportionate amount of focus, whereas others are de facto ignored. The global conversation around climate change is a prime example of this. 
The historical use of fossil fuels has contributed significantly to climate change through greenhouse gas emissions, causing unprecedented changes in the liveability of the Earth. By 2025, half of the world’s population will be living in water-stressed areas, as extreme heat and droughts are exacerbating water resources. Between 2030 and 2050, climate change is expected to be the cause of an additional 250,000 deaths per year, arising from malnutrition, malaria, diarrhoea and heat stressx. Yet, despite the huge risks associated with climate change, our addiction to coal, oil, and fossil gas remains, with fossil fuels providing 84% of global primary energy in 2019xii. The continued prioritization of fossil fuels at the expense of nuclear energy results in a considerable increase in the risks posed by climate change." }, { - "type": "Title", - "element_id": "6dc76d1e1c35d4253537250288157d0c", + "type": "FigureCaption", + "element_id": "29215d2c137a392941315c6c7a67e8fd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1525,13 +1525,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 8 }, - "text": "N uclear" + "text": "Equally, it is well established that living without access to electricity results in illness and death around the world, caused by everything from not having access to modern healthcare to household air pollution. As of today, 770 million people around the world do not have access to electricity, with over 75% of that population living in Sub-Saharan Africa. The world's poorest 4 billion people consume a mere 5% of the energy used in developed economies, and we need to find ways of delivering reliable electricity to the entire human population in a fashion that is sustainable. Household and ambient air pollution causes 8.7 million deaths each year, largely because of the continued use of fossil fuels. 
Widespread electrification is a key tool for delivering a just energy transition. Investment in nuclear, has become an urgent necessity. Discarding it, based on risk perceptions divorced from science, would be to abandon the moral obligation to ensure affordable, reliable, and sustainable energy for every community around the world." }, { - "type": "UncategorizedText", - "element_id": "312b95ee5a344d2f7a16ad817ff70788", + "type": "Header", + "element_id": "e7f6c011776e8db7cd330b54174fd76f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1543,13 +1543,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 8 }, - "text": "0.01" + "text": "6" }, { "type": "NarrativeText", - "element_id": "a9d31d88b0e2026dbed12c8b5536ab2b", + "element_id": "d754d8d468346f652657279272a11897", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1561,13 +1561,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 9 }, - "text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution®" + "text": "Clearly, we have reached a point where we must establish a new conversation about the relative risks of using nuclear, especially when risks created by other energy sources are considered. We cannot address many of the global challenges we face without a significant increase in the use of nuclear energy. The detrimental effects of decades of looking at nuclear risks in isolation highlights just how crucial it is that regulators and policymakers change the way they view nuclear energy, and transition towards an all-hazards approach, ensuring that actions taken to mitigate risks do not result in creating more severe risks." 
}, { "type": "NarrativeText", - "element_id": "bf88d949b16b32347c420a66fa413d49", + "element_id": "0714f9ff88637006bdb76908c7c936bf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1579,13 +1579,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 9 }, - "text": "Contrary to perceptions, nuclear is an incredibly safe source of energy (see Figure 3 for a comparison). What is also clear is that the continued use of alternative energy sources in preference to nuclear energy – in particular fossil fuels – poses a far greater risk to public health by significantly contributing to climate change and air pollution." + "text": "We must begin to holistically look at the severity of the consequences of maintaining the current energy production system, many of which are irreversible. The ways in which we address climate change and other issues of global importance must be sustainable and not create new hazards down the line. The reality is that nuclear has always been and remains an exceptionally safe source of energy, representing the lowest risk, the most sustainable, and the most affordable ways to generate around-the-clock electricity." }, { "type": "NarrativeText", - "element_id": "1ff44442b3a554331aaf4ffb30b7eda6", + "element_id": "f62c49fcf0a7960d0b509e37507d76d3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1597,13 +1597,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 9 }, - "text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. $ Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). 
Data for nuclear accidents modified to reflect the 2012 UNSCEAR report and the 2015 US NRC SOARCA study." + "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "Footer", + "element_id": "7902699be42c8a8e46fbbb4501726517", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1615,103 +1615,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 5 + "page_number": 9 }, - "text": "3" + "text": "7" }, { "type": "Title", - "element_id": "f5bda7d6ba9ea7120d7f4c11c8b8f1ae", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "The low-dose question" - }, - { - "type": "NarrativeText", - "element_id": "ec0fb27e2a16f77899bf83591cd2d0de", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - 
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "Since the 1950s, the Linear No-Threshold (LNT) theory has been used to inform regulatory decisions, positing that any dose of radiation, regardless of the amount or the duration over which it is received, poses a risk. Assuming that LNT is correct, we should expect to see that people living in areas of the world where background doses are higher (e.g. India, Iran and northern Europe) have a higher incidence of cancer. However, despite people living in areas of the world where radiation doses are naturally higher than those that would be received in parts of the evacuation zones around Chernobyl and Fukushima Daiichi, there is no evidence that these populations exhibit any negative health effects. Living nearby a nuclear power plant on average exposes the local population to 0.00009mSv/year, which according to LNT would increase the risk of developing cancer by 0.00000045%. After Chernobyl, the average dose to those evacuated was 30mSv, which would theoretically increase the risk of cancer at some point in their lifetime by 0.15% (on top of the average baseline lifetime risk of cancer, which is 39.5% in the USviii, 50% in the UKix)." 
- }, - { - "type": "NarrativeText", - "element_id": "d6bd9451ceee595c090d110656bb1b2b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "Since the 1980s, there has been considerable scientific debate as to whether the LNT theory is valid, following scientific breakthroughs within, for example, radiobiology and medicine. Indeed, the Chernobyl accident helped illuminate some of the issues associated with LNT. Multiplication of the low doses after the accident (many far too low to be of any health concern) with large populations – using the assumptions made by LNT – led to a large number of predicted cancer deaths, which have not, and likely will not materialize. This practice has been heavily criticized for being inappropriate in making risk assessments by UNSCEAR, the International Commission on Radiation Protection and a large number of independent scientists." 
- }, - { - "type": "NarrativeText", - "element_id": "d8c68c0317a4a3867de201703e068e2e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "Determining the precise risk (or lack thereof) of the extremely small radiation doses associated with the routine operations of nuclear power plants, the disposal of nuclear waste or even extremely rare nuclear accidents is a purely academic exercise, that tries to determine whether the risk is extremely low, too small to detect, or non- existent. The risks of low-level radiation pale in comparison to other societal risks such as obesity, smoking, and air pollution." - }, - { - "type": "NarrativeText", - "element_id": "e5dec03340d86adfd26612d5d06ab5e6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "By looking at radiation risks in isolation, we prolong the over-regulation of radiation in nuclear plants, driving up costs, whilst not delivering any additional health benefits, in turn incentivising the use of more harmful energy sources. A recalibration is required, and this can only done by ensuring a holistic approach to risk is taken." 
- }, - { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "e56261e0bd30965b8e68ed2abb15b141", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1723,85 +1633,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 6 + "page_number": 10 }, - "text": "4" + "text": "References" }, { "type": "Title", - "element_id": "3506b7d2b1626663985ae1a521a60fe1", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Adopting an all-hazards approach" - }, - { - "type": "NarrativeText", - "element_id": "ba80f89ec0449fefee24b33fbb7e29b6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "However, the regulatory process and the policy debate around nuclear more broadly has long departed from the ALARA principle, no longer weighing cost versus benefits, or considering the overall advantages of nuclear energy, but rather looking at radiation in isolation. This has resulted in a subtle shift towards an ‘as low as possible’ mentality. 
Attempting to reduce radiation far below de facto safe levels has resulted in an escalation of costs and loss of public confidence, and in some cases has deprived communities of the many benefits nuclear energy provides. In practical terms, this has led to the continued use of more harmful energy sources, such as fossil fuels." - }, - { - "type": "NarrativeText", - "element_id": "07ed21008ec3f8801f7cbb1fc670d4db", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "The overall regulatory philosophy, at least theoretically, used in the nuclear industry is the ALARA (As Low As Reasonably Achievable) principle, where any regulatory action on radiation should account for socio- economic benefits and costs, as opposed to making decisions based on radiation risks alone." - }, - { - "type": "NarrativeText", - "element_id": "00548dbd288df8370c39789adb302f50", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Contemporary debates around nuclear energy often reflect the precautionary principle, a problematic concept applied across a range of regulatory and policy issues. 
A ‘strong’ interpretation of the precautionary principle, or a ‘as low as possible’ approach to risk, dictates that regulation is required whenever there is a potential adverse health risk, even if the evidence is not certain and regardless of the cost of regulation." - }, - { - "type": "NarrativeText", - "element_id": "9e9ed8938e271667a9512898d2ca629b", + "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1813,13 +1651,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 7 + "page_number": 10 }, - "text": "If the potential of nuclear energy is to be fully realized, public health and safety approaches must be recalibrated to consider a wider range of factors when considering radiation, adopting an “all-hazards” approach. Such an approach must ensure that risks are placed within a proper perspective and context, rather than looking at them in isolation. We therefore must not look at the costs – be they economic, environmental, or public health – associated with an individual power plant in isolation, but rather the costs associated with it (and its alternatives) at a societal level (Figure 4). This would entail looking at the potential risks arising from the use of nuclear power and comparing these with the risks associated with not adopting nuclear power." + "text": "i" }, { - "type": "Title", - "element_id": "7ec686735b6e51f8276b057051369b15", + "type": "ListItem", + "element_id": "c06ac75f019ceac1ff2baecfc090fd3e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1831,13 +1669,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 7 + "page_number": 10 }, - "text": "ae) flea" + "text": "World Health Organization (2020). Road traffic injuries. 
Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" }, { "type": "Title", - "element_id": "2470c376b60fd11fd9639e0e440ce0f5", + "element_id": "5d7f49449ab22deac22d767b89549c55", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1849,103 +1687,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 7 + "page_number": 10 }, - "text": "Plant-level production costs at market prices" + "text": "ii" }, { "type": "Title", - "element_id": "dde91891334d5ac0e2b4569680eb6f1e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Grid-level costs of the electricity system" - }, - { - "type": "UncategorizedText", - "element_id": "fd38688f30f8b6e597d540ab0134278f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Social and environmental costs of emissions, land-use, climate change, security of supply, etc." 
- }, - { - "type": "NarrativeText", - "element_id": "567f470fb4fb5c58b115fbe79a425970", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Figure 4. The different levels of cost associated with electricity generationx" - }, - { - "type": "NarrativeText", - "element_id": "6595e50969f899bd2fa05c0d7a8a682c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "A more holistic regulatory process would be required, in which regulators move away from being siloed, looking at specific risks in isolation, with little regard for the greater picture. The move towards an all-hazard, holistic approach would require greater coordination between regulators, ensuring that the combined risks of a specific nuclear project are weighed against the risks posed by not advancing said project." 
- }, - { - "type": "NarrativeText", - "element_id": "07958b72a8f6127e362d9ce84be7ea54", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "Equally, the adoption of an all-hazards approach means regulators should consider declaring when a risk is too low to be a public health concern, in line with what the U.S. Nuclear Regulatory Commission attempted to do with its Below Regulatory Concern policy statements in the 1980s and early 1990s. In the context of nuclear power, this means departing from the notion that LNT instils of no safe level of radiation, and adopting a regulatory framework which notes the impossibility of eradicating risks. Failing to do so will result in excessive regulation that continues to limit the full potential of nuclear power in tackling climate change and sees a continued reliance on objectively more harmful energy sources." 
- }, - { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1957,13 +1705,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 7 + "page_number": 10 }, - "text": "5" + "text": "v" }, { "type": "Title", - "element_id": "75ed57ac08703850c3e6aa55ac4aea97", + "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1975,13 +1723,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 10 }, - "text": "Recalibrating the risk conversation" + "text": "vi" }, { - "type": "NarrativeText", - "element_id": "7cb6cd150bb2cc2a0f10ba8584c285c7", + "type": "ListItem", + "element_id": "af64bcc9f6d36d2c339a592dc2ae75ff", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1993,13 +1741,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 10 }, - "text": "By looking at radiation risks in isolation, we have created something akin to a “radiation phobia”, that both directly and indirectly harms people around the world. For instance, it is well established that the vast majority of health impacts from Chernobyl and Fukushima Daiichi were not radiological, but rather psychosocial. There has been an observable and dramatic increase in depression, PTSD, substance abuse, and suicides following these events, which can be significantly attributed to the dissonance between the actual and perceived risks of radiation, and the stigmatization they caused." + "text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP." 
}, { - "type": "NarrativeText", - "element_id": "5165336fa7f2d57e7fa5030f6b4f6a24", + "type": "ListItem", + "element_id": "18b2cdcbf43cbcab942c6ffa69abdc51", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2011,13 +1759,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 10 }, - "text": "Similarly, many of the tremendous challenges the global community faces are significantly driven by this “radiation phobia”. Indeed, several of these issues have been considerably exacerbated by the fact that certain risks are given a disproportionate amount of focus, whereas others are de facto ignored. The global conversation around climate change is a prime example of this. The historical use of fossil fuels has contributed significantly to climate change through greenhouse gas emissions, causing unprecedented changes in the liveability of the Earth. By 2025, half of the world’s population will be living in water-stressed areas, as extreme heat and droughts are exacerbating water resources. Between 2030 and 2050, climate change is expected to be the cause of an additional 250,000 deaths per year, arising from malnutrition, malaria, diarrhoea and heat stressx. Yet, despite the huge risks associated with climate change, our addiction to coal, oil, and fossil gas remains, with fossil fuels providing 84% of global primary energy in 2019xii. The continued prioritization of fossil fuels at the expense of nuclear energy results in a considerable increase in the risks posed by climate change." + "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." 
}, { - "type": "NarrativeText", - "element_id": "29215d2c137a392941315c6c7a67e8fd", + "type": "ListItem", + "element_id": "46c6ddac9c0dadbc38d874f4b35fa235", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2029,13 +1777,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 10 }, - "text": "Equally, it is well established that living without access to electricity results in illness and death around the world, caused by everything from not having access to modern healthcare to household air pollution. As of today, 770 million people around the world do not have access to electricity, with over 75% of that population living in Sub-Saharan Africa. The world's poorest 4 billion people consume a mere 5% of the energy used in developed economies, and we need to find ways of delivering reliable electricity to the entire human population in a fashion that is sustainable. Household and ambient air pollution causes 8.7 million deaths each year, largely because of the continued use of fossil fuels. Widespread electrification is a key tool for delivering a just energy transition. Investment in nuclear, has become an urgent necessity. Discarding it, based on risk perceptions divorced from science, would be to abandon the moral obligation to ensure affordable, reliable, and sustainable energy for every community around the world." + "text": "National Cancer Institute (2020). Cancer statistics. 
Available at: https://www.cancer.gov/about-cancer/ understanding/statistics" }, { - "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "type": "ListItem", + "element_id": "acdfef838c7c3dd2d1d6bfe41f4156e6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2047,13 +1795,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 8 + "page_number": 10 }, - "text": "6" + "text": "Cancer Research UK (n.d.). Cancer risk statistics. Available at: https:/Awww.cancerresearchuk.org/health- professional/cancer-statistics/risk" }, { - "type": "NarrativeText", - "element_id": "0714f9ff88637006bdb76908c7c936bf", + "type": "ListItem", + "element_id": "6febbd0bffa8633c6c188165767c843c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2065,13 +1813,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "We must begin to holistically look at the severity of the consequences of maintaining the current energy production system, many of which are irreversible. The ways in which we address climate change and other issues of global importance must be sustainable and not create new hazards down the line. The reality is that nuclear has always been and remains an exceptionally safe source of energy, representing the lowest risk, the most sustainable, and the most affordable ways to generate around-the-clock electricity." + "text": "United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific Committee on the Effects of Atomic Radiation. 
Accessed from: https:/Avww.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" }, { - "type": "NarrativeText", - "element_id": "f62c49fcf0a7960d0b509e37507d76d3", + "type": "ListItem", + "element_id": "2f9b2ba9ed7265891caea2b618d2968c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2083,13 +1831,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "Therefore, World Nuclear Association calls upon policymakers and regulators to adopt an all-hazards approach, where different risks associated with energy producing technologies are placed in perspective and the appropriate context, and examined in line with the latest scientific evidence. Policymakers and regulators must ensure that their decisions regarding radiation protection do not create greater risks elsewhere. This include the recalibration of existing regulations regarding nuclear power and radiation, weighing the cost of regulatory measures against the societal benefits provided by nuclear energy." + "text": "VIL World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a global assessment of the burden of disease from environmental risks’. 
Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" }, { - "type": "NarrativeText", - "element_id": "d754d8d468346f652657279272a11897", + "type": "ListItem", + "element_id": "0765b3700a8d5cdd4e4cdb9283835ade", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2101,13 +1849,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "Clearly, we have reached a point where we must establish a new conversation about the relative risks of using nuclear, especially when risks created by other energy sources are considered. We cannot address many of the global challenges we face without a significant increase in the use of nuclear energy. The detrimental effects of decades of looking at nuclear risks in isolation highlights just how crucial it is that regulators and policymakers change the way they view nuclear energy, and transition towards an all-hazards approach, ensuring that actions taken to mitigate risks do not result in creating more severe risks." + "text": "OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https:/Avww.oecd-nea.org/jcms/pl_14998/ the-full-costs-of-electricity-provision?details=true" }, { - "type": "UncategorizedText", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "type": "ListItem", + "element_id": "8bfb0188dff570fe23d75b3873051528", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2119,13 +1867,13 @@ "date_modified": "2023-02-12T10:09:32" }, "filetype": "application/pdf", - "page_number": 9 + "page_number": 10 }, - "text": "7" + "text": "xi World Health Organization (2018). Climate change and health. 
Available at: https:/Awww.who.int/news-room/fact- sheets/detail/climate-change-and-health" }, { - "type": "Title", - "element_id": "69824d3b0e70ca6aaa0da1613b65fd91", + "type": "ListItem", + "element_id": "69bd2cd5a46ac8850a9e3ea2df80de60", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2139,11 +1887,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "References" + "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" }, { - "type": "Title", - "element_id": "de7d1b721a1e0632b7cf04edf5032c8e", + "type": "ListItem", + "element_id": "81be06e67a1b533cb1278b15860c51db", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2157,11 +1905,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "i" + "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" }, { "type": "ListItem", - "element_id": "158d56841d65947a9a91a3ca34163a4c", + "element_id": "199440a0821e16b612f4697aa2306cb2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2175,11 +1923,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Vi VIL xi xii World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https:/Awww.bbc.co.uk/news/ business-50953712 Slovic, P, 2010. The Psychology of risk. Sauide e Sociedade, 19(4), pp. 731-747. 
United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific Committee on the Effects of Atomic Radiation. Accessed from: https:/Avww.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018 Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8 World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021] National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/ understanding/statistics Cancer Research UK (n.d.). Cancer risk statistics. Available at: https:/Awww.cancerresearchuk.org/health- professional/cancer-statistics/risk OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https:/Avww.oecd-nea.org/jcms/pl_14998/ the-full-costs-of-electricity-provision?details=true World Health Organization (2018). Climate change and health. Available at: https:/Awww.who.int/news-room/fact- sheets/detail/climate-change-and-health BP 2020. BP Statistical Review of World Energy, London: BP" + "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. 
Available at: https://www.bbc.co.uk/news/ business-50953712" }, { "type": "NarrativeText", - "element_id": "b6c39a9b3890b5132e4310c83d06b310", + "element_id": "10407d498f2636f50597e71d97cc001a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2196,7 +1944,7 @@ "text": "Photo credits: Front cover & pages 1, 4, 6 left, 7 bottom: Adobe Stock; page 6 right: Getty Images; page 7 top: Uniper." }, { - "type": "UncategorizedText", + "type": "Header", "element_id": "2c624232cdd221771294dfbb310aca00", "metadata": { "data_source": { @@ -2214,8 +1962,8 @@ "text": "8" }, { - "type": "UncategorizedText", - "element_id": "481e5a54650b0a4ac7bc2568ddad436d", + "type": "NarrativeText", + "element_id": "c48603fd38d3449d3afcd2dc18903083", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2233,7 +1981,7 @@ }, { "type": "NarrativeText", - "element_id": "36d3613fc20527bb317afd4e447d1c74", + "element_id": "fc5faebaec5a1349ce932f1863bdd842", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2250,8 +1998,8 @@ "text": "Recalibrating risk © 2021 World Nuclear Association. 
Registered in England and Wales, company number 01215741" }, { - "type": "Title", - "element_id": "2ef1a5c0752085d3a6935132ad9e597c", + "type": "NarrativeText", + "element_id": "821daa4396c0087d9d5ee9240bc5c85c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2269,7 +2017,7 @@ }, { "type": "NarrativeText", - "element_id": "20ef77d9aa66e60f1443750cdbaa9014", + "element_id": "705da4db5e220010ddfd03d9452855e4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d71d465e92..3d63527b85 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev9" # pragma: no cover +__version__ = "0.10.19-dev10" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 75c15a3e36..7fa34f1d84 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -617,7 +617,8 @@ class Footer(Text): "Page-footer": Footer, "Page-header": Header, # Title? 
"Picture": Image, - "Section-header": Header, + # this mapping favors ensures yolox produces backward compatible categories + "Section-header": Title, "Headline": Title, "Subheadline": Title, "Abstract": NarrativeText, diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 5c1c3cbfd4..ddd6e7b845 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -70,6 +70,13 @@ RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) +def default_hi_res_model() -> str: + # a light config for the hi res model; this is not defined as a constant so that no setting of + # the default hi res model name is done on importing of this submodule; this allows (if user + # prefers) for setting env after importing the sub module and changing the default model name + return os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "yolox_quantized") + + @process_metadata() @add_metadata_with_filetype(FileType.PDF) @add_chunking_strategy() @@ -329,11 +336,7 @@ def _partition_pdf_or_image_local( ocr_languages = prepare_languages_for_tesseract(languages) - model_name = ( - model_name - if model_name - else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "detectron2_onnx") - ) + model_name = model_name or default_hi_res_model() pdf_image_dpi = kwargs.pop("pdf_image_dpi", None) extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False) image_output_dir_path = kwargs.get("image_output_dir_path", None)