From 77f07baf83c720f67585c14a8983dd3ad7d416a7 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Tue, 8 Aug 2023 11:44:25 +0200 Subject: [PATCH 1/2] Move test documents to text/test/data from widget tests --- MANIFEST.in | 5 +-- .../corrupted/sample_pdf_corrupted.pdf | Bin 7455 -> 7015 bytes .../data/documents/good/minimal-document.pdf | Bin 0 -> 16978 bytes .../data/documents/good/sample_docx.docx | Bin .../tests/data/documents/good/sample_odt.odt | Bin .../tests/data/documents/good/sample_pdf.pdf | Bin .../documents/good/sample_txt_\305\276.txt" | 0 .../widgets/tests/test_owimportdocuments.py | 30 ++++++++++++------ 8 files changed, 23 insertions(+), 12 deletions(-) rename orangecontrib/text/{widgets => }/tests/data/documents/corrupted/sample_pdf_corrupted.pdf (62%) create mode 100644 orangecontrib/text/tests/data/documents/good/minimal-document.pdf rename orangecontrib/text/{widgets => }/tests/data/documents/good/sample_docx.docx (100%) rename orangecontrib/text/{widgets => }/tests/data/documents/good/sample_odt.odt (100%) rename orangecontrib/text/{widgets => }/tests/data/documents/good/sample_pdf.pdf (100%) rename "orangecontrib/text/widgets/tests/data/documents/good/sample_txt_\305\276.txt" => "orangecontrib/text/tests/data/documents/good/sample_txt_\305\276.txt" (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 9469f3249..f055a3756 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,12 @@ recursive-include orangecontrib/text/datasets *.tab *.txt *.metadata recursive-include orangecontrib/text/models *.ftz recursive-include orangecontrib/text/sentiment *.txt -recursive-include orangecontrib/text/tests *.txt *.json *.pkl *.udpipe +recursive-include orangecontrib/text/tests *.txt *.json +recursive-include orangecontrib/text/tests/data * recursive-include orangecontrib/text/tutorials *.ows recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai recursive-include orangecontrib/text/widgets/resources *.js *.css *.html -recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu *.csv *.tab *.tab.metadata +recursive-include orangecontrib/text/widgets/tests/data *.txt *.conllu *.csv *.tab *.tab.metadata include orangecontrib/text/widgets/tests/bow-test recursive-include scripts *.sh *.py diff --git a/orangecontrib/text/widgets/tests/data/documents/corrupted/sample_pdf_corrupted.pdf b/orangecontrib/text/tests/data/documents/corrupted/sample_pdf_corrupted.pdf similarity index 62% rename from orangecontrib/text/widgets/tests/data/documents/corrupted/sample_pdf_corrupted.pdf rename to orangecontrib/text/tests/data/documents/corrupted/sample_pdf_corrupted.pdf index 1dfbf06e6ac2fa19b1faba00d0c365183e5f3315..c0d513236277f3258bc516135b8e4edf3845cabe 100644 GIT binary patch delta 105 zcmV-v0G9usI_EZ!9+4ofvCw`2lTZTglRyLRvoHj10h6Ew?z4~vegTsZ2yl~-31qYO z3AzQ7vJcd=h!DO3vq%y?0ken`ZUK|{6o`@*wvjOyv(Oes0kaqwjscUX8Sb-~8Ug{6 LoEv(x7#s%y2x2Kj delta 561 zcmV-10?z&CHlI3>9s*2dkshuBZe+2{egOiR@RLdc?gH~SlQ;wJ0#4$yCIoH)0%L-c zlm+erYlO3c27UnoUZIi*aDTw)d(T<%H7g!xyrdQTt>|Qo99_2Jql}ecYLXQjm;hMu z9mdTvdIO_ev%vi1)Q&JlKjWQaba~N7PC;Ng=a`{6<|fOy6q;wz#|=xzO^ZiLJ@#Bi z{}e~zL2IoUn#$SL+*Ixv*>gjaxdd%3=B9NGuJu=&#g#@B%`UT8U4M_YXtr7Knzx!R z++c6l9P6-G{>#nl-dbRGKMats;=ak*>80j|hq$ zlJ=f7C>MoUn^RJ>)2cb9oVzffe9wOav`4Eb3T19&b98cLVQmU!Ze(v_Y6=QAATS_r zVrmLCI5s#6Wo~3|VrmKsHy|(|Z(?c+JUlN{d2nSfMsIF(L}hbha%pgMZ*nh2Z*Fu> zVQm3rlT;M8ku4gtA{9mf0#tIdXcmqE0xwCE_80B~JUp}H7y<#4^e^Qv^a+Nr{8 zN{G9FGm(xPx~}@SLaLv+K8&V4y@CmEI_tZ7fmHAL-qPg*Gg`A1H7<>DXrw*K@Fdb4f819EH7p zL8{?9W1>evzrqZAH0d(5LBlq$xG7~^35dWir{!?n1xJT|bVL3HIi(E|u1hZb;ooA* zm>KR0hSW-M9rg2=XsHTIAsMuWi?L)^IKa_6AZMNQWMqb;=CBD*WN9h^uhkzl;twz2 zB*Ueq2E>`_hF|-8FvTHnB?QwPj8Or}s6uR1MrgF=x%z!tOs+K&I|;t)EpG{1yGC&= zKsggpWJF|(ULbLCpS|Q^D{h5yQx38Zn35Ojf|N2ULW(965hE2IF##^T#Y{!r&O|jF ztEjnr`3n{cAI)}GoTF^4jfGk4Bw-4peF|Z{U2--d7D!9U+s<&a~e@)RBXyHbA${hy^9(+aAfqrFNC)lb`|eRy*d1LE#(pEa_b zZO~8HzfC9@h`bfp*(qJO=f?Tqf9mt=t@^5_?t05;8`Hj$R%xxAKv~Hck-?o7NP^o} zBV#jF^#$Mvs>-GYKurzKOij&AgqqUPT#0*T&rFm;Hnu*z0XX$r3*cxTA90x}J23Q2 zE7uPQ6YtsxT;~X|*7m;c@usN>cvV&9^NW18Hw7Qh;MmFrPQd_dg4;)cE<)u9E>3T5 zEls`JYWj1EC}1rHZg6;bX!ym9OJD`>)WXIL2#g}5Q!VgfPo0sz2DpSXRV7%X_p2JH zHaD@noRAwbJUAG%KC~0E-oLJt7K^xNcx?r$0NVa1Yg_Q-4?h+dMaEjt&qXw50#u%z z>Gi{3-Rb^;<-s9L5D(@x&CGy4I(tkz1Lt_=JzK{=8PPlfqAph{X6p@SEjHnAQ~E*0RU2o4^uTcHPHabb?^8AWNvhD_U~jj zWGilDCH8>+OzeOXkresc`sCKoZg&8mU$8V){U>PtEQU8HV+DXMZcf1C5?{CvWpu*=u6w74XL}HJ9@aod=C*)dpMp=0jE>-d zI@#L-etf@fU+h9A#=vRXne9L_f~hM8L;e-L>4Iqf`>liSEiWKv^*xqxuzeq+r~L9@ z_76<%0GpoQ$Ump=4P%skbMeV$ZkJb-*MSycf4rmhCCiBELs>3#Qk%QQi@ z;=lN&UIVi>0ND46zS(N~fj@HN5B=xmc0UjJ%bivr$1!rdj;Hy`-S->B* z8fa{5WBSM6+f+w(2TTyf#_~1Sw#hA_=>;U6TDvnt^Sd~;_X^8jCOw-Kqv&eS=zFvo z$iRy`N9GQNoRoG8h2LQjN zRg(qW^M}$10GfHN|Ctu_ZY2l5Z=fIZJ`VZ#1gI|j$M6IB-T_$S=r_oN6M)9*SI~zC z04>QMf&S0%6aEmm0pln9!I6)!ec`+O6MlAm+AnkipoYM2pgpwaPtuRr_{8M=#r4iR z`N6c=H~rhl;)(uUOvZKZ9yWWYe?Pm^%Wq%ny1~2pajT(;so8)hrXQ~K-3}s zO{**JuE^zGt=`TLfJW92=x&Dl54sQW%>(_LTHo`(j@mZ=b>#T!PwGw0_#wjz?EODp~wI|2j?~=<3vp>Szd5>fd#z z(Nq(M5fCnk+jjbSbySII0=*3LE}TlWIv(@d?ZkJIEnb^!uB-q4An~?UGoc6DinE~n zj-p&87VW1n4x^f5xk7`-Sw)S^$iq9*PJS`KD@h}s5O)LSz4cx7 zYE{hDDNW8t+CzuOxWqa#M$#O&CRjmp(okQ`cv?@WHKtx24kz}$}HCR?et9uFTP+b5g!iq)0w^Y#}J-Cn!a4X@bcg z%n?hb{qn3+jy4_!4eKz`8|oH0S~4w9{w`z<7;saKkh+qNUIzXMVxGQ6XE%zYw<6Se zI5r0J$&(u%CMbRt!ERW65#RFkG~`VU2S1v5(DIRq&&792`%|QlNCDa^4U*EFbGK~3 zFqPtwC`EqjK&uTSkLDmb;bJ|z7B6xE{-vS)l-l5@ShIK@YgRQ+sKps+n#F4lfrSDM zX=T#g*sso2S&`5z-1jWc(_ilxk&+FiwR{FS_I;;Hjs^LiZ*L^)E~ zmC>E)b40#O>p}>AvoL#D9xwWs2}y-TGz!YsDJ&4NCkhN2NtE|ZA2@&mzZM&K-Fh?S z#)2@Od0y?XjOjHspCWoA42zXs4`K*ndFl3nvWuO->!ehyb=*(RPWfX^_582H3mYIv zn-QPzI;ld^v^}_AyB2yuOoUf`xRuu89YVD}(MKxdBD@wqCQbla;UDW$gJC2*?gTKD z)OnMdadbw5?io_kO&d>hKzs>)R!G&(v(*|yUFc9%h;#P~0i_!0a1kpULYXUG$I>Gi zPpRYbKj2gly(0U!8mkJ^q~=M}@SP zl|U@zo@89`W@jX4k;rsK8UN(J^U6r*@$0$*RnhFS~eu`5guHv#Uv7b8zEBXDIevK5F>79 z@S?<3@5-47%(7`5QWvT@bYDP_E~5b2%FWFD$~xX!gX6^OwFnpP)9~*v|Lay=k)8F@ zc|Sm$?tRZroWG#&&IP?>Z@GS`a^OO~ILXkFVmbcnrKbKEyxnV#CfnB`r{h>o;9VPE zIH<+>Fuon;jZ5z-_ge#gH^Vtmivs9NACt1{1 zlcwUvO$1rL%5?on!I^P~=SI3>Io)JshiNJ$)j2vCHbGWGvIW~M*L8?Fcc$>P4FJf~ zvo*%U%AI1`U>V~+RDA{UV=l%H?jIucqQ`T~+B!7bj>x?Aq_1@zrTEdINJXaq?iZa> z6ZM2PMtYbwzyFQXyS>EUI^|ehfJV6)i+%h34H2r!|6(D(VO==Is=kD7KS6<`t<;f@ z4UT46@12Nu$CWUiDX(ge%!TuLTW8rY1QQ7f*!%6rL24{}wJ`2{*Hg;Vo!f>BQFKxL z!N?hWUEkg^Znvkf(+F3_=7p9}{}FNF7Ef;WD~*Kqw>RLL_e-dP(`P*o{nggtbOGy+ z22wE#iag#_vD>+z?uZM|cQcvx#Ai+NErh&9Jegyd+flph$+*;gzTb?wI3(~LUpB1N z&bG}89`;B_ep`}Pyw3j7SP%MI87sD4h`NSpuaVSnM;e^)uff+bMh!c?C2z#=K?nI{ zO}5lKus0rmL8Vkz0G{!^Ck^}39_4{oj0t)H8Fi>vHA~J>NSb8Np~8{nk%Lyrt{*GkvTvPV&8#lc#0px_%3g&kxSuqFjRWf4)%VtE5NpuOwG*b9A-hBo zKfpbCMMh*_y8e2noYCA{6LKrq~KT%db-92lS3ND^YzIQf?0>2gX8j1WvPYFWyqSha2Q z@WT|5B8s+H4`qD~n{^Fbj`$pp+*x}keC}4_bd6)SmLe1wloAIt9Tg9_(lVbSV zA>a!~$u2m5txgGUaP>V5^AWc6D=%Jl7()6oMQ!~V*uchsA3!#ATa<4l$|XUsJSS|UfehaWM_2zyt< zz83CQS)-CPP85ZkxbD_YIwJ&3KEA&`O2WahZJqFW^dax4r}IIKXBpX4C7~dnk=ABt7e|kid(MxK_=x}Tn*H9 zDn7uO+E=W`^OA%KQcX27u6@8rv4=Ef?`+nzV?2A8J!I|alBR4On&ak@{?zgPl`eA;CS3~C=hA}i`+b}k zxV-J}3b~>iR`9z;_3x{4<>Y%4A7BdgBzc!197v9gP9F2p&l+UD0yX#T=OorJ8VN17 z!Qcwh=%OgcUTTuD)LV|3Ii8n_?(zi~Gaqaf7IC$Cm8&@Geb>jcl=+d}Z)DTAS-Hd; z?6O#48bEEC(6Vj^2VK` zD+z*|_ctTpS5#xBb5`}$4bI3pEJYHcs9f{Lr0NwsMlo@`f2{HnzITGJD)USfOnZyc z+X+l6?AYiNuiuh)8Jb2qVahQ7+#u08{QE?!%WI=^GkJwPw`MM=+gKX+)5mJ0*ke@v zhGJ@lc$4bIXL)Y`=$ZPWll1zQN!|u^_0vdT&r0wc>* zZ;7x2fbLH{b(Zpcv|sKY^}vpOMXuzW7F|(P!?YvCzVdlXA_g&Qj!-ef$yAL9*q?Qd z?_3b07|9;Ftkj7J(W-3cAg^Bc>i88B7D6a;{IzcdM;Qq6vN8Cyl1rH19M@e1mvJO& zfYMO;Ae%Lx>E1&q+Rte)kKkiXMf8V&R8=btgDOyO&Ea!hopphRYA)GFPqqw!7K1Ki zLJz}H+OcIhy)0VEPV_!jsrb;dWQN^K7F86I24vp-X|kG;IZ2hgsg*-{AS|!a5x&SF zMZ(OAIu@J^BXMuWZ2?I61t2V?b<=uxm7~PNfF&<5(f5UC8v3eq5y?_D+cP%#RfmpY ztGDm7%kA5DU8tyHz%-K2^ok@WClE(@HegC%{=!CWuIwOsaaRMir~H!h;3Gz12^&X| zkv|Lyu&LK*0*}u zmB0bWxL`+6GCI1sMWg~W^_SSyXr)Ql^HBtKC7 zELdWf&VIa``UFL;*GvTlQf-pFcRn`=f>5DECPdqumQ1l6sS^llW&`uBJZ6WgOZj`< zbODko?ivt!(o|N!nGAi=U>TyI_?w*P0;|;Xy1zg!Vdgc2>TP)V0axzpHQ7$?n!Qiu zbZobTh>ib#;?&*8*tGso-g;@Zd%q2lbK<`U%YLNKr4^0`5Gu_DF4rg0COfJf)<)Zo zc7h*K>9N(`SAh?7&Lq8JMXSc+*0j5LuY4{1 zZO4@|?4E2|79%Lm6E>2Zjzuqp)WF%7Y7L6fBcuEon~u`EjlZgcvDEdgkjE;$fzC!C zp+Mf*4E4a~A2#YP143PH&FFE4*jKg2w-0YxHy$V3*=LY-4&Fi1wx5*-RzC=$84=F0 z%~VN`W3I{FwgdvdN@uhXTW;wj)e=X$?#hw4_egYjNN}w=9lEUexPr%q>0Om*cf^!5 z?b~LhX+Y&K=x7+4BIvdminYlmwsY|N8c_p{W${WzcySIkU2CLhJ!&i<@uRp7VgbmZ z+`^{EkYkk|--d8d?L+f|!D$t`9pm86j!1RV2nVaZ%+u~v1#_#tdD3Z-&&ifCJmVKMu%(Gu#I^fdfFt!)hnnB?I>nSdV?j{b1Em$t{MaxH8X?tJqFc0F**mosU64%Q zqpc4qXWYTPov%Qiocqx!pX7II+2z(-`#b(}*w9aBfVd>d0bqIM2JR!i2d(QV6v^9) z>nXa^rimxhdtBy_}c%X=y9 znWUft!=Jfd`zhQv8d#>=+XEoK)XiLd%ixgf-j7JD*;QMh9LiV_ zlKJ8(FNvFeL|zUKjh8=j6deLM@?5O4JyQPOtL6{Wo*dQG_G@#^zU2QQksHqYTE*0* zSz%#?I8wK@QjeB#Ef9;)sZkIxcf#vw>S@Ah)qsqCt45#B{jOODuKd|=-CW+~bg&KV zyV)r?nW-*^XzLbq<1wRjPG9(r`S@2Krjsn|ND!_${T!=Q{b#IIG$I0-XMo)0J48LV zxc`3rR!gc<#B&VZmt;T{RC7%?7xe?oZ@2Rb4V-V|VcgHO?$CRD6a3pZ@Kk#+orJ;&h=L?w79zVxyaE97d&dC9vK8}2J*DR~ z!X<+SHC9j=P(HGRLn)S8sTuLd30wJT(X@h0eI0}YZGWGqcvU}rXePs2aQiI>Oh40b zGx)q^#;F4f2I#usCch1bYwqdgk&Fa%4go|Nb-Z@_aIChV1V>xsIfNAuQyOxK=+WECH%%OFKs%11m5x;&EP)2cK<{`g?_pNY^H{{~G9-o>%8vDCj5Z z)5PpbrJ)$=J-Al#>_%&fOO#WVZw9xzu+c5*fPC6wC9}PbgjeG9l79IhVL4I95F@wd z0JKks_;bkj`G(;j+j=fx$L$wVi0(G0=64`J(dEuGh4=+@go7qNTTVec8h&m>+D~#m zl2&bN16kR8@w7Y$ZdVhH(TIa^^F8doO1-P7OiUF07hO>$98(KVW|w&ld*jT5z;~Uk z>*!pQOeH*?TY5`s4ids4pth&GKo~A+QWLIOc(KHZ&T`?QE{O(;T+C$_8G#iYv7E&e z66R6ZmhjQAq)QERxY+{@x1JqLxY=UM*TJI#J|>sKw5QKice! ztPlIqzJ@|t>7~%Ep+_p_;@08iYEBS`K|%vv{eK1P3AH)Eu)KC*U z8&=aWcw529YJHb@fw&sAB;-M z7`!YpYk%hJhuzZ&4ffQyMH{goW#3!m|5azhmT!(#Z-c{Krx2Bv3nP4B7B6*L50a^r~V1(y@Z#52y%ker^Mt73>c zO=dusZz5@H)BNs1KietqL_gKT>M1wg$*4R!K(ck6&u04%w@SUcib02!x;dn^{c1vUFXgNuRGNOMN>;%HJ);RC8c^nu%Aftq)D#xF{Z;S|7YGIRx&F#=T@6<~^ zIGbVN;(S*J6O9oq>c=sMi7;;n0D0K!qI({AJC^+u-36hZD?uB;VY~b5$fSf94aP0r*UFMXGnv4+2HNP9 z$>fXVm%B&Ir|(2l-418Y?gHJ1xL-Q3n3(%9dfvc{`^+ z^dtz=;?8QPR)D2&ebW6hj2B>&ay7yfVHeB^D7>|pbLg+!79cIi8`P z3Ys;Tm=JC*4|_Ww;`|EfmJ3wYNAFPD@*+u4m8Pl6{aeFaO!&(D5Mmwa6!gc3${Q<^ zd1sPTb1rMGXSS_i(FJ<}?0(^BCN1Byrt0c?7owqSU%fef!>gZ%m!A*82WwkqdGNmn zhM>A8pc3qO3bTg2^GL`4EVYjI;A0{hb1ca9X5BjjtCNw&e;m4rSSad6S7V6hP%|cj z2WfT!OrRaI6nrH&7MLzAu<}BNA&f&Sw&0guD~8I87?ITC!JalSqg?Vr8HZ*oP$ft<;~V zhnUVQ^iU~P#0FV0ed*U_Yb^sEh0e*JB7+~xj#dCmWQ#nrI8(9?;kUg6YlTgczC{yh~LkRwLE^Knd&s*4v2qG!zG)nM1`xCxolY|w~u*vrB@xH zm}q-(`zO-3r(@ zU5M1|>X_j@?W!q{pcrDt-THPZ?D>2y&X33~A7CckRfRS58Y$56H{cW6M#+Cg5moOM zJeY+6J~qBs2V+lH#CotdGz+(Ld$%-k&KXof9ST6O?Cnm4lXc`Zoh`N9s|J|9Uu7>_ zXpxT#PU)P-#CwN1ViBx^s718wRpdJ=cKEbM01XwEKT{KYl_)y|J!|vR_V>5v*PcuF zXwRsMYPmdSLh179k>{w6+B4$j(NQHEEemi?p4M~&QJrc{YVL70#i3Wkpc7%fE;Lpt zQeqz@O&V5B)&rmeZ^nAS5SV5u=wu9OyU9 zw9sVUNOL%33<8(dTZjItT`XiazpELZgqbS=yw1nU1)J;r zrs-ca=;13#ztA-?EqM2N;|BAF+;(>8$Pt-VYsMNFt|*$0^R3~m8BSKekv0K%KlTKC zZJxWvspco;Ad86po_9d~mx$jP+72q!Pwbp`3H6JHG;~uh>UcuIz;jHb;1n^l?NSnG z*wXoZFzVD!ebkRKq8Er*VoM6v?Ih3`^{p<;XBTira#x&7PYw@D*flJq=a5O;c-kqG zt2G`is)ZLT?|i&Mc(I>e%Ca7H%Y?Qw*Z?DjNJ?f37J_x#?^HMsIy896 zW!arV{Bfb@vgEX)5vek@y&bwxt=X8#ny9oG8rQk_*PJXGgIxM_4Cr^1>G zkG^*ZmGgeED|p1H51$1bFwIH@33U@qd*C+rLD^np4(0w#zLz-eg3M%C{FAisxschx zt{6w+@u&yJDWbdlmBnN{YXea0R%R5F^LiHjXncw<`Ryw9a+z!Jq?R0 zsZ3G>@fLqi9KH{MqtG!YD4)ZdF;cIU;N)>PGluY($G3*6eh~QO)AyCT?{k8E?ul!e z9NI$;mk>vZa%AKd(|ILj=@F@msVJFM74_DGg4|6?(8E0 zLz_3cz~G5m#0#2nI||>cEAJ=bK(ufTmK5>f>5OS%L^46&K004GGdxKTlKj`)paB=Y z2y?6^naH!ye7yHS6b9#NM@$8axNa@1rd`lzv;48ajQQ~^a}yO<7!zgV$t9#=%4V<( zCltk)%E#Pq0k9I3_zfI=)&w-)a?;i1eErZbLw-r*L5G*LsYnH#sHs!K+o2U6bEM*6%(X1YhC#KHNu`6X)9q98_1`F#tU2=sO!U9pKx) zoE2mSW&KVelpb}}@r~hH^E{bTB4l#Al2+Q01db@PatDGoOovfs!&X|zQ1-3J9R#9D zVOl{g5I#sXPqh|Gn~_sbM7$^q3m#cG5CHbTV!gS5{80dofmrF5T305ENN6)di`gj4 zfA<_J#=9g_B9M4H5$m&K?k6DVk$E>Y(#RRBUeRIW76s$@24&bdr*55CM()h0Kdwkx zKo(&yYtW>l?VF4t9bdNoq%x;$JHlU?75u5%U`0pRSw-T*qnC#mE6hIwY7#b z7H`}ykIee{f~hLh%k(*!TmRD{M}}RMxA$%vbGSC}k@)Q_vAkLghBnyd&HpNxq3=`? zZ9xdufxRlEM+8~E6~0vplA(cr->6SlJSDBl&(ZB&eipA+t$fdyE0m5=9q<}(Hr3cd zr^+p~Kew(n;{wPH}vZy?$9xQV0RYcxWtNfJ5vtX}%JA|@iuu%l8M37hVgHF9Di zd}`E-?5LjA=aOw;JU~Pf!iO*o&Udqgka6j7RyHQAR(l}eNam!?fp@4ppmzJFL!g%u z5<%%jS}zQv*{6ekeiA(*EHP$wP&t#bf+(CZ@hhQUahoBCva|=%}MXh~E;@v(VWniy)OKKQ7bVVJ@!_d^SeOgxC&bvsxFw7b0}q@SGpZ zZJ>n{82miP2J2Rs#e#^x^1b_#tPNcOV}5#ZF5Aj46l=wTFkSgEEUc}2rLX2PL2(*_ zl!WGdSeq>s0?L=CuHS?Qp1spMEeAcvcOj$N*fvq7}L-nmcGY?HlwsuJuYq`J4OLX-UZfQ{a+%g&y|sgtVe zIt(1`xvyi(ft85=imrctMq==OQ8PYU5n6tKS`9es=&xHqihsuDQKx-3iB#ZDyHeaY zq*(nPISg#RXMp?!@%R*e?Qtu;6q|(kxBy$3{eB@E#sb{ruxgqS4?j_t`SBph?;!YS zRO{d6C_KK9k05)K_R;-#jT9b$ALW^H#8I<9xst>XLRyiHO`ro^@FK%5ko3YE&NNxL z)PFjC=v-N2hEsM5y>@&t`%uB90E*4^d0HAjL{*w&D!&lUY#R!XqRmqgVF~7UN*;gZ zN+0!txGXb3%zQ2lsBp(7ckxP1b1m{m0Q<2XlFm)@{_-U-kI%3ZgnsMFtyM@=3yQyg zhUFlUH`jwAGl0XeXQjV7ayie@D^bEKM%NdTY)50htSC9e&(pMG8*7s+XVumkDq`vx zZ3MlM;{**Cc~Y~0^E#ng2+?U7m*j40Qc2Hlz5b;EGKV-ZnM$sAR-5*|YEf#G4GF`T z>TIqrlXZ;&tfNQLN%>rRQvlgKhXyC+ENLbEJ|mIz*?S4J3$I8osgb$7`F5<|m}@j{ zsmW>4YL^ES#_&1LKoI#oY!E3J%la7pD_83!rIrW#hrsxFIDu)06v}IrFbWoiLMb*d;O4Z z2pUehvVUlSJ@BSF$gzvGb_V^aM!y!qN+F`v2@9>$X06=_>fE`-_-F5`nqU-WeiD|)ZngXjMma?>n@De? zS!Cp;v?N%$_z2(HK6f%pc$z<~sXDfAG7m4$h|}ox0*xyDXkvx0Wz?oCBnf7FC$on# zZq&8wj6uley7J9yZET_IMF=VVsxrYU;LVpu^tbeBl|X~`b$rrogl%iU!cKR?e; zllA9EK>&Gk1tvnYW)D%Ng9rOJuctc2PK&@)Z=$lnK-hS5|zZw=YaBtHj>OIO+9R=aqaGqzqpT6NJoOaow^SP5YyoaF{43vhS^*dIGV; z4i&BB`+K4x9(Rh(ED>2L5M$MUb8-tuDfr;=ff*hn0yR8DL ze#FPz4qOArE5p_XR94_^N#}LUb_NCohz5x7KqeCt$ljK3512|BsQ}`HYapqS%N zhu_S**w!Q^aAx4yEi1zI*||uOwO%+Op_9V|UbD}2Sb`nI{T<8Gs92QeaPk5a@tA=fj(+@$=e>?V_*PD4B; ztH{qLvZaqJ7mhSe@i-OYL65lTOs8%ox%=T+TZaHZ-8v+o$C?Ovq%hx6^hidP=d;4p z2g9f&1agbV8Cy&|tXFOep@nF8&2A+#;eloej5`55=5v*75%b2R$z_Q zu+d$AQRHVzZ%~=Ac10sQtuH{b}Oj z6ua1SHA=mZcvQATt3-Ya~EX2PI%Wk@LsW1#k08y zuq0B83IqGR;Fl2S#J|-0z!S|$gk#Xx8TOvupYFk1*N4z4@?4k91aWI&z34+jvkULS zO^bRAR*$Zg0*x=8xJ#PfCi?F0(7JDNQ`X=ZiB(029MoSPZYLNd@00DH#i=)~NjW68 z(v2;d=97}N6w%5i*dy7<1vKzO;6Ju$TP3s?5r#vfNXJI*&X}e%TM+b^T!`a>*jDzm zkM(cGlZm!%v}(;Jz2WtuHy zt0KO$kVj(c>HjQ~>s|Y3H~afyNe+4PeX9oyW1^1tApnW9$rl&R0<{l(=%K|{p0<3qQ69_j@9Ooz+-F%@lAPuG+U@eA3VyR7381{IH>?oS@_GfpK zQmGK(4S@$KQe|7~WKO@b4%aY9OM#v#dNw8Q(}W{xNLO3-1f@}@ws>!bBsk7wq#o~? zO$#M1*A^MaikFV!uT1^9?aCvVtpwG^Zn4j2z!I&ukKrG8_O8Dl1NQ1T?S($1kj%33 zW{gkGid>=cOJjCe`L>LlxB=*Dcvgr9Ih7>(zrL*#-Mhf9)_t7^p-kmghN$j=bCJ~X zi(q+h@~~^K%niQ(0?npeL~>ft%3J!Pt)vVpkhll!ZmUwI=1V9V9TFA^UBz+T_q#E$F@PZ=pddIQc?`kZvxL6HbyW2~lv z@JAe`f;LLl@A-xB$Yx0_Ee%Y#k2!aCGPqN)Snf?9Zxr$rAkOk>X#|OaQ*+Kv8I|v@ z#0>mMF;l?`5nr0@F(9<@gjUT*n-4eQbP5EJliHjh(`IYeovMAW&`kPSclZM`;7jT7 z3nnw^F*m7TdEmlxZR@M=z>m=p42$CZ)kzy*LtFQdg`ic9kU?jj)^`ot?fFU{kC#p0 zJ91siybeg&)DT#HyJQ7RTDRt=Z;;bJhTAMYkG((6WP zQ9SPDJ#>2tpiPz?ONGe+5k3gJo_LYYPScDR5E|+_&)?K(_~=M@5TV|NCfo>PHsIhN zR(n(>sC3B$byau|Pw$v=Y{KEOtM>MJwY5Unj4Cl0M%7s+Fg^Mb3`D)Q^ph`3iqk>L zqtqDxQh3@wKV1HqKi4z{KM7)BLXTP}eM0Gu2C4X=*55VyyTmqUGJi(<-5p${8 z1D_Ik#J)u>gE$v9`>5|aBOU*j+Ao^z)<88dz(K^% z=7M+zi9Pu_`i(`I&K*_qPb;LjXpbCDCpuGIidu@8n6Epj?F;+rg%n-U&_=KbmrOoI z3mZh;Z)&k~!Tg-jo9l;6_;S8O*X`uLIKX95i3bRBogREQ^R%l6al*(nF|Zy(|0KW< z?!zG7uc_&n{RPJ-UaI>}lwK8eA9WS5Z-kdf_=7iy3|Hqun=HAULgz~Ti7<`%PWvQ~ zBBI`fesRQoTixLn8yr(G12PMG?2n|(k$)QWj#X1ZHf3i1i0_l%X?yv^%npv{-KecS?>M49`?ZfcOhi(xl*C1mMiXP*!%EwFMJ+EhlUEhFtdld^J#){wd;j)K5p zK`iJf;Egkpz4l&Si9~G7F^(XXe9vuTrabrqP7@Bf8P#%)1 z0P|qzy>Pd^K1RdU(}qTE=xim4)g+lAMm`L4zpg%x*F%`Y1N@&cN_*=)$Wy^!4roC5 z7ZPLxNu4v~8Q@iBgHu7X*h8K%94%1YpaH_D;0q`ZIcwZe&%^e^Ib4)GqAYgA{$2w8 z;=`oUDMzRS34@3Imy|#|?2qCi>!|?bfje9!c~ChCUSS6zE@D{Tc~T&bq;L*I4n;w~ zd_t;mbgz-Dx4HNu6>xq+zgRxVa~Jn=j`JZ;s(t6eqS+WIU%RELQ!!CKcB+?l#fX1f zGJNcj4F!4Px?S9UH;+Y;n>^(YZ1NeqCz4UMo(m7wIWA|9Mdp0+To0ZL^|MKGJ~>ZY zr;_wOIZgixHcln~qw3XjA!Ig*&NqkY<}n#Ew^ZhX$F%V>yIsHMv-r?Haabtei}2tjx4bbZq}^?EhyXfsCb_DZxLh7%2&?9SmJ8 zoJ|c0-2Ur9Hacdye{=c=i~kqa%lf~)0~JpPQv!N|O^H$7wA*BW@qJala~woi?06$0polMG=6VVwl57PO#M9As=puD9;}(8= zEhI6P>-ftn5kEF4d=Y+x004|C2qQzKfG~nXLXs#9LbczAQTIpMp6l7cm(Qy2v}y^D zE~8`BbvJ+@v^Y$9o3)f}6?a~9u#m38W~Zy4j6k?dD0dVQk(pmnJ5ECjs+0uG$JzK@ z`5^clh2c0dk^nv)Ja>~!{~V7k`I8M)=du;Ah0J2`a7|tSVjtS!lmX($yFGl8x1}?> zeKu@c^MK(GhXjT6R5dG7Pap;47eqlkwg}Pp+JVTB4;Udc1Ul|L0!kFvuR&uyaA&xT z2nJTvAv=c_$@lJ46OmcjXjl)wu7tsOQF2cgS5#k?_z*!}<3j>*eInOXxEn-12Cpc^ zc`_V84PhW`cOmtt|^>v8FPJ9QukE0P`caIno@B$tCCMnH@dAX2!dtryJmGtNJx`C zZ;y8j`~?UchrIf~xaEy%{$Cjp0&O04MouIEVR$)eVVNNktW+6dN zK?V_FL3RdV5fKJHg8#Y6zfu1Cpa$muau9<5nRYieFnF6C>YIQVf{BLtg85%_nALi5 zosi3BCPQK6%Sr;XXI?xd=(5@H)-(eHD=lLNhGI4rH_U)3E=epZsVD*lg`t@la6`VT Js;j>n7XV!=Y7PJZ literal 0 HcmV?d00001 diff --git a/orangecontrib/text/widgets/tests/data/documents/good/sample_docx.docx b/orangecontrib/text/tests/data/documents/good/sample_docx.docx similarity index 100% rename from orangecontrib/text/widgets/tests/data/documents/good/sample_docx.docx rename to orangecontrib/text/tests/data/documents/good/sample_docx.docx diff --git a/orangecontrib/text/widgets/tests/data/documents/good/sample_odt.odt b/orangecontrib/text/tests/data/documents/good/sample_odt.odt similarity index 100% rename from orangecontrib/text/widgets/tests/data/documents/good/sample_odt.odt rename to orangecontrib/text/tests/data/documents/good/sample_odt.odt diff --git a/orangecontrib/text/widgets/tests/data/documents/good/sample_pdf.pdf b/orangecontrib/text/tests/data/documents/good/sample_pdf.pdf similarity index 100% rename from orangecontrib/text/widgets/tests/data/documents/good/sample_pdf.pdf rename to orangecontrib/text/tests/data/documents/good/sample_pdf.pdf diff --git "a/orangecontrib/text/widgets/tests/data/documents/good/sample_txt_\305\276.txt" "b/orangecontrib/text/tests/data/documents/good/sample_txt_\305\276.txt" similarity index 100% rename from "orangecontrib/text/widgets/tests/data/documents/good/sample_txt_\305\276.txt" rename to "orangecontrib/text/tests/data/documents/good/sample_txt_\305\276.txt" diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py index 2930747cb..d4997039b 100644 --- a/orangecontrib/text/widgets/tests/test_owimportdocuments.py +++ b/orangecontrib/text/widgets/tests/test_owimportdocuments.py @@ -8,20 +8,25 @@ from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments +DATA_PATH = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "tests", "data", "documents") +) + + class TestOWImportDocuments(WidgetTest): def setUp(self) -> None: self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) - path = os.path.join(os.path.dirname(__file__), "data/documents") + path = os.path.join(os.path.dirname(__file__), DATA_PATH) self.widget.setCurrentPath(path) self.widget.reload() self.wait_until_finished() def test_current_path(self): - path = os.path.join(os.path.dirname(__file__), "data/documents") + path = os.path.join(os.path.dirname(__file__), DATA_PATH) self.assertEqual(path, self.widget.currentPath) def test_no_skipped(self): - path = os.path.join(os.path.dirname(__file__), "data/documents", "good") + path = os.path.join(DATA_PATH, "good") self.widget.setCurrentPath(path) self.widget.reload() self.wait_until_finished() @@ -29,16 +34,23 @@ def test_no_skipped(self): def test_output(self): output = self.get_output(self.widget.Outputs.data) - self.assertEqual(4, len(output)) + self.assertEqual(5, len(output)) self.assertEqual(3, len(output.domain.metas)) names = output.get_column("name") self.assertListEqual( # ž in sample_text_ž must be unicode char 0x17E not decomposed # 0x7A + 0x30C as it is in file name - ["sample_docx", "sample_odt", "sample_pdf", "sample_txt_ž"], + [ + "minimal-document", + "sample_docx", + "sample_odt", + "sample_pdf", + "sample_txt_ž", + ], sorted(names.tolist()), ) - texts = output.get_column("content") + # skip first document - it contains different text + texts = output.get_column("content")[1:] self.assertListEqual( # ž in sample_text_ž must be unicode char 0x17E not decomposed # 0x7A + 0x30C as it is in file name @@ -99,9 +111,7 @@ def test_conllu_cb(self): self.assertEqual(len(corpus.domain.metas), 4) def test_info_box(self): - self.assertEqual( - "4 documents, 1 skipped", self.widget.info_area.text() - ) + self.assertEqual("5 documents, 1 skipped", self.widget.info_area.text()) # empty widget self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) @@ -124,7 +134,7 @@ def tests_context(self): # change default to something else to see if language is changed self.widget.language = "Slovenian" - path = os.path.join(os.path.dirname(__file__), "data/documents", "good") + path = os.path.join(DATA_PATH, "good") self.widget.setCurrentPath(path) self.widget.reload() self.wait_until_finished() From 1cb57344443122d2dd4d52b6745ddb2298991d6d Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Tue, 8 Aug 2023 11:47:49 +0200 Subject: [PATCH 2/2] ImportDocuments - Replace pdfminer3k with pypdf --- orangecontrib/text/import_documents.py | 30 +++-------------- .../text/tests/test_import_documents.py | 32 +++++++++++++++++++ requirements.txt | 2 +- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 0e10eba05..e18a9e754 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -25,10 +25,7 @@ from Orange.data.util import get_unique_names from Orange.misc.utils.embedder_utils import get_proxies from Orange.util import Registry, dummy_callback -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTTextBox, LTTextLine -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdfminer.pdfparser import PDFDocument, PDFParser +from pypdf import PdfReader as PyPDFReader from requests.exceptions import ConnectionError from orangecontrib.text.corpus import Corpus @@ -130,28 +127,9 @@ class PdfReader(Reader): ext = [".pdf"] def read_file(self): - with open(self.path, 'rb') as f: - parser = PDFParser(f) - doc = PDFDocument() - parser.set_document(doc) - doc.set_parser(parser) - doc.initialize('') - rsrcmgr = PDFResourceManager() - laparams = LAParams() - laparams.char_margin = 0.1 - laparams.word_margin = 1.0 - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - extracted_text = [] - - for page in doc.get_pages(): - interpreter.process_page(page) - layout = device.get_result() - for lt_obj in layout: - if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, - LTTextLine): - extracted_text.append(lt_obj.get_text()) - self.content = ' '.join(extracted_text).replace('\x00', '') + reader = PyPDFReader(self.path) + texts = [page.extract_text() for page in reader.pages] + self.content = " ".join(texts) class XmlReader(Reader): diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 4aaf96402..3032fe331 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -16,6 +16,7 @@ TxtReader, TextData, XmlReader, + PdfReader, ) @@ -296,5 +297,36 @@ def test_error(self): os.remove(fp.name) +DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "documents") + + +class TestPdfReader(unittest.TestCase): + def test_file(self): + reader = PdfReader(os.path.join(DATA_PATH, "good", "minimal-document.pdf")) + res = reader.read()[0] + exp = ( + "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam " + "nonumy eirmod" + ) + self.assertTrue(res.content.startswith(exp)) + + path = os.path.join(DATA_PATH, "good", "sample_pdf.pdf") + reader = PdfReader(path) + res = reader.read()[0] + self.assertEqual("This is a test pdf file", res.content) + self.assertEqual("sample_pdf", res.name) + self.assertEqual(os.path.join(path), res.path) + self.assertListEqual([".pdf"], res.ext) + self.assertEqual("good", res.category) + + def test_error(self): + reader = PdfReader( + os.path.join(DATA_PATH, "corrupted", "sample_pdf_corrupted.pdf") + ) + res = reader.read() + self.assertIsNone(res[0]) + self.assertEqual("sample_pdf_corrupted.pdf", res[1]) + + if __name__ == "__main__": unittest.main() diff --git a/requirements.txt b/requirements.txt index 8522e6c28..ccdc4fa48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ orange-widget-base >=4.20.0 orange-canvas-core owlready2 pandas -pdfminer3k>=1.3.1 +pypdf pyqtgraph pyyaml requests