From 5d20d56e48c79faa564a0588997807e904c3c788 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 25 Aug 2020 22:50:31 +0530 Subject: [PATCH] Prevent taking max of an empty set --- camelot/parsers/stream.py | 80 +++++++++++++++++-------------- tests/files/blank.pdf | Bin 4909 -> 0 bytes tests/files/empty.pdf | Bin 0 -> 2353 bytes tests/files/only_page_number.pdf | Bin 0 -> 2437 bytes tests/test_cli.py | 4 +- tests/test_errors.py | 26 ++++++++-- 6 files changed, 67 insertions(+), 43 deletions(-) delete mode 100755 tests/files/blank.pdf create mode 100644 tests/files/empty.pdf create mode 100644 tests/files/only_page_number.pdf diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 3749028..39a0464 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -121,6 +121,7 @@ class Stream(BaseParser): row_y = 0 rows = [] temp = [] + for t in text: # is checking for upright necessary? # if t.get_text().strip() and all([obj.upright for obj in t._objs if @@ -131,8 +132,10 @@ class Stream(BaseParser): temp = [] row_y = t.y0 temp.append(t) + rows.append(sorted(temp, key=lambda t: t.x0)) - __ = rows.pop(0) # TODO: hacky + if len(rows) > 1: + __ = rows.pop(0) # TODO: hacky return rows @staticmethod @@ -345,43 +348,46 @@ class Stream(BaseParser): else: # calculate mode of the list of number of elements in # each row to guess the number of columns - ncols = max(set(elements), key=elements.count) - if ncols == 1: - # if mode is 1, the page usually contains not tables - # but there can be cases where the list can be skewed, - # try to remove all 1s from list in this case and - # see if the list contains elements, if yes, then use - # the mode after removing 1s - elements = list(filter(lambda x: x != 1, elements)) - if len(elements): - ncols = max(set(elements), key=elements.count) - else: - warnings.warn( - f"No tables found in table area {table_idx + 1}" + if not len(elements): + cols = [(text_x_min, text_x_max)] + else: + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if len(elements): + ncols = max(set(elements), key=elements.count) + else: + warnings.warn( + f"No tables found in table area {table_idx + 1}" + ) + cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] + cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right + ] ) - cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] - cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend( - [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > left and t.x1 < right - ] - ) - outer_text = [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0] - ] - inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.row_tol) - cols = self._join_columns(cols, text_x_min, text_x_max) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) return cols, rows diff --git a/tests/files/blank.pdf b/tests/files/blank.pdf deleted file mode 100755 index 99540f1ed9ad31edd2a29ea7d5ab3ed081243dd4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4909 zcmai2cTiJX_XR{ss3M>uNDwJf0|E(EO6a{el^W?iNRc8^rAaRmdJ`oCl#T>}&=Cl| z7wI5II#NG;{`}2*GjG1{k2`bj-RJIg&zZIES$nf-K^1rf_=Ev$#f#h906`!K=wfLP zkdOfIYgyR9Jb*%%PjmqMdOof&Aitai+``et=66iZ!V>1_0Tli{K^+FSu)YifMSkBK zz}!9TT%3XY0<41a!T>2L0L4*@H%#T!|qU325W93W(&Jd_U1DO0IX4c)mzi#YsI^Ww5%?0uy2wf?k|A_%n z7hhkLQbH8ehA5tI$+yh(yuRh*bC*SUzz>;&xS6$h((WU1Gnx&C-P!{nKM3)&GO0%^ z7Djefr}h+9?eVhQsJh_Z-CTWqPMo?>!!1BjE z6%;c7;l?5@nl7?7`Pd(i^z1G5=!&(aU8%!lY(HGX-%YxjC7L%bFoV5yAeTz$0Un;2 zd=~zkE@}NKx-m$;Z4?roThSitNjsw?!%ZNF2I}-y$m%}f$|?e42iym`B>PK@QnEV^ zDC>*OR6g&k>6QWy7;S9D(Cg$j&eWoy$(1fbMZqRgpmoT+pLsn%gdq z>-9Iv(Oc;O35(6%yY+HqW}YQx-{#JMVcHbG$p4$Ge`Fx~HyQjlq6-KK{gdeW(E^k$ zAaZ%H?5}3Pl1!}{V;PcHbS@G1Y1GQC=Z_(@bo%iK)Zw>>2)>t!`Hfif6kxfyGQr6% z3dEjer9GY)CZ{{Ba9V{#N3ePvB&y^p@|A-h2S);J_XJ`AhSz0PNUeHf)8ZQvKjRjqeVOu zp*D#!2|kUi()d*w=v7*IXvn){!BHo;f~I^XUzrt>a@2og@X9`DV+~SCKNg#L1EQFz zVN*1OW5ykd9(v($GjolHnY(y79EI~fOA?8e~az}`}mAwP6sU6%x z_!K6yhiPBm+$MCZ7OJq%YFzbO5uL;UVhn;l-nYq=%?EAUAq>ZEB|}(CAYLOS$@>tq zwu!>-x7v%R)3ookyY2C%*V!9n?(=s{P}$T=aj3rBCBW;1kBC1yro5d*K{Ql&Z=_zK z(L~?*^YrqaG)^JCXtw*C?~{mvL2~Sdy;$$>0fuY7ONrk7q&*ZhKW3@=ye?~bgu`(#x;c= zj@8y$q3}`T#N~WDcsn&@c*}!$I+YZ2L9=_j$qB@QKZNHNW>2U4NskY87o>*r4Uli5 z60Mib-Sg4T0sVW%1Ozz*?DP2C=2VCvacY7x za&S4RAc7%_JC4K+&FJ!*u|c}?jNYWBFCFIDYp*{crfSarA$0`uBaosp3+7k||G`5* z&CsIAc#Y+*!aXH1orokB-vYXI1)_Lu>xb^-qEU}z*y9yD$&w=6waMx77HnzWM0H~s zD(SZ5!?BOoEcS$auV%{H42%&HHK)wWldvk0Mv-vIO6Ep^l*W~hB3#2fO_5Hu`5DLbin91TwwQuoy; zif8IPUm|p8F;@K!q)Ir9d&p+SKF8XWSI$|M1X2-`m$56@({AVN;Jk5vD9KPyL|-~4 zXGM$q?ba~MM!?3c4X2Ik3^6NEvO?J4aOH9(%OuV=d#K2ke~q!87(0z1}D;k$USqxrz!uG@T`MX-8PyvCXoXHJ7qHfA#Yo^FqUR z;VD-D^9Ari?4mFWsI(DDgBD%LW&s0n9fuv zASICxkekRF(-B?HStsMDaOh=JF+R?$)^({Qvv4Bo1cyL&GKP2Y--xRC(W=v zNBdX$hn~GSD?XrNtfh>nv=7G*?`J%ZOt%s+q7wq zLr_9y+=|4aGJ6oj2u-p6T1ag{U1jYz&$@LLH@b$D2Ipszo5#KC^C`IMZl~!j^k*F) zoe=2(z`5bUbpEu%wBiiPms%2YC;5)4BziAIGFYNVQc;56xA{r)6MGc$N$Mxk+1yzZ zoHp*ZY&}T9h+~=?v9oYea11|+K9M-^+~(bznSVZdSY;MxmOPg>$9qjNiD!eF>AL?a zZk_*-dAo_>Gl>$3Eg6L7hf|JWj-adQe0FfHjGfH;ml{N8tdM&~%%2_}$wa6sDR(|+ z0b?ZG!fFjqeK=W!L@Igr76)!vG+Ts~2U`SE-3T)b!;v;MNta94nL9SmkQ9>`V`3w^ zlz2Ye+d1p(y5ykou^YKEy;N-y2F`ZNls{bSb7}0J1^I-GI(gN2<%4gJX@9QdEJBS&wjMjU^|hwdv)~c#!J@Ff|gG$ znV}z}7H_ziNIa(18`g*WzcN>DP4G}frW){`6{H#Sa7*dQa(3}g=eOT6Wqhhl}C3alDL}$*0l4)7-}3i;jbTJpT8mFqxaMNCu(1k?;e+mjhn`2CdtQD`cLUohG#dJrGrbJi%ciGbqlLKo3NF8q zgON4+{f$?`_zu;jJ~PO*^Slm}P5Em06Cd0S2YH9^(ev6e#~G-Z>s<9dW`A;j3=DM| zAL;+@veDEa;Iz1a@0jy^{zSbkqmP1!(s@p8#&KFgH0f9quK?flJgqkUUV2>m=^NWO zBN?(8CNONZ0rp z4M2yWFPeOt_CLFR{?=^Myx3yaGSzC-I*ifA^tGwBb+#+Cw|2;MG<8aLe(DnIs_TYy z*Yt?=RQC$^R(%ovQq?EYSKSZpuN@E_sK-iR(Sr{Mn}?u7?Ze8$JtG<;*irq_@v%o^ z^W)azYZHzW+mp{GPo`c>U!9Jep`J;ZWt`2MyFZ7T7nyHdfG+fY)%`lPXtlVxEeQb9Pcn> z_XO~7uJdoE^LJUPX=$$ucLMNh00l1Xrh=Wj2OJ0y`tN0>z%N^_^H0N_tY_*n38Fpc zUPb1R`n4*jy;FIlGUiknk+YS%+^kCOt!lXh2|YiVSQ+%(WLGyx@HmU@_lj-7lcTc5 zytvXzXlb4#O(2kdDgeGN9#&GtptrvSj5xhu(Sh5dguZ68NgWX?IzR%Ec!Zw zwW4i`3K>rgAdgxCWQ zp#Xo)V&JJ{MAbFak6LR9m1LEu(E2t9*DK>g>zRVs=QQmcOq%t)Iv|P9K zHd}CL#mopDy8Zz`?UuO;GG1f~uYRa#uMM%aiQf=j;qaAX1!rvkZ)*5cUW#1SQUBL? z5d{5BBD!|IFrc6SfM419sSEJIU##cXadCkIf3<)v8$QZVps55zP8jsp1rZj63PVK{ zpdc^^EFdVLAPN-{fIN@_{_iP||1^(`bYM?^;aS7Hfu^8K@XLrH&{W_OQSfpIRRaou zg#Wud!4Li^O}5@CiWfjUIjrgD8Z0Oy<7hQPW1gBaNH0KRsNlX%fLpl3z1=UrSV$NG2C%WoYbpT# E1CE0==>Px# diff --git a/tests/files/empty.pdf b/tests/files/empty.pdf new file mode 100644 index 0000000000000000000000000000000000000000..52aeefba1a435332221bb12f6ff24c02aa8bbd75 GIT binary patch literal 2353 zcmb_ePjBl)5Pugq_|BQ-p~^+?IlJpWNo*yq^5UjcG`uJYqEb~P$6lus7`z!4-k@IjEc@Bx^0oImZW9w611ZfAcp^P8ESnH-J0fkqoG<>;3`zxiIV z5J7zQwbJR}3zp8s4Qhg9j0bTdSdQ=@@dfi(z(eM`N+ELQFO~Amiz)fTn389`~c+3jE$^%va<5A9o2@_WsVglf4Db6MW_@Z`&1;V(`Qvuoq zl7WsD5Y-f+g9h`1NRrsO#1oWf*mF#w|svjrRdatMz4-!>v07PgXM_vwAta*cO!< zrs=wMSvn_W<_8NV(2ULFw5$I0)6Xi3!>;R2u zg7ah%3#2#f2GywX5=FVcWNV%;5N$M7FqwyuGxi3xLHMt$-UyL77_Zms#@cA`e2%GY z+nDHB*EK+B#ceA5QcH`E!CyrrzTBy$MWf=>2>24mmzXvP-a7IioIK7%%u^&C`!l{0 zU3ImJLx+%As8bSXG@^z^Y1GsRd|GImGmEy3b~LlM?y>`#_x!;HI5Z#+aB|@FhO2<( zUG;2qfzEDtAvXSz-O!sVuET#SR#2PAvj7FH)6fBoGvr|$-FnbQu1<8Kk+!B=Q(bpV z(=l2vh~*fhgC8KF_@LT{=>XH>Kd2rc4^(+Rg(u;DkuFlcPJ3CF#DOo%vGG?a)tC;J z6*|bxQ)kQ1hQoK~Yq^vUTH+j!2dgEFnRD;|JeQ}ga2V_eXRAD^dLs-lOPJ(VK#?gh z4g)9R`O=4%?e8;;;YF%$2QEb{66RK&*Gt6aki@c+oWxJ(UA5Fg7Wu10s8Fd29d4?T z#O~%$>EXStM`~4xptGTDcsDm(1wGOCbcXiB5{7bDUCU7jr~4aD-_PQ0S06vt)StjY zl05=XT6W2?=g1Z#ZQas!)2zBnHHZo@=)!j*b~~cXS)>p|i6ZCG=0i=(GAtCKt<2D2 zC&>?`TN!P)V4>WX5xog(?}3aOZ|M?3VVyq2Jt8yOZ~HKHSZfb-P3r(Baz9R34nIi{ zp2T+yVncY$IV@qh0TDh=BaURs;pf1)hpsxhXlz#xJrk;WT literal 0 HcmV?d00001 diff --git a/tests/files/only_page_number.pdf b/tests/files/only_page_number.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7b4ecfe4d797cc1fc19c22ca4f2d5eaf92c13cf0 GIT binary patch literal 2437 zcmc&#O^6&t6c!>vI${obh)|*Ij@E{nE;z2J_@De3a6yx{0r+1sxLvA)as;lb#e($TQ zvH8|q!<|&l*bhH`ebw<8XUWp@tnG2b{!bA?GYL`LWh(HCT#V~6?F%M^%OsbD&>o5K zIOZCoVA9vOox9Mmg;aiuQA_3rig_krp28?4(}=Bn=y?It(q&r6GL{P+!I^^i9`aL~ zX)T!N8m5hey)0~$kGaCSHt@}{AHbhs3b7@yrkrW4@(=;)ruYJJ5vv$cr{tI$x0nkN zFd^Xwap~U#rM@T7JJ^v(6)*{3QkHWCdq5eM|e{2>A%Dk`mWD3 z@~-yD&gHTML?Q_vg%z7MFv*yNj0+h#52y&N8;oUguqQn#5RM6$4FX`TN~Ri1Jv2!o zVNPI+(iQ|D9f{^qPKBNXu+S3$^bo@X&;?1LC(tYqArw|)~X-`k)jf+n}|aVq~`yZ7?6qxK|bnIkbwv>l8OK&8A_kk zKiPTE+tjJZF(R5L7abdS=_YXW=t^*=rkv*S+^#&qeB(3^=iwmg9Eke^%ed1#@zSbg z&ABAbTQ=*Y(Q2NgPBA|gu3%>t+DmEVG!JAQ8|Tb;Fq#{L{S3#vRGgVf)>*sZ;bb@H zD7Zp#7HGBTz4RV_T!$FwamPNjZeZ{Q2S`{^{dqzdXI`%=V9O+}iocFQ?D#`u*I6_V=Iv`tY`uy_bgfzj?g> z+Nlpu|8#lZ_P5#eo-=!2o)I6sbLykc>o?x`?83_VzjvIS82<6>zYfnFxZ zHt?7A;udU{tfd{Bv4TbgBIO@WV1X3MYtE-h=eW&TyNOqcHBZ>}e39xNv1>l;1CseL zrQW+GzO%p<(~J?8!277}MqxoDjs=55uU%0%srXEW1+j_Gk4D{$wJFsF(5xgeg~`CV zWR1eqDdpEqOXb-R=OF1*TCf^rRcmojH#wSgTWBsDZ|xUcaPA8{12W)dANTf1zGnha z(cwHw;@i?oQZ~NY?d2~pF^Oxi!G^=(WU(;WNmk$ufOgk9VAXh(1I)&1xHd={F5Zii zOZ9?rSgy2d%)sI7Y3`&SHC;R)HQ=Sz@*zRg@*A6NCpTAqf{z@|>Yz{7DbVbmtD z=WvU)r-t`a^EjoXt#^h`%iPx(tcFkErJCyme?@3HPI`Ii2<+gl2! j!KH;H$r%xbE|ut%CH#sb9Hx1JsNlYI#>NglHs|~U