From 55fd4596348967f37c7fb21145aeda0cce8b6996 Mon Sep 17 00:00:00 2001 From: Frh Date: Wed, 29 Apr 2020 12:31:02 -0700 Subject: [PATCH] Minor linting --- camelot/io.py | 17 ++--- camelot/parsers/hybrid.py | 156 +++++++++++++++++++------------------- camelot/utils.py | 10 +-- 3 files changed, 91 insertions(+), 92 deletions(-) diff --git a/camelot/io.py b/camelot/io.py index 58ec530..f688178 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -7,15 +7,14 @@ from .utils import validate_input, remove_extra def read_pdf( - filepath, - pages="1", - password=None, - flavor="lattice", - suppress_stdout=False, - layout_kwargs=None, - debug=False, - **kwargs -): + filepath, + pages="1", + password=None, + flavor="lattice", + suppress_stdout=False, + layout_kwargs=None, + debug=False, + **kwargs): """Read PDF and return extracted tables. Note: kwargs annotated with ^ can only be used with flavor='stream' diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 94ba99b..2488ed3 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -47,39 +47,39 @@ def column_spread(left, right, col_anchors): def find_closest_tls(bbox, tls): """ Search for tls that are the closest but outside in all 4 directions """ - closest = { - "left": None, - "right": None, - "top": None, - "bottom": None, - } + left, right, top, bottom = None, None, None, None (bbox_left, bbox_bottom, bbox_right, bbox_top) = bbox - for tl in tls: - if tl.x1 < bbox_left: + for textline in tls: + if textline.x1 < bbox_left: # Left: check it overlaps horizontally - if tl.y0 > bbox_top or tl.y1 < bbox_bottom: + if textline.y0 > bbox_top or textline.y1 < bbox_bottom: continue - if closest["left"] is None or closest["left"].x1 < tl.x1: - closest["left"] = tl - elif bbox_right < tl.x0: + if left is None or left.x1 < textline.x1: + left = textline + elif bbox_right < textline.x0: # Right: check it overlaps horizontally - if tl.y0 > bbox_top or tl.y1 < bbox_bottom: + if textline.y0 > bbox_top or textline.y1 < bbox_bottom: continue - if closest["right"] is None or closest["right"].x0 > tl.x0: - closest["right"] = tl + if right is None or right.x0 > textline.x0: + right = textline else: # Either bottom or top: must overlap vertically - if tl.x0 > bbox_right or tl.x1 < bbox_left: + if textline.x0 > bbox_right or textline.x1 < bbox_left: continue - elif tl.y1 < bbox_bottom: + if textline.y1 < bbox_bottom: # Bottom - if closest["bottom"] is None or closest["bottom"].y1 < tl.y1: - closest["bottom"] = tl - elif bbox_top < tl.y0: + if bottom is None or bottom.y1 < textline.y1: + bottom = textline + elif bbox_top < textline.y0: # Top - if closest["top"] is None or closest["top"].y0 > tl.y0: - closest["top"] = tl - return closest + if top is None or top.y0 > textline.y0: + top = textline + return { + "left": left, + "right": right, + "top": top, + "bottom": bottom, + } def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): @@ -103,16 +103,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): # It will be the anchor for a possible new row. closest_above = None all_above = [] - for te in textlines: + for textline in textlines: # higher than the table, >50% within its bounds - te_center = 0.5 * (te.x0 + te.x1) - if te.y0 > top and left < te_center < right: - all_above.append(te) - if closest_above is None or closest_above.y0 > te.y0: - closest_above = te + textline_center = 0.5 * (textline.x0 + textline.x1) + if textline.y0 > top and left < textline_center < right: + all_above.append(textline) + if closest_above is None or closest_above.y0 > textline.y0: + closest_above = textline - if closest_above and \ - closest_above.y0 < top + max_v_gap: + if closest_above and closest_above.y0 < top + max_v_gap: # b/ We have a candidate cell that is within the correct # vertical band, and directly above the table. Starting from # this anchor, we list all the textlines within the same row. @@ -124,23 +123,27 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): # Iterate and extract elements that fit in the row # from our list for i in range(len(all_above) - 1, -1, -1): - te = all_above[i] - if te.y0 < top: + textline = all_above[i] + if textline.y0 < top: # The bottom of this element is within our row # so we add it. - tls_in_new_row.append(te) + tls_in_new_row.append(textline) all_above.pop(i) - if te.y1 > top: + if textline.y1 > top: # If the top of this element raises our row's # band, we'll need to keep on searching for # overlapping items - top = te.y1 + top = textline.y1 pushed_up = True # Get the x-ranges for all the textlines, and merge the # x-ranges that overlap - zones = zones + \ - list(map(lambda tl: [tl.x0, tl.x1], tls_in_new_row)) + zones = zones + list( + map( + lambda textline: [textline.x0, textline.x1], + tls_in_new_row + ) + ) zones.sort(key=lambda z: z[0]) # Sort by left coordinate # Starting from the right, if two zones overlap horizontally, # merge them @@ -277,9 +280,9 @@ class TextNetworks(TextAlignments): identify alignments. """ # Identify all the alignments - for tl in textlines: - if len(tl.get_text().strip()) > 0: - self._register_textline(tl) + for textline in textlines: + if len(textline.get_text().strip()) > 0: + self._register_textline(textline) def _compute_alignment_counts(self): """Build a dictionary textline -> alignment object. @@ -294,7 +297,7 @@ class TextNetworks(TextAlignments): self._textline_to_alignments[textline] = alignments alignments[align_id] = textedge.textlines - def _remove_unconnected_edges(self): + def remove_unconnected_edges(self): """Weed out elements which are only connected to others vertically or horizontally. There needs to be connections across both dimensions. @@ -302,16 +305,16 @@ class TextNetworks(TextAlignments): removed_singletons = True while removed_singletons: removed_singletons = False - for textalignments in self._text_alignments.values(): + for text_alignments in self._text_alignments.values(): # For each alignment edge, remove items if they are singletons # either horizontally or vertically - for ta in textalignments: - for i in range(len(ta.textlines) - 1, -1, -1): - tl = ta.textlines[i] - alignments = self._textline_to_alignments[tl] + for text_alignment in text_alignments: + for i in range(len(text_alignment.textlines) - 1, -1, -1): + textline = text_alignment.textlines[i] + alignments = self._textline_to_alignments[textline] if alignments.max_h_count() <= 1 or \ alignments.max_v_count() <= 1: - del ta.textlines[i] + del text_alignment.textlines[i] removed_singletons = True self._textline_to_alignments = {} self._compute_alignment_counts() @@ -335,7 +338,7 @@ class TextNetworks(TextAlignments): default=None ) - def _compute_plausible_gaps(self): + def compute_plausible_gaps(self): """ Evaluate plausible gaps between cells horizontally and vertically based on the textlines aligned with the most connected textline. @@ -363,12 +366,12 @@ class TextNetworks(TextAlignments): h_textlines = sorted( ref_h_textlines, - key=lambda tl: tl.x0, + key=lambda textline: textline.x0, reverse=True ) v_textlines = sorted( ref_v_textlines, - key=lambda tl: tl.y0, + key=lambda textline: textline.y0, reverse=True ) @@ -387,7 +390,7 @@ class TextNetworks(TextAlignments): ) return gaps_hv - def _build_bbox_candidate(self, gaps_hv, parse_details=None): + def search_table_body(self, gaps_hv, parse_details=None): """ Build a candidate bbox for the body of a table using hybrid algo Seed the process with the textline with the highest alignment @@ -445,27 +448,27 @@ class TextNetworks(TextAlignments): last_bbox = bbox cand_bbox = last_bbox.copy() closest_tls = find_closest_tls(bbox, tls_search_space) - for direction, tl in closest_tls.items(): - if tl is None: + for direction, textline in closest_tls.items(): + if textline is None: continue expanded_cand_bbox = cand_bbox.copy() if direction == "left": - if expanded_cand_bbox[0] - tl.x1 > gaps_hv[0]: + if expanded_cand_bbox[0] - textline.x1 > gaps_hv[0]: continue - expanded_cand_bbox[0] = tl.x0 + expanded_cand_bbox[0] = textline.x0 elif direction == "right": - if tl.x0 - expanded_cand_bbox[2] > gaps_hv[0]: + if textline.x0 - expanded_cand_bbox[2] > gaps_hv[0]: continue - expanded_cand_bbox[2] = tl.x1 + expanded_cand_bbox[2] = textline.x1 elif direction == "bottom": - if expanded_cand_bbox[1] - tl.y1 > gaps_hv[1]: + if expanded_cand_bbox[1] - textline.y1 > gaps_hv[1]: continue - expanded_cand_bbox[1] = tl.y0 + expanded_cand_bbox[1] = textline.y0 elif direction == "top": - if tl.y0 - expanded_cand_bbox[3] > gaps_hv[1]: + if textline.y0 - expanded_cand_bbox[3] > gaps_hv[1]: continue - expanded_cand_bbox[3] = tl.y1 + expanded_cand_bbox[3] = textline.y1 # If they are, see what an expanded bbox in that direction # would contain @@ -477,9 +480,9 @@ class TextNetworks(TextAlignments): # This happens when text covers multiple rows - that's only # allowed in the header, treated separately. cols_cand = find_columns_coordinates(tls_in_new_box) - if direction in ["bottom", "top"]: - if len(cols_cand) < len(last_cols_cand): - continue + if direction in ["bottom", "top"] and \ + len(cols_cand) < len(last_cols_cand): + continue # We have an expansion candidate: register it, update the # search space and repeat @@ -489,8 +492,8 @@ class TextNetworks(TextAlignments): last_cols_cand = cols_cand tls_in_bbox.extend(new_tls) for i in range(len(tls_search_space) - 1, -1, -1): - tl = tls_search_space[i] - if tl in new_tls: + textline = tls_search_space[i] + if textline in new_tls: del tls_search_space[i] if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE: @@ -595,6 +598,7 @@ class Hybrid(TextBaseParser): parse_details_network_searches parse_details_bbox_searches = [] self.parse_details["bbox_searches"] = parse_details_bbox_searches + self.parse_details["col_searches"] = [] else: parse_details_network_searches = None parse_details_bbox_searches = None @@ -611,8 +615,8 @@ class Hybrid(TextBaseParser): else: text_network = TextNetworks() text_network.generate(textlines) - text_network._remove_unconnected_edges() - gaps_hv = text_network._compute_plausible_gaps() + text_network.remove_unconnected_edges() + gaps_hv = text_network.compute_plausible_gaps() if gaps_hv is None: return None # edge_tol instructions override the calculated vertical gap @@ -620,7 +624,7 @@ class Hybrid(TextBaseParser): gaps_hv[0], gaps_hv[1] if self.edge_tol is None else self.edge_tol ) - bbox_body = text_network._build_bbox_candidate( + bbox_body = text_network.search_table_body( edge_tol_hv, parse_details=parse_details_bbox_searches ) @@ -664,15 +668,13 @@ class Hybrid(TextBaseParser): self.table_bbox[bbox_full] = table_parse if self.parse_details is not None: - if "col_searches" not in self.parse_details: - self.parse_details["col_searches"] = [] self.parse_details["col_searches"].append(table_parse) # Remember what textlines we processed, and repeat - for tl in tls_in_bbox: - textlines_processed[tl] = None + for textline in tls_in_bbox: + textlines_processed[textline] = None textlines = list(filter( - lambda tl: tl not in textlines_processed, + lambda textline: textline not in textlines_processed, textlines )) @@ -687,10 +689,10 @@ class Hybrid(TextBaseParser): all_tls = list( sorted( filter( - lambda tl: len(tl.get_text().strip()) > 0, + lambda textline: len(textline.get_text().strip()) > 0, self.t_bbox["horizontal"] + self.t_bbox["vertical"] ), - key=lambda tl: (-tl.y0, tl.x0) + key=lambda textline: (-textline.y0, textline.x0) ) ) text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( diff --git a/camelot/utils.py b/camelot/utils.py index 5dcba39..1432259 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -640,12 +640,12 @@ def get_index_closest_point(point, sorted_list, fn=lambda x: x): if mid_val > point: if mid > 0 and ( - point - fn(sorted_list[mid-1]) < + point - fn(sorted_list[mid-1]) < mid_val - point): return mid-1 elif mid_val < point: if mid < n - 1 and ( - fn(sorted_list[mid+1]) - point < + fn(sorted_list[mid+1]) - point < point - mid_val): return mid+1 return mid @@ -844,10 +844,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): col = table.cols[c] for cut in y_cuts: if isinstance(obj, LTChar): - if ( - col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] - and (obj.y0 + obj.y1) / 2 >= cut[1] - ): + if col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] \ + and (obj.y0 + obj.y1) / 2 >= cut[1]: cut_text.append((cut[0], c, obj)) break else: