Minor linting
parent
ada4809a59
commit
55fd459634
|
|
@ -14,8 +14,7 @@ def read_pdf(
|
||||||
suppress_stdout=False,
|
suppress_stdout=False,
|
||||||
layout_kwargs=None,
|
layout_kwargs=None,
|
||||||
debug=False,
|
debug=False,
|
||||||
**kwargs
|
**kwargs):
|
||||||
):
|
|
||||||
"""Read PDF and return extracted tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
|
|
|
||||||
|
|
@ -47,39 +47,39 @@ def column_spread(left, right, col_anchors):
|
||||||
def find_closest_tls(bbox, tls):
|
def find_closest_tls(bbox, tls):
|
||||||
""" Search for tls that are the closest but outside in all 4 directions
|
""" Search for tls that are the closest but outside in all 4 directions
|
||||||
"""
|
"""
|
||||||
closest = {
|
left, right, top, bottom = None, None, None, None
|
||||||
"left": None,
|
|
||||||
"right": None,
|
|
||||||
"top": None,
|
|
||||||
"bottom": None,
|
|
||||||
}
|
|
||||||
(bbox_left, bbox_bottom, bbox_right, bbox_top) = bbox
|
(bbox_left, bbox_bottom, bbox_right, bbox_top) = bbox
|
||||||
for tl in tls:
|
for textline in tls:
|
||||||
if tl.x1 < bbox_left:
|
if textline.x1 < bbox_left:
|
||||||
# Left: check it overlaps horizontally
|
# Left: check it overlaps horizontally
|
||||||
if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
|
if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
|
||||||
continue
|
continue
|
||||||
if closest["left"] is None or closest["left"].x1 < tl.x1:
|
if left is None or left.x1 < textline.x1:
|
||||||
closest["left"] = tl
|
left = textline
|
||||||
elif bbox_right < tl.x0:
|
elif bbox_right < textline.x0:
|
||||||
# Right: check it overlaps horizontally
|
# Right: check it overlaps horizontally
|
||||||
if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
|
if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
|
||||||
continue
|
continue
|
||||||
if closest["right"] is None or closest["right"].x0 > tl.x0:
|
if right is None or right.x0 > textline.x0:
|
||||||
closest["right"] = tl
|
right = textline
|
||||||
else:
|
else:
|
||||||
# Either bottom or top: must overlap vertically
|
# Either bottom or top: must overlap vertically
|
||||||
if tl.x0 > bbox_right or tl.x1 < bbox_left:
|
if textline.x0 > bbox_right or textline.x1 < bbox_left:
|
||||||
continue
|
continue
|
||||||
elif tl.y1 < bbox_bottom:
|
if textline.y1 < bbox_bottom:
|
||||||
# Bottom
|
# Bottom
|
||||||
if closest["bottom"] is None or closest["bottom"].y1 < tl.y1:
|
if bottom is None or bottom.y1 < textline.y1:
|
||||||
closest["bottom"] = tl
|
bottom = textline
|
||||||
elif bbox_top < tl.y0:
|
elif bbox_top < textline.y0:
|
||||||
# Top
|
# Top
|
||||||
if closest["top"] is None or closest["top"].y0 > tl.y0:
|
if top is None or top.y0 > textline.y0:
|
||||||
closest["top"] = tl
|
top = textline
|
||||||
return closest
|
return {
|
||||||
|
"left": left,
|
||||||
|
"right": right,
|
||||||
|
"top": top,
|
||||||
|
"bottom": bottom,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
|
|
@ -103,16 +103,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
# It will be the anchor for a possible new row.
|
# It will be the anchor for a possible new row.
|
||||||
closest_above = None
|
closest_above = None
|
||||||
all_above = []
|
all_above = []
|
||||||
for te in textlines:
|
for textline in textlines:
|
||||||
# higher than the table, >50% within its bounds
|
# higher than the table, >50% within its bounds
|
||||||
te_center = 0.5 * (te.x0 + te.x1)
|
textline_center = 0.5 * (textline.x0 + textline.x1)
|
||||||
if te.y0 > top and left < te_center < right:
|
if textline.y0 > top and left < textline_center < right:
|
||||||
all_above.append(te)
|
all_above.append(textline)
|
||||||
if closest_above is None or closest_above.y0 > te.y0:
|
if closest_above is None or closest_above.y0 > textline.y0:
|
||||||
closest_above = te
|
closest_above = textline
|
||||||
|
|
||||||
if closest_above and \
|
if closest_above and closest_above.y0 < top + max_v_gap:
|
||||||
closest_above.y0 < top + max_v_gap:
|
|
||||||
# b/ We have a candidate cell that is within the correct
|
# b/ We have a candidate cell that is within the correct
|
||||||
# vertical band, and directly above the table. Starting from
|
# vertical band, and directly above the table. Starting from
|
||||||
# this anchor, we list all the textlines within the same row.
|
# this anchor, we list all the textlines within the same row.
|
||||||
|
|
@ -124,23 +123,27 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
# Iterate and extract elements that fit in the row
|
# Iterate and extract elements that fit in the row
|
||||||
# from our list
|
# from our list
|
||||||
for i in range(len(all_above) - 1, -1, -1):
|
for i in range(len(all_above) - 1, -1, -1):
|
||||||
te = all_above[i]
|
textline = all_above[i]
|
||||||
if te.y0 < top:
|
if textline.y0 < top:
|
||||||
# The bottom of this element is within our row
|
# The bottom of this element is within our row
|
||||||
# so we add it.
|
# so we add it.
|
||||||
tls_in_new_row.append(te)
|
tls_in_new_row.append(textline)
|
||||||
all_above.pop(i)
|
all_above.pop(i)
|
||||||
if te.y1 > top:
|
if textline.y1 > top:
|
||||||
# If the top of this element raises our row's
|
# If the top of this element raises our row's
|
||||||
# band, we'll need to keep on searching for
|
# band, we'll need to keep on searching for
|
||||||
# overlapping items
|
# overlapping items
|
||||||
top = te.y1
|
top = textline.y1
|
||||||
pushed_up = True
|
pushed_up = True
|
||||||
|
|
||||||
# Get the x-ranges for all the textlines, and merge the
|
# Get the x-ranges for all the textlines, and merge the
|
||||||
# x-ranges that overlap
|
# x-ranges that overlap
|
||||||
zones = zones + \
|
zones = zones + list(
|
||||||
list(map(lambda tl: [tl.x0, tl.x1], tls_in_new_row))
|
map(
|
||||||
|
lambda textline: [textline.x0, textline.x1],
|
||||||
|
tls_in_new_row
|
||||||
|
)
|
||||||
|
)
|
||||||
zones.sort(key=lambda z: z[0]) # Sort by left coordinate
|
zones.sort(key=lambda z: z[0]) # Sort by left coordinate
|
||||||
# Starting from the right, if two zones overlap horizontally,
|
# Starting from the right, if two zones overlap horizontally,
|
||||||
# merge them
|
# merge them
|
||||||
|
|
@ -277,9 +280,9 @@ class TextNetworks(TextAlignments):
|
||||||
identify alignments.
|
identify alignments.
|
||||||
"""
|
"""
|
||||||
# Identify all the alignments
|
# Identify all the alignments
|
||||||
for tl in textlines:
|
for textline in textlines:
|
||||||
if len(tl.get_text().strip()) > 0:
|
if len(textline.get_text().strip()) > 0:
|
||||||
self._register_textline(tl)
|
self._register_textline(textline)
|
||||||
|
|
||||||
def _compute_alignment_counts(self):
|
def _compute_alignment_counts(self):
|
||||||
"""Build a dictionary textline -> alignment object.
|
"""Build a dictionary textline -> alignment object.
|
||||||
|
|
@ -294,7 +297,7 @@ class TextNetworks(TextAlignments):
|
||||||
self._textline_to_alignments[textline] = alignments
|
self._textline_to_alignments[textline] = alignments
|
||||||
alignments[align_id] = textedge.textlines
|
alignments[align_id] = textedge.textlines
|
||||||
|
|
||||||
def _remove_unconnected_edges(self):
|
def remove_unconnected_edges(self):
|
||||||
"""Weed out elements which are only connected to others vertically
|
"""Weed out elements which are only connected to others vertically
|
||||||
or horizontally. There needs to be connections across both
|
or horizontally. There needs to be connections across both
|
||||||
dimensions.
|
dimensions.
|
||||||
|
|
@ -302,16 +305,16 @@ class TextNetworks(TextAlignments):
|
||||||
removed_singletons = True
|
removed_singletons = True
|
||||||
while removed_singletons:
|
while removed_singletons:
|
||||||
removed_singletons = False
|
removed_singletons = False
|
||||||
for textalignments in self._text_alignments.values():
|
for text_alignments in self._text_alignments.values():
|
||||||
# For each alignment edge, remove items if they are singletons
|
# For each alignment edge, remove items if they are singletons
|
||||||
# either horizontally or vertically
|
# either horizontally or vertically
|
||||||
for ta in textalignments:
|
for text_alignment in text_alignments:
|
||||||
for i in range(len(ta.textlines) - 1, -1, -1):
|
for i in range(len(text_alignment.textlines) - 1, -1, -1):
|
||||||
tl = ta.textlines[i]
|
textline = text_alignment.textlines[i]
|
||||||
alignments = self._textline_to_alignments[tl]
|
alignments = self._textline_to_alignments[textline]
|
||||||
if alignments.max_h_count() <= 1 or \
|
if alignments.max_h_count() <= 1 or \
|
||||||
alignments.max_v_count() <= 1:
|
alignments.max_v_count() <= 1:
|
||||||
del ta.textlines[i]
|
del text_alignment.textlines[i]
|
||||||
removed_singletons = True
|
removed_singletons = True
|
||||||
self._textline_to_alignments = {}
|
self._textline_to_alignments = {}
|
||||||
self._compute_alignment_counts()
|
self._compute_alignment_counts()
|
||||||
|
|
@ -335,7 +338,7 @@ class TextNetworks(TextAlignments):
|
||||||
default=None
|
default=None
|
||||||
)
|
)
|
||||||
|
|
||||||
def _compute_plausible_gaps(self):
|
def compute_plausible_gaps(self):
|
||||||
""" Evaluate plausible gaps between cells horizontally and vertically
|
""" Evaluate plausible gaps between cells horizontally and vertically
|
||||||
based on the textlines aligned with the most connected textline.
|
based on the textlines aligned with the most connected textline.
|
||||||
|
|
||||||
|
|
@ -363,12 +366,12 @@ class TextNetworks(TextAlignments):
|
||||||
|
|
||||||
h_textlines = sorted(
|
h_textlines = sorted(
|
||||||
ref_h_textlines,
|
ref_h_textlines,
|
||||||
key=lambda tl: tl.x0,
|
key=lambda textline: textline.x0,
|
||||||
reverse=True
|
reverse=True
|
||||||
)
|
)
|
||||||
v_textlines = sorted(
|
v_textlines = sorted(
|
||||||
ref_v_textlines,
|
ref_v_textlines,
|
||||||
key=lambda tl: tl.y0,
|
key=lambda textline: textline.y0,
|
||||||
reverse=True
|
reverse=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -387,7 +390,7 @@ class TextNetworks(TextAlignments):
|
||||||
)
|
)
|
||||||
return gaps_hv
|
return gaps_hv
|
||||||
|
|
||||||
def _build_bbox_candidate(self, gaps_hv, parse_details=None):
|
def search_table_body(self, gaps_hv, parse_details=None):
|
||||||
""" Build a candidate bbox for the body of a table using hybrid algo
|
""" Build a candidate bbox for the body of a table using hybrid algo
|
||||||
|
|
||||||
Seed the process with the textline with the highest alignment
|
Seed the process with the textline with the highest alignment
|
||||||
|
|
@ -445,27 +448,27 @@ class TextNetworks(TextAlignments):
|
||||||
last_bbox = bbox
|
last_bbox = bbox
|
||||||
cand_bbox = last_bbox.copy()
|
cand_bbox = last_bbox.copy()
|
||||||
closest_tls = find_closest_tls(bbox, tls_search_space)
|
closest_tls = find_closest_tls(bbox, tls_search_space)
|
||||||
for direction, tl in closest_tls.items():
|
for direction, textline in closest_tls.items():
|
||||||
if tl is None:
|
if textline is None:
|
||||||
continue
|
continue
|
||||||
expanded_cand_bbox = cand_bbox.copy()
|
expanded_cand_bbox = cand_bbox.copy()
|
||||||
|
|
||||||
if direction == "left":
|
if direction == "left":
|
||||||
if expanded_cand_bbox[0] - tl.x1 > gaps_hv[0]:
|
if expanded_cand_bbox[0] - textline.x1 > gaps_hv[0]:
|
||||||
continue
|
continue
|
||||||
expanded_cand_bbox[0] = tl.x0
|
expanded_cand_bbox[0] = textline.x0
|
||||||
elif direction == "right":
|
elif direction == "right":
|
||||||
if tl.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
|
if textline.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
|
||||||
continue
|
continue
|
||||||
expanded_cand_bbox[2] = tl.x1
|
expanded_cand_bbox[2] = textline.x1
|
||||||
elif direction == "bottom":
|
elif direction == "bottom":
|
||||||
if expanded_cand_bbox[1] - tl.y1 > gaps_hv[1]:
|
if expanded_cand_bbox[1] - textline.y1 > gaps_hv[1]:
|
||||||
continue
|
continue
|
||||||
expanded_cand_bbox[1] = tl.y0
|
expanded_cand_bbox[1] = textline.y0
|
||||||
elif direction == "top":
|
elif direction == "top":
|
||||||
if tl.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
|
if textline.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
|
||||||
continue
|
continue
|
||||||
expanded_cand_bbox[3] = tl.y1
|
expanded_cand_bbox[3] = textline.y1
|
||||||
|
|
||||||
# If they are, see what an expanded bbox in that direction
|
# If they are, see what an expanded bbox in that direction
|
||||||
# would contain
|
# would contain
|
||||||
|
|
@ -477,8 +480,8 @@ class TextNetworks(TextAlignments):
|
||||||
# This happens when text covers multiple rows - that's only
|
# This happens when text covers multiple rows - that's only
|
||||||
# allowed in the header, treated separately.
|
# allowed in the header, treated separately.
|
||||||
cols_cand = find_columns_coordinates(tls_in_new_box)
|
cols_cand = find_columns_coordinates(tls_in_new_box)
|
||||||
if direction in ["bottom", "top"]:
|
if direction in ["bottom", "top"] and \
|
||||||
if len(cols_cand) < len(last_cols_cand):
|
len(cols_cand) < len(last_cols_cand):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# We have an expansion candidate: register it, update the
|
# We have an expansion candidate: register it, update the
|
||||||
|
|
@ -489,8 +492,8 @@ class TextNetworks(TextAlignments):
|
||||||
last_cols_cand = cols_cand
|
last_cols_cand = cols_cand
|
||||||
tls_in_bbox.extend(new_tls)
|
tls_in_bbox.extend(new_tls)
|
||||||
for i in range(len(tls_search_space) - 1, -1, -1):
|
for i in range(len(tls_search_space) - 1, -1, -1):
|
||||||
tl = tls_search_space[i]
|
textline = tls_search_space[i]
|
||||||
if tl in new_tls:
|
if textline in new_tls:
|
||||||
del tls_search_space[i]
|
del tls_search_space[i]
|
||||||
|
|
||||||
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
|
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
|
||||||
|
|
@ -595,6 +598,7 @@ class Hybrid(TextBaseParser):
|
||||||
parse_details_network_searches
|
parse_details_network_searches
|
||||||
parse_details_bbox_searches = []
|
parse_details_bbox_searches = []
|
||||||
self.parse_details["bbox_searches"] = parse_details_bbox_searches
|
self.parse_details["bbox_searches"] = parse_details_bbox_searches
|
||||||
|
self.parse_details["col_searches"] = []
|
||||||
else:
|
else:
|
||||||
parse_details_network_searches = None
|
parse_details_network_searches = None
|
||||||
parse_details_bbox_searches = None
|
parse_details_bbox_searches = None
|
||||||
|
|
@ -611,8 +615,8 @@ class Hybrid(TextBaseParser):
|
||||||
else:
|
else:
|
||||||
text_network = TextNetworks()
|
text_network = TextNetworks()
|
||||||
text_network.generate(textlines)
|
text_network.generate(textlines)
|
||||||
text_network._remove_unconnected_edges()
|
text_network.remove_unconnected_edges()
|
||||||
gaps_hv = text_network._compute_plausible_gaps()
|
gaps_hv = text_network.compute_plausible_gaps()
|
||||||
if gaps_hv is None:
|
if gaps_hv is None:
|
||||||
return None
|
return None
|
||||||
# edge_tol instructions override the calculated vertical gap
|
# edge_tol instructions override the calculated vertical gap
|
||||||
|
|
@ -620,7 +624,7 @@ class Hybrid(TextBaseParser):
|
||||||
gaps_hv[0],
|
gaps_hv[0],
|
||||||
gaps_hv[1] if self.edge_tol is None else self.edge_tol
|
gaps_hv[1] if self.edge_tol is None else self.edge_tol
|
||||||
)
|
)
|
||||||
bbox_body = text_network._build_bbox_candidate(
|
bbox_body = text_network.search_table_body(
|
||||||
edge_tol_hv,
|
edge_tol_hv,
|
||||||
parse_details=parse_details_bbox_searches
|
parse_details=parse_details_bbox_searches
|
||||||
)
|
)
|
||||||
|
|
@ -664,15 +668,13 @@ class Hybrid(TextBaseParser):
|
||||||
self.table_bbox[bbox_full] = table_parse
|
self.table_bbox[bbox_full] = table_parse
|
||||||
|
|
||||||
if self.parse_details is not None:
|
if self.parse_details is not None:
|
||||||
if "col_searches" not in self.parse_details:
|
|
||||||
self.parse_details["col_searches"] = []
|
|
||||||
self.parse_details["col_searches"].append(table_parse)
|
self.parse_details["col_searches"].append(table_parse)
|
||||||
|
|
||||||
# Remember what textlines we processed, and repeat
|
# Remember what textlines we processed, and repeat
|
||||||
for tl in tls_in_bbox:
|
for textline in tls_in_bbox:
|
||||||
textlines_processed[tl] = None
|
textlines_processed[textline] = None
|
||||||
textlines = list(filter(
|
textlines = list(filter(
|
||||||
lambda tl: tl not in textlines_processed,
|
lambda textline: textline not in textlines_processed,
|
||||||
textlines
|
textlines
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
@ -687,10 +689,10 @@ class Hybrid(TextBaseParser):
|
||||||
all_tls = list(
|
all_tls = list(
|
||||||
sorted(
|
sorted(
|
||||||
filter(
|
filter(
|
||||||
lambda tl: len(tl.get_text().strip()) > 0,
|
lambda textline: len(textline.get_text().strip()) > 0,
|
||||||
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
||||||
),
|
),
|
||||||
key=lambda tl: (-tl.y0, tl.x0)
|
key=lambda textline: (-textline.y0, textline.x0)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
||||||
|
|
|
||||||
|
|
@ -844,10 +844,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
|
||||||
col = table.cols[c]
|
col = table.cols[c]
|
||||||
for cut in y_cuts:
|
for cut in y_cuts:
|
||||||
if isinstance(obj, LTChar):
|
if isinstance(obj, LTChar):
|
||||||
if (
|
if col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] \
|
||||||
col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
|
and (obj.y0 + obj.y1) / 2 >= cut[1]:
|
||||||
and (obj.y0 + obj.y1) / 2 >= cut[1]
|
|
||||||
):
|
|
||||||
cut_text.append((cut[0], c, obj))
|
cut_text.append((cut[0], c, obj))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue