Improve hybrid table body discovery algo
While searching for table body boundaries, exclude rows that include cells crossing previously discovered rows. (pull/153/head)
parent
3220b02ebc
commit
918416e7e4
|
|
@ -17,10 +17,8 @@ from ..core import (
|
|||
)
|
||||
from ..utils import (
|
||||
bbox_from_str,
|
||||
expand_bbox_with_textline,
|
||||
text_in_bbox,
|
||||
bbox_from_textlines,
|
||||
distance_tl_to_bbox,
|
||||
find_columns_coordinates,
|
||||
text_in_bbox_per_axis,
|
||||
)
|
||||
|
|
@ -46,6 +44,44 @@ def column_spread(left, right, col_anchors):
|
|||
return index_right - index_left
|
||||
|
||||
|
||||
def find_closest_tls(bbox, tls):
    """Find the textline nearest to a bbox in each of the four directions.

    A textline is a candidate for a side only when it lies strictly
    outside the bbox on that side and its extent on the other axis
    intersects the bbox (so diagonal neighbours are ignored). Textlines
    that overlap the bbox itself are ignored.

    Parameters
    ----------
    bbox : sequence (x0, y0, x1, y1)
        Left, bottom, right and top coordinates of the reference box.
    tls : iterable of text objects
        Objects exposing ``x0``, ``y0``, ``x1`` and ``y1`` attributes.

    Returns
    -------
    closest : dict
        Keys ``"left"``, ``"right"``, ``"top"`` and ``"bottom"``; each
        value is the closest textline on that side, or None when there
        is no candidate. When several candidates are equally close, the
        first one encountered in ``tls`` is kept.

    """
    closest = {
        "left": None,
        "right": None,
        "top": None,
        "bottom": None,
    }
    bbox_left, bbox_bottom, bbox_right, bbox_top = bbox
    for tl in tls:
        if tl.x1 < bbox_left:
            # Entirely to the left: only relevant when its vertical extent
            # intersects the vertical extent of the bbox.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            best = closest["left"]
            # Closest on the left == largest right edge.
            if best is None or best.x1 < tl.x1:
                closest["left"] = tl
        elif bbox_right < tl.x0:
            # Entirely to the right: same vertical intersection requirement.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            best = closest["right"]
            # Closest on the right == smallest left edge.
            if best is None or best.x0 > tl.x0:
                closest["right"] = tl
        elif tl.y1 < bbox_bottom:
            # Reaching this point means the horizontal extents intersect
            # (neither branch above matched), so no extra check is needed.
            # Entirely below: closest == largest top edge.
            best = closest["bottom"]
            if best is None or best.y1 < tl.y1:
                closest["bottom"] = tl
        elif bbox_top < tl.y0:
            # Entirely above: closest == smallest bottom edge.
            best = closest["top"]
            if best is None or best.y0 > tl.y0:
                closest["top"] = tl
    return closest
|
||||
|
||||
|
||||
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||
"""Expand a bbox vertically up by looking for plausible headers.
|
||||
|
||||
|
|
@ -346,14 +382,16 @@ class TextNetworks(TextAlignments):
|
|||
return gaps_hv
|
||||
|
||||
def _build_bbox_candidate(self, gaps_hv, parse_details=None):
|
||||
""" Seed the process with the textline with the highest alignment
|
||||
""" Build a candidate bbox for the body of a table using hybrid algo
|
||||
|
||||
Seed the process with the textline with the highest alignment
|
||||
score, then expand the bbox with textlines within threshold.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
gaps_hv : tuple
|
||||
The maximum distance allowed to consider surrounding lines/columns
|
||||
as part of the same table.
|
||||
The maximum distance allowed to consider surrounding lines/columns
|
||||
as part of the same table.
|
||||
parse_details : array (optional)
|
||||
Optional parameter array, in which to store extra information
|
||||
to help later visualization of the table creation.
|
||||
|
|
@ -381,8 +419,8 @@ class TextNetworks(TextAlignments):
|
|||
else:
|
||||
parse_details_search = None
|
||||
|
||||
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
||||
most_aligned_tl.x1, most_aligned_tl.y1)
|
||||
bbox = [most_aligned_tl.x0, most_aligned_tl.y0,
|
||||
most_aligned_tl.x1, most_aligned_tl.y1]
|
||||
|
||||
# For the body of the table, we only consider cells that have
|
||||
# alignments on both axis.
|
||||
|
|
@ -391,24 +429,64 @@ class TextNetworks(TextAlignments):
|
|||
tls_search_space.remove(most_aligned_tl)
|
||||
tls_in_bbox = [most_aligned_tl]
|
||||
last_bbox = None
|
||||
last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
|
||||
while last_bbox != bbox:
|
||||
if parse_details_search is not None:
|
||||
# Store debug info
|
||||
parse_details_search["iterations"].append(bbox)
|
||||
|
||||
# Check that the closest tls are within the gaps allowed
|
||||
last_bbox = bbox
|
||||
# Go through all remaining textlines, expand our bbox
|
||||
# if a textline is within our proximity tolerance
|
||||
for i in range(len(tls_search_space) - 1, -1, -1):
|
||||
tl = tls_search_space[i]
|
||||
h_distance, v_distance = distance_tl_to_bbox(tl, bbox)
|
||||
cand_bbox = last_bbox.copy()
|
||||
closest_tls = find_closest_tls(bbox, tls_search_space)
|
||||
for direction, tl in closest_tls.items():
|
||||
if tl is None:
|
||||
continue
|
||||
expanded_cand_bbox = cand_bbox.copy()
|
||||
|
||||
if direction == "left":
|
||||
if expanded_cand_bbox[0] - tl.x1 > gaps_hv[0]:
|
||||
continue
|
||||
expanded_cand_bbox[0] = tl.x0
|
||||
elif direction == "right":
|
||||
if tl.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
|
||||
continue
|
||||
expanded_cand_bbox[2] = tl.x1
|
||||
elif direction == "bottom":
|
||||
if expanded_cand_bbox[1] - tl.y1 > gaps_hv[1]:
|
||||
continue
|
||||
expanded_cand_bbox[1] = tl.y0
|
||||
elif direction == "top":
|
||||
if tl.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
|
||||
continue
|
||||
expanded_cand_bbox[3] = tl.y1
|
||||
|
||||
# If they are, see what an expanded bbox in that direction
|
||||
# would contain
|
||||
new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space)
|
||||
tls_in_new_box = new_tls + tls_in_bbox
|
||||
|
||||
# And if we're expanding up or down, check that the addition
|
||||
# of the new row won't reduce the number of columns.
|
||||
# This happens when text covers multiple rows - that's only
|
||||
# allowed in the header, treated separately.
|
||||
cols_cand = find_columns_coordinates(tls_in_new_box)
|
||||
if direction in ["bottom", "top"]:
|
||||
if len(cols_cand) < len(last_cols_cand):
|
||||
continue
|
||||
|
||||
# We have an expansion candidate: register it, update the
|
||||
# search space and repeat
|
||||
# We use bbox_from_textlines instead of cand_bbox in case some
|
||||
# overlapping textlines require a large bbox for strict fit.
|
||||
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
|
||||
last_cols_cand = cols_cand
|
||||
tls_in_bbox.extend(new_tls)
|
||||
for i in range(len(tls_search_space) - 1, -1, -1):
|
||||
tl = tls_search_space[i]
|
||||
if tl in new_tls:
|
||||
del tls_search_space[i]
|
||||
|
||||
# Move textline to our bbox and expand the bbox accordingly
|
||||
# if the textline is close.
|
||||
if h_distance < max_h_gap and v_distance < max_v_gap:
|
||||
tls_in_bbox.append(tl)
|
||||
bbox = expand_bbox_with_textline(bbox, tl)
|
||||
del tls_search_space[i]
|
||||
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
|
||||
return bbox
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -28,7 +28,8 @@ def draw_labeled_bbox(
|
|||
ax, bbox, text,
|
||||
color="black", linewidth=3,
|
||||
linestyle="solid",
|
||||
label_pos="top,left"
|
||||
label_pos="top,left",
|
||||
fontsize=12,
|
||||
):
|
||||
"""Utility drawing function to draw a box with an associated text label
|
||||
"""
|
||||
|
|
@ -66,10 +67,10 @@ def draw_labeled_bbox(
|
|||
ax.text(
|
||||
x, y,
|
||||
text,
|
||||
fontsize=12, color="black",
|
||||
fontsize=fontsize, color="black",
|
||||
verticalalignment=vlabel_out_of_box,
|
||||
horizontalalignment=hlabel,
|
||||
bbox=dict(facecolor=color, alpha=0.3)
|
||||
bbox=dict(facecolor=color, alpha=0.1)
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -449,12 +450,13 @@ class PlotMethods():
|
|||
|
||||
draw_labeled_bbox(
|
||||
ax, bbox,
|
||||
"box #{box_id} / iter #{iteration}".format(
|
||||
"t{box_id}/i{iteration}".format(
|
||||
box_id=box_id,
|
||||
iteration=iteration
|
||||
),
|
||||
color="red",
|
||||
linewidth=5 if final else 2,
|
||||
fontsize=12 if final else 8,
|
||||
label_pos="bottom,left"
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -432,7 +432,7 @@ def bbox_from_str(bbox_str):
|
|||
|
||||
|
||||
def text_in_bbox(bbox, text):
|
||||
"""Returns all text objects present inside a bounding box.
|
||||
"""Returns all text objects which lie at least 50% inside a bounding box.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -529,7 +529,7 @@ def bbox_from_textlines(textlines):
|
|||
return bbox
|
||||
|
||||
|
||||
def find_columns_coordinates(tls):
|
||||
def find_columns_coordinates(tls, min_gap=1.0):
|
||||
"""Given a list of text objects, guess columns boundaries and returns a
|
||||
list of x-coordinates for split points between columns.
|
||||
|
||||
|
|
@ -537,6 +537,10 @@ def find_columns_coordinates(tls):
|
|||
----------
|
||||
tls : list of PDFMiner text object.
|
||||
|
||||
min_gap : minimum distance between columns. Any elements closer than this
|
||||
threshold are merged together. This is to prevent spaces between words
|
||||
to be misinterpreted as column boundaries.
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols_anchors : list
|
||||
|
|
@ -549,7 +553,7 @@ def find_columns_coordinates(tls):
|
|||
cols_bounds = []
|
||||
tls.sort(key=lambda tl: tl.x0)
|
||||
for tl in tls:
|
||||
if (not cols_bounds) or cols_bounds[-1][1] < tl.x0:
|
||||
if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0:
|
||||
cols_bounds.append([tl.x0, tl.x1])
|
||||
else:
|
||||
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
|
||||
|
|
@ -619,44 +623,6 @@ def get_index_closest_point(point, sorted_list, fn=lambda x: x):
|
|||
return mid
|
||||
|
||||
|
||||
def distance_tl_to_bbox(tl, bbox):
    """Return the horizontal and vertical gaps between a textline and a bbox.

    On each axis the gap is the distance between the facing edges when the
    textline lies entirely on one side of the bbox, and 0 when the two
    extents touch or intersect on that axis.

    Parameters
    ----------
    tl : PDFMiner text object.
        Must expose ``x0``, ``y0``, ``x1`` and ``y1`` attributes.
    bbox : tuple (x0, y0, x1, y1)

    Returns
    -------
    distance : tuple
        Tuple (horizontal distance, vertical distance)

    """
    if tl.x1 <= bbox[0]:
        # tl entirely to the left of the bbox
        h_distance = bbox[0] - tl.x1
    elif bbox[2] <= tl.x0:
        # tl entirely to the right of the bbox
        h_distance = tl.x0 - bbox[2]
    else:
        # x-extents intersect: no horizontal gap
        h_distance = 0

    if tl.y1 <= bbox[1]:
        # tl entirely below the bbox
        v_distance = bbox[1] - tl.y1
    elif bbox[3] <= tl.y0:
        # tl entirely above the bbox
        v_distance = tl.y0 - bbox[3]
    else:
        # y-extents intersect: no vertical gap
        v_distance = 0
    return (h_distance, v_distance)
|
||||
|
||||
|
||||
def merge_close_lines(ar, line_tol=2):
|
||||
"""Merges lines which are within a tolerance by calculating a
|
||||
moving mean, based on their x or y axis projections.
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
|
Before Width: | Height: | Size: 103 KiB After Width: | Height: | Size: 103 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 88 KiB After Width: | Height: | Size: 88 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 90 KiB After Width: | Height: | Size: 90 KiB |
Loading…
Reference in New Issue