Improve hybrid table body discovery algorithm

While searching for table body boundaries, exclude rows that include
cells crossing previously discovered rows.
pull/153/head
Frh 2020-04-28 22:43:55 -07:00
parent a04e7702b2
commit 21dc6a46a0
8 changed files with 192 additions and 114 deletions

View File

@ -17,10 +17,8 @@ from ..core import (
)
from ..utils import (
bbox_from_str,
expand_bbox_with_textline,
text_in_bbox,
bbox_from_textlines,
distance_tl_to_bbox,
find_columns_coordinates,
text_in_bbox_per_axis,
)
@ -46,6 +44,44 @@ def column_spread(left, right, col_anchors):
return index_right - index_left
def find_closest_tls(bbox, tls):
    """Find, for each side of a bbox, the nearest textline lying outside it.

    A textline is a "left" (resp. "right") candidate when it sits strictly
    to the left (resp. right) of the bbox and its y-range intersects the
    y-range of the bbox.  Any textline that is neither strictly left nor
    strictly right necessarily shares part of the bbox x-range; it is a
    "bottom" (resp. "top") candidate when it sits strictly below (resp.
    above) the bbox.  Textlines intersecting the bbox itself are ignored.

    Parameters
    ----------
    bbox : sequence (x0, y0, x1, y1)
        Unpacked as (left, bottom, right, top).
    tls : iterable
        Objects exposing numeric ``x0``, ``y0``, ``x1`` and ``y1``
        attributes.

    Returns
    -------
    closest : dict
        Keys "left", "right", "top", "bottom" (in that insertion order);
        each value is the closest textline in that direction, or None if
        there is no candidate.  On ties the first textline encountered
        is kept.

    """
    closest = {
        "left": None,
        "right": None,
        "top": None,
        "bottom": None,
    }
    bbox_left, bbox_bottom, bbox_right, bbox_top = bbox
    for tl in tls:
        if tl.x1 < bbox_left:
            # Strictly left of the bbox: only relevant if the y-ranges
            # intersect.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            # Closest on the left == largest right edge.
            if closest["left"] is None or closest["left"].x1 < tl.x1:
                closest["left"] = tl
        elif bbox_right < tl.x0:
            # Strictly right of the bbox: only relevant if the y-ranges
            # intersect.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            # Closest on the right == smallest left edge.
            if closest["right"] is None or closest["right"].x0 > tl.x0:
                closest["right"] = tl
        # From here on the x-ranges are known to intersect (both tests
        # above failed), so no further horizontal check is needed.
        elif tl.y1 < bbox_bottom:
            # Strictly below: closest == largest top edge.
            if closest["bottom"] is None or closest["bottom"].y1 < tl.y1:
                closest["bottom"] = tl
        elif bbox_top < tl.y0:
            # Strictly above: closest == smallest bottom edge.
            if closest["top"] is None or closest["top"].y0 > tl.y0:
                closest["top"] = tl
    return closest
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
"""Expand a bbox vertically up by looking for plausible headers.
@ -346,7 +382,9 @@ class TextNetworks(TextAlignments):
return gaps_hv
def _build_bbox_candidate(self, gaps_hv, parse_details=None):
""" Seed the process with the textline with the highest alignment
""" Build a candidate bbox for the body of a table using hybrid algo
Seed the process with the textline with the highest alignment
score, then expand the bbox with textlines within threshold.
Parameters
@ -381,8 +419,8 @@ class TextNetworks(TextAlignments):
else:
parse_details_search = None
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
most_aligned_tl.x1, most_aligned_tl.y1)
bbox = [most_aligned_tl.x0, most_aligned_tl.y0,
most_aligned_tl.x1, most_aligned_tl.y1]
# For the body of the table, we only consider cells that have
# alignments on both axis.
@ -391,24 +429,64 @@ class TextNetworks(TextAlignments):
tls_search_space.remove(most_aligned_tl)
tls_in_bbox = [most_aligned_tl]
last_bbox = None
last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
while last_bbox != bbox:
if parse_details_search is not None:
# Store debug info
parse_details_search["iterations"].append(bbox)
# Check that the closest tls are within the gaps allowed
last_bbox = bbox
# Go through all remaining textlines, expand our bbox
# if a textline is within our proximity tolerance
cand_bbox = last_bbox.copy()
closest_tls = find_closest_tls(bbox, tls_search_space)
for direction, tl in closest_tls.items():
if tl is None:
continue
expanded_cand_bbox = cand_bbox.copy()
if direction == "left":
if expanded_cand_bbox[0] - tl.x1 > gaps_hv[0]:
continue
expanded_cand_bbox[0] = tl.x0
elif direction == "right":
if tl.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
continue
expanded_cand_bbox[2] = tl.x1
elif direction == "bottom":
if expanded_cand_bbox[1] - tl.y1 > gaps_hv[1]:
continue
expanded_cand_bbox[1] = tl.y0
elif direction == "top":
if tl.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
continue
expanded_cand_bbox[3] = tl.y1
# If they are, see what an expanded bbox in that direction
# would contain
new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space)
tls_in_new_box = new_tls + tls_in_bbox
# And if we're expanding up or down, check that the addition
# of the new row won't reduce the number of columns.
# This happens when text covers multiple rows - that's only
# allowed in the header, treated separately.
cols_cand = find_columns_coordinates(tls_in_new_box)
if direction in ["bottom", "top"]:
if len(cols_cand) < len(last_cols_cand):
continue
# We have an expansion candidate: register it, update the
# search space and repeat
# We use bbox_from_textlines instead of cand_bbox in case some
# overlapping textlines require a large bbox for strict fit.
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
last_cols_cand = cols_cand
tls_in_bbox.extend(new_tls)
for i in range(len(tls_search_space) - 1, -1, -1):
tl = tls_search_space[i]
h_distance, v_distance = distance_tl_to_bbox(tl, bbox)
# Move textline to our bbox and expand the bbox accordingly
# if the textline is close.
if h_distance < max_h_gap and v_distance < max_v_gap:
tls_in_bbox.append(tl)
bbox = expand_bbox_with_textline(bbox, tl)
if tl in new_tls:
del tls_search_space[i]
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
return bbox
return None

View File

@ -28,7 +28,8 @@ def draw_labeled_bbox(
ax, bbox, text,
color="black", linewidth=3,
linestyle="solid",
label_pos="top,left"
label_pos="top,left",
fontsize=12,
):
"""Utility drawing function to draw a box with an associated text label
"""
@ -66,10 +67,10 @@ def draw_labeled_bbox(
ax.text(
x, y,
text,
fontsize=12, color="black",
fontsize=fontsize, color="black",
verticalalignment=vlabel_out_of_box,
horizontalalignment=hlabel,
bbox=dict(facecolor=color, alpha=0.3)
bbox=dict(facecolor=color, alpha=0.1)
)
@ -449,12 +450,13 @@ class PlotMethods():
draw_labeled_bbox(
ax, bbox,
"box #{box_id} / iter #{iteration}".format(
"t{box_id}/i{iteration}".format(
box_id=box_id,
iteration=iteration
),
color="red",
linewidth=5 if final else 2,
fontsize=12 if final else 8,
label_pos="bottom,left"
)

View File

@ -432,7 +432,7 @@ def bbox_from_str(bbox_str):
def text_in_bbox(bbox, text):
"""Returns all text objects present inside a bounding box.
"""Returns all text objects which lie at least 50% inside a bounding box.
Parameters
----------
@ -529,7 +529,7 @@ def bbox_from_textlines(textlines):
return bbox
def find_columns_coordinates(tls):
def find_columns_coordinates(tls, min_gap=1.0):
"""Given a list of text objects, guess columns boundaries and returns a
list of x-coordinates for split points between columns.
@ -537,6 +537,10 @@ def find_columns_coordinates(tls):
----------
tls : list of PDFMiner text object.
min_gap : minimum distance between columns. Any elements closer than this
threshold are merged together. This is to prevent spaces between words
to be misinterpreted as column boundaries.
Returns
-------
cols_anchors : list
@ -549,7 +553,7 @@ def find_columns_coordinates(tls):
cols_bounds = []
tls.sort(key=lambda tl: tl.x0)
for tl in tls:
if (not cols_bounds) or cols_bounds[-1][1] < tl.x0:
if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0:
cols_bounds.append([tl.x0, tl.x1])
else:
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
@ -619,44 +623,6 @@ def get_index_closest_point(point, sorted_list, fn=lambda x: x):
return mid
def distance_tl_to_bbox(tl, bbox):
    """Measure the horizontal and vertical gaps separating a textline
    from a bbox.

    Parameters
    ----------
    tl : PDFMiner text object.
    bbox : tuple (x0, y0, x1, y1)

    Returns
    -------
    distance : tuple
        Tuple (horizontal distance, vertical distance).  A component is 0
        when the textline and the bbox intersect along that axis.

    """
    def axis_gap(tl_low, tl_high, box_low, box_high):
        # Textline entirely before the box on this axis.
        if tl_high <= box_low:
            return box_low - tl_high
        # Textline entirely after the box on this axis.
        if box_high <= tl_low:
            return tl_low - box_high
        # The two ranges intersect: no gap.
        return 0

    horizontal = axis_gap(tl.x0, tl.x1, bbox[0], bbox[2])
    vertical = axis_gap(tl.y0, tl.y1, bbox[1], bbox[3])
    return (horizontal, vertical)
def merge_close_lines(ar, line_tol=2):
"""Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 88 KiB

After

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 90 KiB

After

Width:  |  Height:  |  Size: 90 KiB