Improve hybrid table body discovery algo

While searching for table body boundaries, exclude rows that include
cells crossing previously discovered rows.
pull/153/head
Frh 2020-04-28 22:43:55 -07:00
parent 3220b02ebc
commit 918416e7e4
8 changed files with 192 additions and 114 deletions

View File

@ -17,10 +17,8 @@ from ..core import (
) )
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
expand_bbox_with_textline,
text_in_bbox, text_in_bbox,
bbox_from_textlines, bbox_from_textlines,
distance_tl_to_bbox,
find_columns_coordinates, find_columns_coordinates,
text_in_bbox_per_axis, text_in_bbox_per_axis,
) )
@ -46,6 +44,44 @@ def column_spread(left, right, col_anchors):
return index_right - index_left return index_right - index_left
def find_closest_tls(bbox, tls):
    """Find, for each side of a bbox, the nearest textline lying outside it.

    A textline is a candidate for "left"/"right" when it sits entirely
    beyond that edge and its y-range intersects the bbox's y-range. Any
    other textline (one whose x-range intersects the bbox's) is a candidate
    for "bottom"/"top" when it sits entirely below/above the bbox.
    Textlines that touch or intersect the bbox on both axes are ignored.

    Parameters
    ----------
    bbox : sequence (x0, y0, x1, y1)
        Unpacked as (left, bottom, right, top).
    tls : iterable of text objects
        Each object must expose ``x0``, ``y0``, ``x1`` and ``y1``.

    Returns
    -------
    closest : dict
        Keys "left", "right", "top", "bottom" (in that order). Each value is
        the closest candidate in that direction, or None when there is none.
        On ties, the textline encountered first is kept.

    """
    bbox_left, bbox_bottom, bbox_right, bbox_top = bbox
    closest = {
        "left": None,
        "right": None,
        "top": None,
        "bottom": None,
    }
    for tl in tls:
        if tl.x1 < bbox_left:
            # Entirely left of the bbox: only relevant if the y-ranges
            # intersect.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            current = closest["left"]
            if current is None or current.x1 < tl.x1:
                closest["left"] = tl
        elif bbox_right < tl.x0:
            # Entirely right of the bbox: same y-range requirement.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            current = closest["right"]
            if current is None or current.x0 > tl.x0:
                closest["right"] = tl
        elif tl.y1 < bbox_bottom:
            # Reaching this point already guarantees the x-ranges
            # intersect, so no further overlap test is needed.
            current = closest["bottom"]
            if current is None or current.y1 < tl.y1:
                closest["bottom"] = tl
        elif bbox_top < tl.y0:
            current = closest["top"]
            if current is None or current.y0 > tl.y0:
                closest["top"] = tl
    return closest
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
"""Expand a bbox vertically up by looking for plausible headers. """Expand a bbox vertically up by looking for plausible headers.
@ -346,7 +382,9 @@ class TextNetworks(TextAlignments):
return gaps_hv return gaps_hv
def _build_bbox_candidate(self, gaps_hv, parse_details=None): def _build_bbox_candidate(self, gaps_hv, parse_details=None):
""" Seed the process with the textline with the highest alignment """ Build a candidate bbox for the body of a table using hybrid algo
Seed the process with the textline with the highest alignment
score, then expand the bbox with textlines within threshold. score, then expand the bbox with textlines within threshold.
Parameters Parameters
@ -381,8 +419,8 @@ class TextNetworks(TextAlignments):
else: else:
parse_details_search = None parse_details_search = None
bbox = (most_aligned_tl.x0, most_aligned_tl.y0, bbox = [most_aligned_tl.x0, most_aligned_tl.y0,
most_aligned_tl.x1, most_aligned_tl.y1) most_aligned_tl.x1, most_aligned_tl.y1]
# For the body of the table, we only consider cells that have # For the body of the table, we only consider cells that have
# alignments on both axis. # alignments on both axis.
@ -391,24 +429,64 @@ class TextNetworks(TextAlignments):
tls_search_space.remove(most_aligned_tl) tls_search_space.remove(most_aligned_tl)
tls_in_bbox = [most_aligned_tl] tls_in_bbox = [most_aligned_tl]
last_bbox = None last_bbox = None
last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
while last_bbox != bbox: while last_bbox != bbox:
if parse_details_search is not None: if parse_details_search is not None:
# Store debug info # Store debug info
parse_details_search["iterations"].append(bbox) parse_details_search["iterations"].append(bbox)
# Check that the closest tls are within the gaps allowed
last_bbox = bbox last_bbox = bbox
# Go through all remaining textlines, expand our bbox cand_bbox = last_bbox.copy()
# if a textline is within our proximity tolerance closest_tls = find_closest_tls(bbox, tls_search_space)
for direction, tl in closest_tls.items():
if tl is None:
continue
expanded_cand_bbox = cand_bbox.copy()
if direction == "left":
if expanded_cand_bbox[0] - tl.x1 > gaps_hv[0]:
continue
expanded_cand_bbox[0] = tl.x0
elif direction == "right":
if tl.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
continue
expanded_cand_bbox[2] = tl.x1
elif direction == "bottom":
if expanded_cand_bbox[1] - tl.y1 > gaps_hv[1]:
continue
expanded_cand_bbox[1] = tl.y0
elif direction == "top":
if tl.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
continue
expanded_cand_bbox[3] = tl.y1
# If they are, see what an expanded bbox in that direction
# would contain
new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space)
tls_in_new_box = new_tls + tls_in_bbox
# And if we're expanding up or down, check that the addition
# of the new row won't reduce the number of columns.
# This happens when text covers multiple rows - that's only
# allowed in the header, treated separately.
cols_cand = find_columns_coordinates(tls_in_new_box)
if direction in ["bottom", "top"]:
if len(cols_cand) < len(last_cols_cand):
continue
# We have an expansion candidate: register it, update the
# search space and repeat
# We use bbox_from_textlines instead of cand_bbox in case some
# overlapping textlines require a large bbox for strict fit.
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
last_cols_cand = cols_cand
tls_in_bbox.extend(new_tls)
for i in range(len(tls_search_space) - 1, -1, -1): for i in range(len(tls_search_space) - 1, -1, -1):
tl = tls_search_space[i] tl = tls_search_space[i]
h_distance, v_distance = distance_tl_to_bbox(tl, bbox) if tl in new_tls:
# Move textline to our bbox and expand the bbox accordingly
# if the textline is close.
if h_distance < max_h_gap and v_distance < max_v_gap:
tls_in_bbox.append(tl)
bbox = expand_bbox_with_textline(bbox, tl)
del tls_search_space[i] del tls_search_space[i]
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE: if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
return bbox return bbox
return None return None

View File

@ -28,7 +28,8 @@ def draw_labeled_bbox(
ax, bbox, text, ax, bbox, text,
color="black", linewidth=3, color="black", linewidth=3,
linestyle="solid", linestyle="solid",
label_pos="top,left" label_pos="top,left",
fontsize=12,
): ):
"""Utility drawing function to draw a box with an associated text label """Utility drawing function to draw a box with an associated text label
""" """
@ -66,10 +67,10 @@ def draw_labeled_bbox(
ax.text( ax.text(
x, y, x, y,
text, text,
fontsize=12, color="black", fontsize=fontsize, color="black",
verticalalignment=vlabel_out_of_box, verticalalignment=vlabel_out_of_box,
horizontalalignment=hlabel, horizontalalignment=hlabel,
bbox=dict(facecolor=color, alpha=0.3) bbox=dict(facecolor=color, alpha=0.1)
) )
@ -449,12 +450,13 @@ class PlotMethods():
draw_labeled_bbox( draw_labeled_bbox(
ax, bbox, ax, bbox,
"box #{box_id} / iter #{iteration}".format( "t{box_id}/i{iteration}".format(
box_id=box_id, box_id=box_id,
iteration=iteration iteration=iteration
), ),
color="red", color="red",
linewidth=5 if final else 2, linewidth=5 if final else 2,
fontsize=12 if final else 8,
label_pos="bottom,left" label_pos="bottom,left"
) )

View File

@ -432,7 +432,7 @@ def bbox_from_str(bbox_str):
def text_in_bbox(bbox, text): def text_in_bbox(bbox, text):
"""Returns all text objects present inside a bounding box. """Returns all text objects which lie at least 50% inside a bounding box.
Parameters Parameters
---------- ----------
@ -529,7 +529,7 @@ def bbox_from_textlines(textlines):
return bbox return bbox
def find_columns_coordinates(tls): def find_columns_coordinates(tls, min_gap=1.0):
"""Given a list of text objects, guess columns boundaries and returns a """Given a list of text objects, guess columns boundaries and returns a
list of x-coordinates for split points between columns. list of x-coordinates for split points between columns.
@ -537,6 +537,10 @@ def find_columns_coordinates(tls):
---------- ----------
tls : list of PDFMiner text object. tls : list of PDFMiner text object.
min_gap : minimum distance between columns. Any elements closer than this
threshold are merged together. This prevents spaces between words
from being misinterpreted as column boundaries.
Returns Returns
------- -------
cols_anchors : list cols_anchors : list
@ -549,7 +553,7 @@ def find_columns_coordinates(tls):
cols_bounds = [] cols_bounds = []
tls.sort(key=lambda tl: tl.x0) tls.sort(key=lambda tl: tl.x0)
for tl in tls: for tl in tls:
if (not cols_bounds) or cols_bounds[-1][1] < tl.x0: if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0:
cols_bounds.append([tl.x0, tl.x1]) cols_bounds.append([tl.x0, tl.x1])
else: else:
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1) cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
@ -619,44 +623,6 @@ def get_index_closest_point(point, sorted_list, fn=lambda x: x):
return mid return mid
def distance_tl_to_bbox(tl, bbox):
    """Measure the horizontal and vertical gaps separating a textline
    from a bbox.

    Along each axis the gap is the distance between the facing edges when
    the textline lies entirely before or after the bbox, and 0 when their
    ranges intersect on that axis.

    Parameters
    ----------
    tl : PDFMiner text object.
        Read through its ``x0``, ``y0``, ``x1`` and ``y1`` attributes.
    bbox : tuple (x0, y0, x1, y1)

    Returns
    -------
    distance : tuple
        Tuple (horizontal distance, vertical distance)

    """
    def axis_gap(tl_low, tl_high, box_low, box_high):
        # Textline ends before the bbox starts on this axis.
        if tl_high <= box_low:
            return box_low - tl_high
        # Textline starts after the bbox ends on this axis.
        if box_high <= tl_low:
            return tl_low - box_high
        # Ranges intersect: no gap along this axis.
        return 0

    horizontal = axis_gap(tl.x0, tl.x1, bbox[0], bbox[2])
    vertical = axis_gap(tl.y0, tl.y1, bbox[1], bbox[3])
    return (horizontal, vertical)
def merge_close_lines(ar, line_tol=2): def merge_close_lines(ar, line_tol=2):
"""Merges lines which are within a tolerance by calculating a """Merges lines which are within a tolerance by calculating a
moving mean, based on their x or y axis projections. moving mean, based on their x or y axis projections.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 88 KiB

After

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 90 KiB

After

Width:  |  Height:  |  Size: 90 KiB