Improve hybrid table body discovery algo
While searching for table body boundaries, exclude rows that include cells crossing previously discovered rows.
pull/153/head
parent
3220b02ebc
commit
918416e7e4
|
|
@ -17,10 +17,8 @@ from ..core import (
|
||||||
)
|
)
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
bbox_from_str,
|
bbox_from_str,
|
||||||
expand_bbox_with_textline,
|
|
||||||
text_in_bbox,
|
text_in_bbox,
|
||||||
bbox_from_textlines,
|
bbox_from_textlines,
|
||||||
distance_tl_to_bbox,
|
|
||||||
find_columns_coordinates,
|
find_columns_coordinates,
|
||||||
text_in_bbox_per_axis,
|
text_in_bbox_per_axis,
|
||||||
)
|
)
|
||||||
|
|
@ -46,6 +44,44 @@ def column_spread(left, right, col_anchors):
|
||||||
return index_right - index_left
|
return index_right - index_left
|
||||||
|
|
||||||
|
|
||||||
|
def find_closest_tls(bbox, tls):
    """Find the nearest textline lying outside a bbox in each direction.

    Parameters
    ----------
    bbox : sequence of 4 numbers
        Unpacked as (left, bottom, right, top).
    tls : iterable of text objects
        Each element must expose ``x0``, ``y0``, ``x1`` and ``y1``
        attributes.

    Returns
    -------
    closest : dict
        Keys "left", "right", "top" and "bottom" (in that order). Each
        value is the nearest textline found in that direction, or None
        if there is none. On ties the first textline encountered wins.

    """
    closest = {
        "left": None,
        "right": None,
        "top": None,
        "bottom": None,
    }
    bbox_left, bbox_bottom, bbox_right, bbox_top = bbox
    for tl in tls:
        if tl.x1 < bbox_left:
            # Strictly to the left: only a candidate if its y-span
            # intersects the bbox's y-span.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            best = closest["left"]
            if best is None or best.x1 < tl.x1:
                closest["left"] = tl
        elif bbox_right < tl.x0:
            # Strictly to the right: same y-span intersection requirement.
            if tl.y0 > bbox_top or tl.y1 < bbox_bottom:
                continue
            best = closest["right"]
            if best is None or best.x0 > tl.x0:
                closest["right"] = tl
        # From here on the textline's x-span necessarily touches or
        # intersects the bbox's x-span (both strict tests above failed),
        # so no extra x check is needed: it can only be below or above.
        elif tl.y1 < bbox_bottom:
            best = closest["bottom"]
            if best is None or best.y1 < tl.y1:
                closest["bottom"] = tl
        elif bbox_top < tl.y0:
            best = closest["top"]
            if best is None or best.y0 > tl.y0:
                closest["top"] = tl
        # Anything else intersects the bbox on both axes and is ignored.
    return closest
|
||||||
|
|
||||||
|
|
||||||
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
"""Expand a bbox vertically up by looking for plausible headers.
|
"""Expand a bbox vertically up by looking for plausible headers.
|
||||||
|
|
||||||
|
|
@ -346,14 +382,16 @@ class TextNetworks(TextAlignments):
|
||||||
return gaps_hv
|
return gaps_hv
|
||||||
|
|
||||||
def _build_bbox_candidate(self, gaps_hv, parse_details=None):
|
def _build_bbox_candidate(self, gaps_hv, parse_details=None):
|
||||||
""" Seed the process with the textline with the highest alignment
|
""" Build a candidate bbox for the body of a table using hybrid algo
|
||||||
|
|
||||||
|
Seed the process with the textline with the highest alignment
|
||||||
score, then expand the bbox with textlines within threshold.
|
score, then expand the bbox with textlines within threshold.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
gaps_hv : tuple
|
gaps_hv : tuple
|
||||||
The maximum distance allowed to consider surrounding lines/columns
|
The maximum distance allowed to consider surrounding lines/columns
|
||||||
as part of the same table.
|
as part of the same table.
|
||||||
parse_details : array (optional)
|
parse_details : array (optional)
|
||||||
Optional parameter array, in which to store extra information
|
Optional parameter array, in which to store extra information
|
||||||
to help later visualization of the table creation.
|
to help later visualization of the table creation.
|
||||||
|
|
@ -381,8 +419,8 @@ class TextNetworks(TextAlignments):
|
||||||
else:
|
else:
|
||||||
parse_details_search = None
|
parse_details_search = None
|
||||||
|
|
||||||
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
bbox = [most_aligned_tl.x0, most_aligned_tl.y0,
|
||||||
most_aligned_tl.x1, most_aligned_tl.y1)
|
most_aligned_tl.x1, most_aligned_tl.y1]
|
||||||
|
|
||||||
# For the body of the table, we only consider cells that have
|
# For the body of the table, we only consider cells that have
|
||||||
# alignments on both axis.
|
# alignments on both axis.
|
||||||
|
|
@ -391,24 +429,64 @@ class TextNetworks(TextAlignments):
|
||||||
tls_search_space.remove(most_aligned_tl)
|
tls_search_space.remove(most_aligned_tl)
|
||||||
tls_in_bbox = [most_aligned_tl]
|
tls_in_bbox = [most_aligned_tl]
|
||||||
last_bbox = None
|
last_bbox = None
|
||||||
|
last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
|
||||||
while last_bbox != bbox:
|
while last_bbox != bbox:
|
||||||
if parse_details_search is not None:
|
if parse_details_search is not None:
|
||||||
# Store debug info
|
# Store debug info
|
||||||
parse_details_search["iterations"].append(bbox)
|
parse_details_search["iterations"].append(bbox)
|
||||||
|
|
||||||
|
# Check that the closest tls are within the gaps allowed
|
||||||
last_bbox = bbox
|
last_bbox = bbox
|
||||||
# Go through all remaining textlines, expand our bbox
|
cand_bbox = last_bbox.copy()
|
||||||
# if a textline is within our proximity tolerance
|
closest_tls = find_closest_tls(bbox, tls_search_space)
|
||||||
for i in range(len(tls_search_space) - 1, -1, -1):
|
for direction, tl in closest_tls.items():
|
||||||
tl = tls_search_space[i]
|
if tl is None:
|
||||||
h_distance, v_distance = distance_tl_to_bbox(tl, bbox)
|
continue
|
||||||
|
expanded_cand_bbox = cand_bbox.copy()
|
||||||
|
|
||||||
|
if direction == "left":
|
||||||
|
if expanded_cand_bbox[0] - tl.x1 > gaps_hv[0]:
|
||||||
|
continue
|
||||||
|
expanded_cand_bbox[0] = tl.x0
|
||||||
|
elif direction == "right":
|
||||||
|
if tl.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
|
||||||
|
continue
|
||||||
|
expanded_cand_bbox[2] = tl.x1
|
||||||
|
elif direction == "bottom":
|
||||||
|
if expanded_cand_bbox[1] - tl.y1 > gaps_hv[1]:
|
||||||
|
continue
|
||||||
|
expanded_cand_bbox[1] = tl.y0
|
||||||
|
elif direction == "top":
|
||||||
|
if tl.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
|
||||||
|
continue
|
||||||
|
expanded_cand_bbox[3] = tl.y1
|
||||||
|
|
||||||
|
# If they are, see what an expanded bbox in that direction
|
||||||
|
# would contain
|
||||||
|
new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space)
|
||||||
|
tls_in_new_box = new_tls + tls_in_bbox
|
||||||
|
|
||||||
|
# And if we're expanding up or down, check that the addition
|
||||||
|
# of the new row won't reduce the number of columns.
|
||||||
|
# This happens when text covers multiple rows - that's only
|
||||||
|
# allowed in the header, treated separately.
|
||||||
|
cols_cand = find_columns_coordinates(tls_in_new_box)
|
||||||
|
if direction in ["bottom", "top"]:
|
||||||
|
if len(cols_cand) < len(last_cols_cand):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# We have an expansion candidate: register it, update the
|
||||||
|
# search space and repeat
|
||||||
|
# We use bbox_from_textlines instead of cand_bbox in case some
|
||||||
|
# overlapping textlines require a large bbox for strict fit.
|
||||||
|
bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
|
||||||
|
last_cols_cand = cols_cand
|
||||||
|
tls_in_bbox.extend(new_tls)
|
||||||
|
for i in range(len(tls_search_space) - 1, -1, -1):
|
||||||
|
tl = tls_search_space[i]
|
||||||
|
if tl in new_tls:
|
||||||
|
del tls_search_space[i]
|
||||||
|
|
||||||
# Move textline to our bbox and expand the bbox accordingly
|
|
||||||
# if the textline is close.
|
|
||||||
if h_distance < max_h_gap and v_distance < max_v_gap:
|
|
||||||
tls_in_bbox.append(tl)
|
|
||||||
bbox = expand_bbox_with_textline(bbox, tl)
|
|
||||||
del tls_search_space[i]
|
|
||||||
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
|
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
|
||||||
return bbox
|
return bbox
|
||||||
return None
|
return None
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,8 @@ def draw_labeled_bbox(
|
||||||
ax, bbox, text,
|
ax, bbox, text,
|
||||||
color="black", linewidth=3,
|
color="black", linewidth=3,
|
||||||
linestyle="solid",
|
linestyle="solid",
|
||||||
label_pos="top,left"
|
label_pos="top,left",
|
||||||
|
fontsize=12,
|
||||||
):
|
):
|
||||||
"""Utility drawing function to draw a box with an associated text label
|
"""Utility drawing function to draw a box with an associated text label
|
||||||
"""
|
"""
|
||||||
|
|
@ -66,10 +67,10 @@ def draw_labeled_bbox(
|
||||||
ax.text(
|
ax.text(
|
||||||
x, y,
|
x, y,
|
||||||
text,
|
text,
|
||||||
fontsize=12, color="black",
|
fontsize=fontsize, color="black",
|
||||||
verticalalignment=vlabel_out_of_box,
|
verticalalignment=vlabel_out_of_box,
|
||||||
horizontalalignment=hlabel,
|
horizontalalignment=hlabel,
|
||||||
bbox=dict(facecolor=color, alpha=0.3)
|
bbox=dict(facecolor=color, alpha=0.1)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -449,12 +450,13 @@ class PlotMethods():
|
||||||
|
|
||||||
draw_labeled_bbox(
|
draw_labeled_bbox(
|
||||||
ax, bbox,
|
ax, bbox,
|
||||||
"box #{box_id} / iter #{iteration}".format(
|
"t{box_id}/i{iteration}".format(
|
||||||
box_id=box_id,
|
box_id=box_id,
|
||||||
iteration=iteration
|
iteration=iteration
|
||||||
),
|
),
|
||||||
color="red",
|
color="red",
|
||||||
linewidth=5 if final else 2,
|
linewidth=5 if final else 2,
|
||||||
|
fontsize=12 if final else 8,
|
||||||
label_pos="bottom,left"
|
label_pos="bottom,left"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -432,7 +432,7 @@ def bbox_from_str(bbox_str):
|
||||||
|
|
||||||
|
|
||||||
def text_in_bbox(bbox, text):
|
def text_in_bbox(bbox, text):
|
||||||
"""Returns all text objects present inside a bounding box.
|
"""Returns all text objects which lie at least 50% inside a bounding box.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -529,7 +529,7 @@ def bbox_from_textlines(textlines):
|
||||||
return bbox
|
return bbox
|
||||||
|
|
||||||
|
|
||||||
def find_columns_coordinates(tls):
|
def find_columns_coordinates(tls, min_gap=1.0):
|
||||||
"""Given a list of text objects, guess columns boundaries and returns a
|
"""Given a list of text objects, guess columns boundaries and returns a
|
||||||
list of x-coordinates for split points between columns.
|
list of x-coordinates for split points between columns.
|
||||||
|
|
||||||
|
|
@ -537,6 +537,10 @@ def find_columns_coordinates(tls):
|
||||||
----------
|
----------
|
||||||
tls : list of PDFMiner text object.
|
tls : list of PDFMiner text object.
|
||||||
|
|
||||||
|
min_gap : minimum distance between columns. Any elements closer than this
|
||||||
|
threshold are merged together. This is to prevent spaces between words
|
||||||
|
to be misinterpreted as column boundaries.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
cols_anchors : list
|
cols_anchors : list
|
||||||
|
|
@ -549,7 +553,7 @@ def find_columns_coordinates(tls):
|
||||||
cols_bounds = []
|
cols_bounds = []
|
||||||
tls.sort(key=lambda tl: tl.x0)
|
tls.sort(key=lambda tl: tl.x0)
|
||||||
for tl in tls:
|
for tl in tls:
|
||||||
if (not cols_bounds) or cols_bounds[-1][1] < tl.x0:
|
if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0:
|
||||||
cols_bounds.append([tl.x0, tl.x1])
|
cols_bounds.append([tl.x0, tl.x1])
|
||||||
else:
|
else:
|
||||||
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
|
cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
|
||||||
|
|
@ -619,44 +623,6 @@ def get_index_closest_point(point, sorted_list, fn=lambda x: x):
|
||||||
return mid
|
return mid
|
||||||
|
|
||||||
|
|
||||||
def distance_tl_to_bbox(tl, bbox):
    """Measure the gap between a textline and a bbox along each axis.

    Parameters
    ----------
    tl : PDFMiner text object.
    bbox : tuple (x0, y0, x1, y1)

    Returns
    -------
    distance : tuple
        Tuple (horizontal distance, vertical distance). A component is 0
        when the textline and the bbox overlap or touch on that axis.

    """
    def axis_gap(low_edge, high_edge, low_idx, high_idx):
        # Textline entirely before the bbox on this axis (left / below).
        if high_edge <= bbox[low_idx]:
            return bbox[low_idx] - high_edge
        # Textline entirely after the bbox on this axis (right / above).
        if bbox[high_idx] <= low_edge:
            return low_edge - bbox[high_idx]
        # Spans intersect on this axis: no gap.
        return 0

    horizontal = axis_gap(tl.x0, tl.x1, 0, 2)
    vertical = axis_gap(tl.y0, tl.y1, 1, 3)
    return (horizontal, vertical)
|
|
||||||
|
|
||||||
|
|
||||||
def merge_close_lines(ar, line_tol=2):
|
def merge_close_lines(ar, line_tol=2):
|
||||||
"""Merges lines which are within a tolerance by calculating a
|
"""Merges lines which are within a tolerance by calculating a
|
||||||
moving mean, based on their x or y axis projections.
|
moving mean, based on their x or y axis projections.
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
|
Before Width: | Height: | Size: 103 KiB After Width: | Height: | Size: 103 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 88 KiB After Width: | Height: | Size: 88 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 90 KiB After Width: | Height: | Size: 90 KiB |
Loading…
Reference in New Issue