Prettier plotting, improve gaps calculation

pull/153/head
Frh 2020-04-22 14:08:22 -07:00
parent d2cf8520cb
commit 1a47c3df89
10 changed files with 211 additions and 130 deletions

View File

@ -22,25 +22,8 @@ from matplotlib import patches as patches
MAX_COL_SPREAD_IN_HEADER = 3 MAX_COL_SPREAD_IN_HEADER = 3
def plot_annotated_bbox(plot, bbox, text, rect_color):
plot.add_patch(
patches.Rectangle(
(bbox[0], bbox[1]),
bbox[2] - bbox[0], bbox[3] - bbox[1],
color="purple", linewidth=3,
fill=False
)
)
plot.text(
bbox[0], bbox[1],
text,
fontsize=12, color="black", verticalalignment="top",
bbox=dict(facecolor="purple", alpha=0.5)
)
def todo_move_me_expand_area_for_header(area, textlines, col_anchors, def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
average_row_height): max_v_gap):
"""The core algorithm is based on fairly strict alignment of text. """The core algorithm is based on fairly strict alignment of text.
It works ok for the table body, but might fail on tables' headers It works ok for the table body, but might fail on tables' headers
since they tend to be in a different font, alignment (e.g. vertical), since they tend to be in a different font, alignment (e.g. vertical),
@ -78,13 +61,13 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
all_above = [] all_above = []
for te in textlines: for te in textlines:
# higher than the table, directly within its bounds # higher than the table, directly within its bounds
if te.y0 > top and te.x0 > left and te.x1 < right: if te.y0 > top and te.x0 >= left and te.x1 <= right:
all_above.append(te) all_above.append(te)
if closest_above is None or closest_above.y0 > te.y0: if closest_above is None or closest_above.y0 > te.y0:
closest_above = te closest_above = te
if closest_above and \ if closest_above and \
closest_above.y0 < top + average_row_height: closest_above.y0 < top + max_v_gap:
# b/ We have a candidate cell that is within the correct # b/ We have a candidate cell that is within the correct
# vertical band, and directly above the table. Starting from # vertical band, and directly above the table. Starting from
# this anchor, we list all the textlines within the same row. # this anchor, we list all the textlines within the same row.
@ -475,37 +458,42 @@ class TextEdges2(object):
self._textlines_alignments = {} self._textlines_alignments = {}
self._compute_alignment_counts() self._compute_alignment_counts()
def _build_bbox_candidate(self, debug_info=None): def _most_connected_textline(self):
""" Seed the process with the textline with the highest alignment """ Retrieve the textline that is most connected across vertical and
score, then expand the bbox with textlines within threshold. horizontal axis.
Parameters
----------
debug_info : array
Optional parameter array, in which to store extra information
to help later visualization of the table creation.
""" """
if self.max_rows <= 1 or self.max_cols <= 1:
return None
tls_search_space = list(self._textlines_alignments.keys())
def get_best_textline(textlines):
# Find the textline with the highest alignment score # Find the textline with the highest alignment score
return max( return max(
textlines, self._textlines_alignments.keys(),
key=lambda textline: key=lambda textline:
self._textlines_alignments[textline].alignment_score(), self._textlines_alignments[textline].alignment_score(),
default=None default=None
) )
# First, determine the textline that has the most combined alignments def _compute_plausible_gaps(self):
# across horizontal and vertical axis. """ Evaluate plausible gaps between cells horizontally and vertically
# It will serve both as a starting point for the table boundary search, based on the textlines aligned with the most connected textline.
# and as a way to estimate the average spacing between rows/cols.
most_aligned_tl = get_best_textline(tls_search_space)
most_aligned_coords = TextEdges2.get_textline_coords(most_aligned_tl)
# Retrieve the list of textlines it's aligned with, across both axis Returns
-------
gaps_hv : tuple
(horizontal_gap, vertical_gap) in pdf coordinate space.
"""
if self.max_rows <= 1 or self.max_cols <= 1:
return None
# Determine the textline that has the most combined
# alignments across horizontal and vertical axis.
# It will serve as a reference axis along which to collect the average
# spacing between rows/cols.
most_aligned_tl = self._most_connected_textline()
most_aligned_coords = TextEdges2.get_textline_coords(
most_aligned_tl)
# Retrieve the list of textlines it's aligned with, across both
# axes
best_alignment = self._textlines_alignments[most_aligned_tl] best_alignment = self._textlines_alignments[most_aligned_tl]
ref_h_edge_name = best_alignment.max_h_edge_name() ref_h_edge_name = best_alignment.max_h_edge_name()
ref_v_edge_name = best_alignment.max_v_edge_name() ref_v_edge_name = best_alignment.max_v_edge_name()
@ -544,9 +532,30 @@ class TextEdges2(object):
return None return None
percentile = 75 percentile = 75
gaps_hv = ( gaps_hv = (
np.percentile(h_gaps, percentile), 2.0 * np.percentile(h_gaps, percentile),
np.percentile(v_gaps, percentile) 2.0 * np.percentile(v_gaps, percentile)
) )
return gaps_hv
def _build_bbox_candidate(self, gaps_hv, debug_info=None):
""" Seed the process with the textline with the highest alignment
score, then expand the bbox with textlines within threshold.
Parameters
----------
gaps_hv : tuple
The maximum distance allowed to consider surrounding lines/columns
as part of the same table.
debug_info : array (optional)
Optional parameter array, in which to store extra information
to help later visualization of the table creation.
"""
# Determine the textline that has the most combined
# alignments across horizontal and vertical axes.
# It serves as the starting point for the table boundary
# search. The spacing between rows/cols is no longer estimated
# here: the caller passes it in through gaps_hv.
most_aligned_tl = self._most_connected_textline()
# Calculate the 75th percentile of the horizontal/vertical # Calculate the 75th percentile of the horizontal/vertical
# gaps between textlines. Use this as a reference for a threshold # gaps between textlines. Use this as a reference for a threshold
@ -555,7 +564,7 @@ class TextEdges2(object):
# gaps_hv = self._calculate_gaps_thresholds(75) # gaps_hv = self._calculate_gaps_thresholds(75)
# if (gaps_hv[0] is None or gaps_hv[1] is None): # if (gaps_hv[0] is None or gaps_hv[1] is None):
# return None # return None
max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3 max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
if debug_info is not None: if debug_info is not None:
# Store debug info # Store debug info
@ -571,6 +580,11 @@ class TextEdges2(object):
MINIMUM_TEXTLINES_IN_TABLE = 6 MINIMUM_TEXTLINES_IN_TABLE = 6
bbox = (most_aligned_tl.x0, most_aligned_tl.y0, bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
most_aligned_tl.x1, most_aligned_tl.y1) most_aligned_tl.x1, most_aligned_tl.y1)
# For the body of the table, we only consider cells with alignments
# on both axes.
tls_search_space = list(self._textlines_alignments.keys())
# tls_search_space = []
tls_search_space.remove(most_aligned_tl) tls_search_space.remove(most_aligned_tl)
tls_in_bbox = [most_aligned_tl] tls_in_bbox = [most_aligned_tl]
last_bbox = None last_bbox = None
@ -639,57 +653,6 @@ class TextEdges2(object):
color="black" color="black"
) )
def plotFRHTableSearch(self, plot, debug_info):
if debug_info is None:
return
# Display a bbox per region
for region_str in debug_info["table_regions"] or []:
plot_annotated_bbox(
plot, bbox_from_str(region_str),
"region: ({region_str})".format(region_str=region_str),
"purple"
)
# Display a bbox per area
for area_str in debug_info["table_areas"] or []:
plot_annotated_bbox(
plot, bbox_from_str(area_str),
"area: ({area_str})".format(area_str=area_str), "pink"
)
for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
max_h_gap = bbox_search["max_h_gap"]
max_v_gap = bbox_search["max_v_gap"]
iterations = bbox_search["iterations"]
for iteration, bbox in enumerate(iterations):
final = iteration == len(iterations) - 1
plot.add_patch(
patches.Rectangle(
(bbox[0], bbox[1]),
bbox[2] - bbox[0], bbox[3] - bbox[1],
color="red",
linewidth=5 if final else 2,
fill=False
)
)
plot.text(
bbox[0],
bbox[1],
f"box #{box_id+1} / iter #{iteration}",
fontsize=12,
color="black",
verticalalignment="top",
bbox=dict(facecolor="orange", alpha=0.5)
)
plot.add_patch(
patches.Rectangle(
(bbox[0]-max_h_gap, bbox[1]-max_v_gap),
bbox[2] - bbox[0] + 2 * max_h_gap,
bbox[3] - bbox[1] + 2 * max_v_gap,
color="orange",
fill=False
)
)
class Hybrid(BaseParser): class Hybrid(BaseParser):
"""Hybrid method of parsing looks for spaces between text """Hybrid method of parsing looks for spaces between text
@ -738,7 +701,7 @@ class Hybrid(BaseParser):
flag_size=False, flag_size=False,
split_text=False, split_text=False,
strip_text="", strip_text="",
edge_tol=50, edge_tol=None,
row_tol=2, row_tol=2,
column_tol=0, column_tol=0,
debug=False, debug=False,
@ -754,6 +717,8 @@ class Hybrid(BaseParser):
debug=debug debug=debug
) )
self.columns = columns self.columns = columns
self.textedges = None
self._validate_columns() self._validate_columns()
self.edge_tol = edge_tol self.edge_tol = edge_tol
self.row_tol = row_tol self.row_tol = row_tol
@ -973,7 +938,11 @@ class Hybrid(BaseParser):
self.table_bbox = table_bbox self.table_bbox = table_bbox
return return
all_textlines = self.horizontal_text + self.vertical_text # Take all the textlines that are not just spaces
all_textlines = [
t for t in self.horizontal_text + self.vertical_text
if len(t.get_text().strip()) > 0
]
textlines = self._apply_regions_filter(all_textlines) textlines = self._apply_regions_filter(all_textlines)
textlines_processed = {} textlines_processed = {}
@ -996,8 +965,15 @@ class Hybrid(BaseParser):
debug_info_edges_searches.append( debug_info_edges_searches.append(
copy.deepcopy(self.textedges) copy.deepcopy(self.textedges)
) )
gaps_hv = self.textedges._compute_plausible_gaps()
if gaps_hv is None:
return None
if self.edge_tol is not None:
# edge_tol instructions override the calculated vertical gap
gaps_hv = (gaps_hv[0], self.edge_tol)
bbox = self.textedges._build_bbox_candidate( bbox = self.textedges._build_bbox_candidate(
debug_info_bboxes_searches gaps_hv,
debug_info=debug_info_bboxes_searches
) )
if bbox is None: if bbox is None:
break break
@ -1028,7 +1004,7 @@ class Hybrid(BaseParser):
bbox, bbox,
textlines, textlines,
cols_anchors, cols_anchors,
average_tl_height gaps_hv[1] # average_tl_height
) )
if self.debug_info is not None: if self.debug_info is not None:

View File

@ -11,20 +11,50 @@ else:
from .utils import bbox_from_str from .utils import bbox_from_str
def draw_labeled_bbox(
    ax, bbox, text,
    color="black", linewidth=3,
    linestyle="solid",
    label_pos="top,left"
):
    """Draw the outline of a box on the axes, with a text label next to it.

    Parameters
    ----------
    ax : matplotlib axes
        Only its ``add_patch`` and ``text`` methods are used.
    bbox : sequence of four numbers
        Two opposite corners of the box, as (x0, y0, x1, y1).
    text : str
        Label displayed next to the box.
    color : str, optional (default: "black")
        Color of the outline, also used as the label background.
    linewidth : number, optional (default: 3)
        Width of the outline.
    linestyle : str, optional (default: "solid")
        Style of the outline.
    label_pos : str, optional (default: "top,left")
        Anchor of the label, as "<vertical>,<horizontal>".
        Vertical is one of "top", "bottom", "center"; horizontal is one
        of "left", "right", "center". Any other value is treated as
        "center". Whitespace around each part is ignored.

    """
    ax.add_patch(
        patches.Rectangle(
            (bbox[0], bbox[1]),
            bbox[2] - bbox[0], bbox[3] - bbox[1],
            color=color,
            linewidth=linewidth, linestyle=linestyle,
            fill=False
        )
    )

    # Exactly two comma-separated parts are expected; anything else raises
    # ValueError on unpacking, as it always did.
    vlabel, hlabel = (part.strip() for part in label_pos.split(","))

    if vlabel == "top":
        y = max(bbox[1], bbox[3])
    elif vlabel == "bottom":
        y = min(bbox[1], bbox[3])
    else:
        # Unknown values fall back to the center, both for the anchor
        # coordinate and for the alignment lookup below.
        vlabel = "center"
        y = 0.5 * (bbox[1] + bbox[3])

    # We want to draw the label outside the box (above or below), so the
    # text is aligned on the side opposite to the edge it is anchored to.
    label_align_swap = {
        "top": "bottom",
        "bottom": "top",
        "center": "center"
    }
    vlabel_out_of_box = label_align_swap[vlabel]

    if hlabel == "right":
        x = max(bbox[0], bbox[2])
    elif hlabel == "left":
        x = min(bbox[0], bbox[2])
    else:
        # Same fallback horizontally: the value handed to matplotlib must
        # be a valid alignment name.
        hlabel = "center"
        x = 0.5 * (bbox[0] + bbox[2])

    ax.text(
        x, y,
        text,
        fontsize=12, color="black",
        verticalalignment=vlabel_out_of_box,
        horizontalalignment=hlabel,
        bbox=dict(facecolor=color, alpha=0.3)
    )
@ -46,21 +76,6 @@ def draw_pdf(table, ax, to_pdf_scale=True):
else: else:
ax.imshow(img) ax.imshow(img)
if table.debug_info:
# Display a bbox per region
for region_str in table.debug_info["table_regions"] or []:
draw_labeled_bbox(
ax, bbox_from_str(region_str),
"region: ({region_str})".format(region_str=region_str),
"purple"
)
# Display a bbox per area
for area_str in table.debug_info["table_areas"] or []:
draw_labeled_bbox(
ax, bbox_from_str(area_str),
"area: ({area_str})".format(area_str=area_str), "pink"
)
def draw_parse_constraints(table, ax): def draw_parse_constraints(table, ax):
"""Draw any user provided constraints (area, region, columns, etc) """Draw any user provided constraints (area, region, columns, etc)
@ -78,13 +93,20 @@ def draw_parse_constraints(table, ax):
draw_labeled_bbox( draw_labeled_bbox(
ax, bbox_from_str(region_str), ax, bbox_from_str(region_str),
"region: ({region_str})".format(region_str=region_str), "region: ({region_str})".format(region_str=region_str),
"purple" color="purple",
linestyle="dotted",
linewidth=1,
label_pos="bottom,right"
) )
# Display a bbox per area # Display a bbox per area
for area_str in table.debug_info["table_areas"] or []: for area_str in table.debug_info["table_areas"] or []:
draw_labeled_bbox( draw_labeled_bbox(
ax, bbox_from_str(area_str), ax, bbox_from_str(area_str),
"area: ({area_str})".format(area_str=area_str), "pink" "area: ({area_str})".format(area_str=area_str),
color="pink",
linestyle="dotted",
linewidth=1,
label_pos="bottom,right"
) )
@ -220,7 +242,9 @@ class PlotMethods(object):
ys.extend([t[1], t[3]]) ys.extend([t[1], t[3]])
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue" (t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue",
alpha=0.5
) )
) )
@ -329,3 +353,79 @@ class PlotMethods(object):
for h in horizontal: for h in horizontal:
ax.plot([h[0], h[2]], [h[1], h[3]]) ax.plot([h[0], h[2]], [h[1], h[3]])
return fig return fig
@staticmethod
def hybrid_table_search(table):
    """Generates a plot illustrating the steps of the hybrid table search.

    The page and the user-provided constraints are drawn first. When the
    table carries debug information, the plot also shows:

    - each iteration of every bbox search (red, the last iteration is
      thicker), surrounded by an orange rectangle grown by the
      max_h_gap / max_v_gap recorded for that search;
    - for every column search, the body of the table (orange) and the
      body expanded with its header (red).

    Parameters
    ----------
    table : camelot.core.Table

    Returns
    -------
    fig : matplotlib.fig.Figure

    """
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="equal")
    draw_pdf(table, ax)
    draw_parse_constraints(table, ax)

    debug_info = table.debug_info
    if debug_info is None:
        return fig

    # Entries may be missing or None; skip them instead of failing, as is
    # done for "table_regions" / "table_areas" elsewhere in this module.
    bboxes_searches = debug_info.get("bboxes_searches") or []
    col_searches = debug_info.get("col_searches") or []

    for box_id, bbox_search in enumerate(bboxes_searches):
        max_h_gap = bbox_search["max_h_gap"]
        max_v_gap = bbox_search["max_v_gap"]
        iterations = bbox_search["iterations"]
        last_iteration = len(iterations) - 1
        for iteration, bbox in enumerate(iterations):
            final = iteration == last_iteration
            draw_labeled_bbox(
                ax, bbox,
                "box #{box_id} / iter #{iteration}".format(
                    box_id=box_id,
                    iteration=iteration
                ),
                color="red",
                linewidth=5 if final else 2,
                label_pos="bottom,left"
            )
            # Area searched around this iteration's bbox: the bbox grown
            # by the maximum allowed gaps in every direction.
            ax.add_patch(
                patches.Rectangle(
                    (bbox[0] - max_h_gap, bbox[1] - max_v_gap),
                    bbox[2] - bbox[0] + 2 * max_h_gap,
                    bbox[3] - bbox[1] + 2 * max_v_gap,
                    color="orange",
                    fill=False
                )
            )

    for box_id, col_search in enumerate(col_searches):
        draw_labeled_bbox(
            ax, col_search["expanded_bbox"],
            "box body + header #{box_id}".format(
                box_id=box_id
            ),
            color="red",
            linewidth=4,
            label_pos="top,left"
        )
        draw_labeled_bbox(
            ax, col_search["core_bbox"],
            "box body #{box_id}".format(
                box_id=box_id
            ),
            color="orange",
            linewidth=2,
            label_pos="bottom,left"
        )

    return fig

View File

@ -1115,10 +1115,10 @@ def compare_tables(left, right):
differences_str = " and ".join(differences) differences_str = " and ".join(differences)
print( print(
"Right has {differences_str} than left " "Right has {differences_str} than left "
"{shape_right} vs {shape_left}".format( "{shape_left} vs {shape_right}".format(
differences_str=differences_str, differences_str=differences_str,
shape_left=[left.shape[0], left.shape[1]],
shape_right=[right.shape[0], right.shape[1]], shape_right=[right.shape[0], right.shape[1]],
shape_left=[left.shape[0], left.shape[1]]
) )
) )

View File

@ -2442,6 +2442,10 @@ data_stream_edge_tol = [
["period.", ""], ["period.", ""],
] ]
# The stream algorithm ends up including a footer, which hybrid correctly
# skips.
data_hybrid_edge_tol = data_stream_edge_tol[:-3]
data_lattice = [ data_lattice = [
[ [
"Cycle \nName", "Cycle \nName",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 105 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 98 KiB

After

Width:  |  Height:  |  Size: 100 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 98 KiB

After

Width:  |  Height:  |  Size: 100 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 197 KiB

After

Width:  |  Height:  |  Size: 197 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 103 KiB

View File

@ -7,6 +7,7 @@ from pandas.testing import assert_frame_equal
import camelot import camelot
from camelot.core import Table, TableList from camelot.core import Table, TableList
from camelot.utils import compare_tables
from camelot.__version__ import generate_version from camelot.__version__ import generate_version
from .data import * from .data import *
@ -193,7 +194,7 @@ def test_hybrid_table_regions():
# The "stream" test looks for a region in ["320,460,573,335"], which # The "stream" test looks for a region in ["320,460,573,335"], which
# should exclude the header. # should exclude the header.
tables = camelot.read_pdf( tables = camelot.read_pdf(
filename, flavor="hybrid", table_regions=["320,505,573,330"] filename, flavor="hybrid", table_regions=["320,335,573,505"]
) )
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
@ -248,7 +249,7 @@ def test_hybrid_strip_text():
def test_hybrid_edge_tol(): def test_hybrid_edge_tol():
df = pd.DataFrame(data_stream_edge_tol) df = pd.DataFrame(data_hybrid_edge_tol)
filename = os.path.join(testdir, "edge_tol.pdf") filename = os.path.join(testdir, "edge_tol.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500) tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)