Prettier plotting, improve gaps calculation
|
|
@ -22,25 +22,8 @@ from matplotlib import patches as patches
|
||||||
MAX_COL_SPREAD_IN_HEADER = 3
|
MAX_COL_SPREAD_IN_HEADER = 3
|
||||||
|
|
||||||
|
|
||||||
def plot_annotated_bbox(plot, bbox, text, rect_color):
|
|
||||||
plot.add_patch(
|
|
||||||
patches.Rectangle(
|
|
||||||
(bbox[0], bbox[1]),
|
|
||||||
bbox[2] - bbox[0], bbox[3] - bbox[1],
|
|
||||||
color="purple", linewidth=3,
|
|
||||||
fill=False
|
|
||||||
)
|
|
||||||
)
|
|
||||||
plot.text(
|
|
||||||
bbox[0], bbox[1],
|
|
||||||
text,
|
|
||||||
fontsize=12, color="black", verticalalignment="top",
|
|
||||||
bbox=dict(facecolor="purple", alpha=0.5)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
||||||
average_row_height):
|
max_v_gap):
|
||||||
"""The core algorithm is based on fairly strict alignment of text.
|
"""The core algorithm is based on fairly strict alignment of text.
|
||||||
It works ok for the table body, but might fail on tables' headers
|
It works ok for the table body, but might fail on tables' headers
|
||||||
since they tend to be in a different font, alignment (e.g. vertical),
|
since they tend to be in a different font, alignment (e.g. vertical),
|
||||||
|
|
@ -78,13 +61,13 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
||||||
all_above = []
|
all_above = []
|
||||||
for te in textlines:
|
for te in textlines:
|
||||||
# higher than the table, directly within its bounds
|
# higher than the table, directly within its bounds
|
||||||
if te.y0 > top and te.x0 > left and te.x1 < right:
|
if te.y0 > top and te.x0 >= left and te.x1 <= right:
|
||||||
all_above.append(te)
|
all_above.append(te)
|
||||||
if closest_above is None or closest_above.y0 > te.y0:
|
if closest_above is None or closest_above.y0 > te.y0:
|
||||||
closest_above = te
|
closest_above = te
|
||||||
|
|
||||||
if closest_above and \
|
if closest_above and \
|
||||||
closest_above.y0 < top + average_row_height:
|
closest_above.y0 < top + max_v_gap:
|
||||||
# b/ We have a candidate cell that is within the correct
|
# b/ We have a candidate cell that is within the correct
|
||||||
# vertical band, and directly above the table. Starting from
|
# vertical band, and directly above the table. Starting from
|
||||||
# this anchor, we list all the textlines within the same row.
|
# this anchor, we list all the textlines within the same row.
|
||||||
|
|
@ -475,37 +458,42 @@ class TextEdges2(object):
|
||||||
self._textlines_alignments = {}
|
self._textlines_alignments = {}
|
||||||
self._compute_alignment_counts()
|
self._compute_alignment_counts()
|
||||||
|
|
||||||
def _build_bbox_candidate(self, debug_info=None):
|
def _most_connected_textline(self):
|
||||||
""" Seed the process with the textline with the highest alignment
|
""" Retrieve the textline that is most connected across vertical and
|
||||||
score, then expand the bbox with textlines within threshold.
|
horizontal axes.
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Find the textline with the highest alignment score
|
||||||
|
return max(
|
||||||
|
self._textlines_alignments.keys(),
|
||||||
|
key=lambda textline:
|
||||||
|
self._textlines_alignments[textline].alignment_score(),
|
||||||
|
default=None
|
||||||
|
)
|
||||||
|
|
||||||
|
def _compute_plausible_gaps(self):
|
||||||
|
""" Evaluate plausible gaps between cells horizontally and vertically
|
||||||
|
based on the textlines aligned with the most connected textline.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
gaps_hv : tuple
|
||||||
|
(horizontal_gap, vertical_gap) in pdf coordinate space.
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
debug_info : array
|
|
||||||
Optional parameter array, in which to store extra information
|
|
||||||
to help later visualization of the table creation.
|
|
||||||
"""
|
"""
|
||||||
if self.max_rows <= 1 or self.max_cols <= 1:
|
if self.max_rows <= 1 or self.max_cols <= 1:
|
||||||
return None
|
return None
|
||||||
tls_search_space = list(self._textlines_alignments.keys())
|
|
||||||
|
|
||||||
def get_best_textline(textlines):
|
# Determine the textline that has the most combined
|
||||||
# Find the textline with the highest alignment score
|
# alignments across horizontal and vertical axis.
|
||||||
return max(
|
# It will serve as a reference axis along which to collect the average
|
||||||
textlines,
|
# spacing between rows/cols.
|
||||||
key=lambda textline:
|
most_aligned_tl = self._most_connected_textline()
|
||||||
self._textlines_alignments[textline].alignment_score(),
|
most_aligned_coords = TextEdges2.get_textline_coords(
|
||||||
default=None
|
most_aligned_tl)
|
||||||
)
|
|
||||||
|
|
||||||
# First, determine the textline that has the most combined alignments
|
# Retrieve the list of textlines it's aligned with, across both
|
||||||
# across horizontal and vertical axis.
|
# axes
|
||||||
# It will serve both as a starting point for the table boundary search,
|
|
||||||
# and as a way to estimate the average spacing between rows/cols.
|
|
||||||
most_aligned_tl = get_best_textline(tls_search_space)
|
|
||||||
most_aligned_coords = TextEdges2.get_textline_coords(most_aligned_tl)
|
|
||||||
|
|
||||||
# Retrieve the list of textlines it's aligned with, across both axis
|
|
||||||
best_alignment = self._textlines_alignments[most_aligned_tl]
|
best_alignment = self._textlines_alignments[most_aligned_tl]
|
||||||
ref_h_edge_name = best_alignment.max_h_edge_name()
|
ref_h_edge_name = best_alignment.max_h_edge_name()
|
||||||
ref_v_edge_name = best_alignment.max_v_edge_name()
|
ref_v_edge_name = best_alignment.max_v_edge_name()
|
||||||
|
|
@ -544,9 +532,30 @@ class TextEdges2(object):
|
||||||
return None
|
return None
|
||||||
percentile = 75
|
percentile = 75
|
||||||
gaps_hv = (
|
gaps_hv = (
|
||||||
np.percentile(h_gaps, percentile),
|
2.0 * np.percentile(h_gaps, percentile),
|
||||||
np.percentile(v_gaps, percentile)
|
2.0 * np.percentile(v_gaps, percentile)
|
||||||
)
|
)
|
||||||
|
return gaps_hv
|
||||||
|
|
||||||
|
def _build_bbox_candidate(self, gaps_hv, debug_info=None):
|
||||||
|
""" Seed the process with the textline with the highest alignment
|
||||||
|
score, then expand the bbox with textlines within threshold.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
gaps_hv : tuple
|
||||||
|
The maximum distance allowed to consider surrounding lines/columns
|
||||||
|
as part of the same table.
|
||||||
|
debug_info : array (optional)
|
||||||
|
Optional parameter array, in which to store extra information
|
||||||
|
to help later visualization of the table creation.
|
||||||
|
"""
|
||||||
|
# First, determine the textline that has the most combined
|
||||||
|
# alignments across horizontal and vertical axis.
|
||||||
|
# It will serve as the starting point for the table boundary
|
||||||
|
# search. The spacing thresholds between rows/cols are not estimated
|
||||||
|
# here: they are supplied by the caller through gaps_hv.
|
||||||
|
most_aligned_tl = self._most_connected_textline()
|
||||||
|
|
||||||
# Calculate the 75th percentile of the horizontal/vertical
|
# Calculate the 75th percentile of the horizontal/vertical
|
||||||
# gaps between textlines. Use this as a reference for a threshold
|
# gaps between textlines. Use this as a reference for a threshold
|
||||||
|
|
@ -555,7 +564,7 @@ class TextEdges2(object):
|
||||||
# gaps_hv = self._calculate_gaps_thresholds(75)
|
# gaps_hv = self._calculate_gaps_thresholds(75)
|
||||||
# if (gaps_hv[0] is None or gaps_hv[1] is None):
|
# if (gaps_hv[0] is None or gaps_hv[1] is None):
|
||||||
# return None
|
# return None
|
||||||
max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3
|
max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
|
||||||
|
|
||||||
if debug_info is not None:
|
if debug_info is not None:
|
||||||
# Store debug info
|
# Store debug info
|
||||||
|
|
@ -571,6 +580,11 @@ class TextEdges2(object):
|
||||||
MINIMUM_TEXTLINES_IN_TABLE = 6
|
MINIMUM_TEXTLINES_IN_TABLE = 6
|
||||||
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
||||||
most_aligned_tl.x1, most_aligned_tl.y1)
|
most_aligned_tl.x1, most_aligned_tl.y1)
|
||||||
|
|
||||||
|
# For the body of the table, we only consider cells with alignments
|
||||||
|
# on both axes.
|
||||||
|
tls_search_space = list(self._textlines_alignments.keys())
|
||||||
|
# tls_search_space = []
|
||||||
tls_search_space.remove(most_aligned_tl)
|
tls_search_space.remove(most_aligned_tl)
|
||||||
tls_in_bbox = [most_aligned_tl]
|
tls_in_bbox = [most_aligned_tl]
|
||||||
last_bbox = None
|
last_bbox = None
|
||||||
|
|
@ -639,57 +653,6 @@ class TextEdges2(object):
|
||||||
color="black"
|
color="black"
|
||||||
)
|
)
|
||||||
|
|
||||||
def plotFRHTableSearch(self, plot, debug_info):
|
|
||||||
if debug_info is None:
|
|
||||||
return
|
|
||||||
# Display a bbox per region
|
|
||||||
for region_str in debug_info["table_regions"] or []:
|
|
||||||
plot_annotated_bbox(
|
|
||||||
plot, bbox_from_str(region_str),
|
|
||||||
"region: ({region_str})".format(region_str=region_str),
|
|
||||||
"purple"
|
|
||||||
)
|
|
||||||
# Display a bbox per area
|
|
||||||
for area_str in debug_info["table_areas"] or []:
|
|
||||||
plot_annotated_bbox(
|
|
||||||
plot, bbox_from_str(area_str),
|
|
||||||
"area: ({area_str})".format(area_str=area_str), "pink"
|
|
||||||
)
|
|
||||||
for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
|
|
||||||
max_h_gap = bbox_search["max_h_gap"]
|
|
||||||
max_v_gap = bbox_search["max_v_gap"]
|
|
||||||
iterations = bbox_search["iterations"]
|
|
||||||
for iteration, bbox in enumerate(iterations):
|
|
||||||
final = iteration == len(iterations) - 1
|
|
||||||
plot.add_patch(
|
|
||||||
patches.Rectangle(
|
|
||||||
(bbox[0], bbox[1]),
|
|
||||||
bbox[2] - bbox[0], bbox[3] - bbox[1],
|
|
||||||
color="red",
|
|
||||||
linewidth=5 if final else 2,
|
|
||||||
fill=False
|
|
||||||
)
|
|
||||||
)
|
|
||||||
plot.text(
|
|
||||||
bbox[0],
|
|
||||||
bbox[1],
|
|
||||||
f"box #{box_id+1} / iter #{iteration}",
|
|
||||||
fontsize=12,
|
|
||||||
color="black",
|
|
||||||
verticalalignment="top",
|
|
||||||
bbox=dict(facecolor="orange", alpha=0.5)
|
|
||||||
)
|
|
||||||
|
|
||||||
plot.add_patch(
|
|
||||||
patches.Rectangle(
|
|
||||||
(bbox[0]-max_h_gap, bbox[1]-max_v_gap),
|
|
||||||
bbox[2] - bbox[0] + 2 * max_h_gap,
|
|
||||||
bbox[3] - bbox[1] + 2 * max_v_gap,
|
|
||||||
color="orange",
|
|
||||||
fill=False
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Hybrid(BaseParser):
|
class Hybrid(BaseParser):
|
||||||
"""Hybrid method of parsing looks for spaces between text
|
"""Hybrid method of parsing looks for spaces between text
|
||||||
|
|
@ -738,7 +701,7 @@ class Hybrid(BaseParser):
|
||||||
flag_size=False,
|
flag_size=False,
|
||||||
split_text=False,
|
split_text=False,
|
||||||
strip_text="",
|
strip_text="",
|
||||||
edge_tol=50,
|
edge_tol=None,
|
||||||
row_tol=2,
|
row_tol=2,
|
||||||
column_tol=0,
|
column_tol=0,
|
||||||
debug=False,
|
debug=False,
|
||||||
|
|
@ -754,6 +717,8 @@ class Hybrid(BaseParser):
|
||||||
debug=debug
|
debug=debug
|
||||||
)
|
)
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
|
self.textedges = None
|
||||||
|
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
self.edge_tol = edge_tol
|
self.edge_tol = edge_tol
|
||||||
self.row_tol = row_tol
|
self.row_tol = row_tol
|
||||||
|
|
@ -973,7 +938,11 @@ class Hybrid(BaseParser):
|
||||||
self.table_bbox = table_bbox
|
self.table_bbox = table_bbox
|
||||||
return
|
return
|
||||||
|
|
||||||
all_textlines = self.horizontal_text + self.vertical_text
|
# Take all the textlines that are not just spaces
|
||||||
|
all_textlines = [
|
||||||
|
t for t in self.horizontal_text + self.vertical_text
|
||||||
|
if len(t.get_text().strip()) > 0
|
||||||
|
]
|
||||||
textlines = self._apply_regions_filter(all_textlines)
|
textlines = self._apply_regions_filter(all_textlines)
|
||||||
|
|
||||||
textlines_processed = {}
|
textlines_processed = {}
|
||||||
|
|
@ -996,8 +965,15 @@ class Hybrid(BaseParser):
|
||||||
debug_info_edges_searches.append(
|
debug_info_edges_searches.append(
|
||||||
copy.deepcopy(self.textedges)
|
copy.deepcopy(self.textedges)
|
||||||
)
|
)
|
||||||
|
gaps_hv = self.textedges._compute_plausible_gaps()
|
||||||
|
if gaps_hv is None:
|
||||||
|
return None
|
||||||
|
if self.edge_tol is not None:
|
||||||
|
# edge_tol instructions override the calculated vertical gap
|
||||||
|
gaps_hv = (gaps_hv[0], self.edge_tol)
|
||||||
bbox = self.textedges._build_bbox_candidate(
|
bbox = self.textedges._build_bbox_candidate(
|
||||||
debug_info_bboxes_searches
|
gaps_hv,
|
||||||
|
debug_info=debug_info_bboxes_searches
|
||||||
)
|
)
|
||||||
if bbox is None:
|
if bbox is None:
|
||||||
break
|
break
|
||||||
|
|
@ -1028,7 +1004,7 @@ class Hybrid(BaseParser):
|
||||||
bbox,
|
bbox,
|
||||||
textlines,
|
textlines,
|
||||||
cols_anchors,
|
cols_anchors,
|
||||||
average_tl_height
|
gaps_hv[1] # average_tl_height
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.debug_info is not None:
|
if self.debug_info is not None:
|
||||||
|
|
|
||||||
|
|
@ -11,20 +11,50 @@ else:
|
||||||
from .utils import bbox_from_str
|
from .utils import bbox_from_str
|
||||||
|
|
||||||
|
|
||||||
def draw_labeled_bbox(ax, bbox, text, rect_color):
|
def draw_labeled_bbox(
|
||||||
|
ax, bbox, text,
|
||||||
|
color="black", linewidth=3,
|
||||||
|
linestyle="solid",
|
||||||
|
label_pos="top,left"
|
||||||
|
):
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle(
|
||||||
(bbox[0], bbox[1]),
|
(bbox[0], bbox[1]),
|
||||||
bbox[2] - bbox[0], bbox[3] - bbox[1],
|
bbox[2] - bbox[0], bbox[3] - bbox[1],
|
||||||
color="purple", linewidth=3,
|
color=color,
|
||||||
|
linewidth=linewidth, linestyle=linestyle,
|
||||||
fill=False
|
fill=False
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
vlabel, hlabel = label_pos.split(",")
|
||||||
|
if (vlabel == "top"):
|
||||||
|
y = max(bbox[1], bbox[3])
|
||||||
|
elif (vlabel == "bottom"):
|
||||||
|
y = min(bbox[1], bbox[3])
|
||||||
|
else:
|
||||||
|
y = 0.5 * (bbox[1] + bbox[3])
|
||||||
|
|
||||||
|
# We want to draw the label outside the box (above or below)
|
||||||
|
label_align_swap = {
|
||||||
|
"top": "bottom",
|
||||||
|
"bottom": "top",
|
||||||
|
"center": "center"
|
||||||
|
}
|
||||||
|
vlabel_out_of_box = label_align_swap[vlabel]
|
||||||
|
if (hlabel == "right"):
|
||||||
|
x = max(bbox[0], bbox[2])
|
||||||
|
elif (hlabel == "left"):
|
||||||
|
x = min(bbox[0], bbox[2])
|
||||||
|
else:
|
||||||
|
x = 0.5 * (bbox[0] + bbox[2])
|
||||||
ax.text(
|
ax.text(
|
||||||
bbox[0], bbox[1],
|
x, y,
|
||||||
text,
|
text,
|
||||||
fontsize=12, color="black", verticalalignment="top",
|
fontsize=12, color="black",
|
||||||
bbox=dict(facecolor="purple", alpha=0.5)
|
verticalalignment=vlabel_out_of_box,
|
||||||
|
horizontalalignment=hlabel,
|
||||||
|
bbox=dict(facecolor=color, alpha=0.3)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -46,21 +76,6 @@ def draw_pdf(table, ax, to_pdf_scale=True):
|
||||||
else:
|
else:
|
||||||
ax.imshow(img)
|
ax.imshow(img)
|
||||||
|
|
||||||
if table.debug_info:
|
|
||||||
# Display a bbox per region
|
|
||||||
for region_str in table.debug_info["table_regions"] or []:
|
|
||||||
draw_labeled_bbox(
|
|
||||||
ax, bbox_from_str(region_str),
|
|
||||||
"region: ({region_str})".format(region_str=region_str),
|
|
||||||
"purple"
|
|
||||||
)
|
|
||||||
# Display a bbox per area
|
|
||||||
for area_str in table.debug_info["table_areas"] or []:
|
|
||||||
draw_labeled_bbox(
|
|
||||||
ax, bbox_from_str(area_str),
|
|
||||||
"area: ({area_str})".format(area_str=area_str), "pink"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def draw_parse_constraints(table, ax):
|
def draw_parse_constraints(table, ax):
|
||||||
"""Draw any user provided constraints (area, region, columns, etc)
|
"""Draw any user provided constraints (area, region, columns, etc)
|
||||||
|
|
@ -78,13 +93,20 @@ def draw_parse_constraints(table, ax):
|
||||||
draw_labeled_bbox(
|
draw_labeled_bbox(
|
||||||
ax, bbox_from_str(region_str),
|
ax, bbox_from_str(region_str),
|
||||||
"region: ({region_str})".format(region_str=region_str),
|
"region: ({region_str})".format(region_str=region_str),
|
||||||
"purple"
|
color="purple",
|
||||||
|
linestyle="dotted",
|
||||||
|
linewidth=1,
|
||||||
|
label_pos="bottom,right"
|
||||||
)
|
)
|
||||||
# Display a bbox per area
|
# Display a bbox per area
|
||||||
for area_str in table.debug_info["table_areas"] or []:
|
for area_str in table.debug_info["table_areas"] or []:
|
||||||
draw_labeled_bbox(
|
draw_labeled_bbox(
|
||||||
ax, bbox_from_str(area_str),
|
ax, bbox_from_str(area_str),
|
||||||
"area: ({area_str})".format(area_str=area_str), "pink"
|
"area: ({area_str})".format(area_str=area_str),
|
||||||
|
color="pink",
|
||||||
|
linestyle="dotted",
|
||||||
|
linewidth=1,
|
||||||
|
label_pos="bottom,right"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -220,7 +242,9 @@ class PlotMethods(object):
|
||||||
ys.extend([t[1], t[3]])
|
ys.extend([t[1], t[3]])
|
||||||
ax.add_patch(
|
ax.add_patch(
|
||||||
patches.Rectangle(
|
patches.Rectangle(
|
||||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
|
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
||||||
|
color="blue",
|
||||||
|
alpha=0.5
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -329,3 +353,79 @@ class PlotMethods(object):
|
||||||
for h in horizontal:
|
for h in horizontal:
|
||||||
ax.plot([h[0], h[2]], [h[1], h[3]])
|
ax.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def hybrid_table_search(table):
|
||||||
|
"""Generates a plot illustrating the steps of the hybrid table search.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table : camelot.core.Table
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
fig : matplotlib.figure.Figure
|
||||||
|
|
||||||
|
"""
|
||||||
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
|
draw_pdf(table, ax)
|
||||||
|
draw_parse_constraints(table, ax)
|
||||||
|
|
||||||
|
if table.debug_info is None:
|
||||||
|
return fig
|
||||||
|
debug_info = table.debug_info
|
||||||
|
for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
|
||||||
|
max_h_gap = bbox_search["max_h_gap"]
|
||||||
|
max_v_gap = bbox_search["max_v_gap"]
|
||||||
|
iterations = bbox_search["iterations"]
|
||||||
|
for iteration, bbox in enumerate(iterations):
|
||||||
|
final = iteration == len(iterations) - 1
|
||||||
|
|
||||||
|
draw_labeled_bbox(
|
||||||
|
ax, bbox,
|
||||||
|
"box #{box_id} / iter #{iteration}".format(
|
||||||
|
box_id=box_id,
|
||||||
|
iteration=iteration
|
||||||
|
),
|
||||||
|
color="red",
|
||||||
|
linewidth=5 if final else 2,
|
||||||
|
label_pos="bottom,left"
|
||||||
|
)
|
||||||
|
|
||||||
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(bbox[0]-max_h_gap, bbox[1]-max_v_gap),
|
||||||
|
bbox[2] - bbox[0] + 2 * max_h_gap,
|
||||||
|
bbox[3] - bbox[1] + 2 * max_v_gap,
|
||||||
|
color="orange",
|
||||||
|
fill=False
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
for box_id, col_search in enumerate(debug_info["col_searches"]):
|
||||||
|
draw_labeled_bbox(
|
||||||
|
ax, col_search["expanded_bbox"],
|
||||||
|
"box body + header #{box_id}".format(
|
||||||
|
box_id=box_id
|
||||||
|
),
|
||||||
|
color="red",
|
||||||
|
linewidth=4,
|
||||||
|
label_pos="top,left"
|
||||||
|
)
|
||||||
|
draw_labeled_bbox(
|
||||||
|
ax, col_search["core_bbox"],
|
||||||
|
"box body #{box_id}".format(
|
||||||
|
box_id=box_id
|
||||||
|
),
|
||||||
|
color="orange",
|
||||||
|
linewidth=2,
|
||||||
|
label_pos="bottom,left"
|
||||||
|
)
|
||||||
|
# self.debug_info["col_searches"].append({
|
||||||
|
# "core_bbox": bbox,
|
||||||
|
# "cols_anchors": cols_anchors,
|
||||||
|
# "expanded_bbox": expanded_bbox
|
||||||
|
# })
|
||||||
|
|
||||||
|
return fig
|
||||||
|
|
|
||||||
|
|
@ -1115,10 +1115,10 @@ def compare_tables(left, right):
|
||||||
differences_str = " and ".join(differences)
|
differences_str = " and ".join(differences)
|
||||||
print(
|
print(
|
||||||
"Right has {differences_str} than left "
|
"Right has {differences_str} than left "
|
||||||
"{shape_right} vs {shape_left}".format(
|
"{shape_left} vs {shape_right}".format(
|
||||||
differences_str=differences_str,
|
differences_str=differences_str,
|
||||||
|
shape_left=[left.shape[0], left.shape[1]],
|
||||||
shape_right=[right.shape[0], right.shape[1]],
|
shape_right=[right.shape[0], right.shape[1]],
|
||||||
shape_left=[left.shape[0], left.shape[1]]
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2442,6 +2442,10 @@ data_stream_edge_tol = [
|
||||||
["period.", ""],
|
["period.", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# The stream algorithm ends up including a footer, which hybrid correctly
|
||||||
|
# skips.
|
||||||
|
data_hybrid_edge_tol = data_stream_edge_tol[:-3]
|
||||||
|
|
||||||
data_lattice = [
|
data_lattice = [
|
||||||
[
|
[
|
||||||
"Cycle \nName",
|
"Cycle \nName",
|
||||||
|
|
|
||||||
|
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 105 KiB |
|
Before Width: | Height: | Size: 98 KiB After Width: | Height: | Size: 100 KiB |
|
Before Width: | Height: | Size: 98 KiB After Width: | Height: | Size: 100 KiB |
|
Before Width: | Height: | Size: 197 KiB After Width: | Height: | Size: 197 KiB |
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 103 KiB |
|
|
@ -7,6 +7,7 @@ from pandas.testing import assert_frame_equal
|
||||||
|
|
||||||
import camelot
|
import camelot
|
||||||
from camelot.core import Table, TableList
|
from camelot.core import Table, TableList
|
||||||
|
from camelot.utils import compare_tables
|
||||||
from camelot.__version__ import generate_version
|
from camelot.__version__ import generate_version
|
||||||
|
|
||||||
from .data import *
|
from .data import *
|
||||||
|
|
@ -193,7 +194,7 @@ def test_hybrid_table_regions():
|
||||||
# The "stream" test looks for a region in ["320,460,573,335"], which
|
# The "stream" test looks for a region in ["320,460,573,335"], which
|
||||||
# should exclude the header.
|
# should exclude the header.
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="hybrid", table_regions=["320,505,573,330"]
|
filename, flavor="hybrid", table_regions=["320,335,573,505"]
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
@ -248,7 +249,7 @@ def test_hybrid_strip_text():
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_edge_tol():
|
def test_hybrid_edge_tol():
|
||||||
df = pd.DataFrame(data_stream_edge_tol)
|
df = pd.DataFrame(data_hybrid_edge_tol)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "edge_tol.pdf")
|
filename = os.path.join(testdir, "edge_tol.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)
|
tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)
|
||||||
|
|
|
||||||