diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 9877867..b1629c2 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -22,25 +22,8 @@ from matplotlib import patches as patches MAX_COL_SPREAD_IN_HEADER = 3 -def plot_annotated_bbox(plot, bbox, text, rect_color): - plot.add_patch( - patches.Rectangle( - (bbox[0], bbox[1]), - bbox[2] - bbox[0], bbox[3] - bbox[1], - color="purple", linewidth=3, - fill=False - ) - ) - plot.text( - bbox[0], bbox[1], - text, - fontsize=12, color="black", verticalalignment="top", - bbox=dict(facecolor="purple", alpha=0.5) - ) - - def todo_move_me_expand_area_for_header(area, textlines, col_anchors, - average_row_height): + max_v_gap): """The core algorithm is based on fairly strict alignment of text. It works ok for the table body, but might fail on tables' headers since they tend to be in a different font, alignment (e.g. vertical), @@ -78,13 +61,13 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors, all_above = [] for te in textlines: # higher than the table, directly within its bounds - if te.y0 > top and te.x0 > left and te.x1 < right: + if te.y0 > top and te.x0 >= left and te.x1 <= right: all_above.append(te) if closest_above is None or closest_above.y0 > te.y0: closest_above = te if closest_above and \ - closest_above.y0 < top + average_row_height: + closest_above.y0 < top + max_v_gap: # b/ We have a candidate cell that is within the correct # vertical band, and directly above the table. Starting from # this anchor, we list all the textlines within the same row. @@ -475,37 +458,42 @@ class TextEdges2(object): self._textlines_alignments = {} self._compute_alignment_counts() - def _build_bbox_candidate(self, debug_info=None): - """ Seed the process with the textline with the highest alignment - score, then expand the bbox with textlines within threshold. 
+ def _most_connected_textline(self): + """ Retrieve the textline that is most connected across vertical and + horizontal axis. + + """ + # Find the textline with the highest alignment score + return max( + self._textlines_alignments.keys(), + key=lambda textline: + self._textlines_alignments[textline].alignment_score(), + default=None + ) + + def _compute_plausible_gaps(self): + """ Evaluate plausible gaps between cells horizontally and vertically + based on the textlines aligned with the most connected textline. + + Returns + ------- + gaps_hv : tuple + (horizontal_gap, vertical_gap) in pdf coordinate space. - Parameters - ---------- - debug_info : array - Optional parameter array, in which to store extra information - to help later visualization of the table creation. """ if self.max_rows <= 1 or self.max_cols <= 1: return None - tls_search_space = list(self._textlines_alignments.keys()) - def get_best_textline(textlines): - # Find the textline with the highest alignment score - return max( - textlines, - key=lambda textline: - self._textlines_alignments[textline].alignment_score(), - default=None - ) + # Determine the textline that has the most combined + # alignments across horizontal and vertical axis. + # It will serve as a reference axis along which to collect the average + # spacing between rows/cols. + most_aligned_tl = self._most_connected_textline() + most_aligned_coords = TextEdges2.get_textline_coords( + most_aligned_tl) - # First, determine the textline that has the most combined alignments - # across horizontal and vertical axis. - # It will serve both as a starting point for the table boundary search, - # and as a way to estimate the average spacing between rows/cols. 
- most_aligned_tl = get_best_textline(tls_search_space) - most_aligned_coords = TextEdges2.get_textline_coords(most_aligned_tl) - - # Retrieve the list of textlines it's aligned with, across both axis + # Retrieve the list of textlines it's aligned with, across both + # axis best_alignment = self._textlines_alignments[most_aligned_tl] ref_h_edge_name = best_alignment.max_h_edge_name() ref_v_edge_name = best_alignment.max_v_edge_name() @@ -544,9 +532,30 @@ class TextEdges2(object): return None percentile = 75 gaps_hv = ( - np.percentile(h_gaps, percentile), - np.percentile(v_gaps, percentile) + 2.0 * np.percentile(h_gaps, percentile), + 2.0 * np.percentile(v_gaps, percentile) ) + return gaps_hv + + def _build_bbox_candidate(self, gaps_hv, debug_info=None): + """ Seed the process with the textline with the highest alignment + score, then expand the bbox with textlines within threshold. + + Parameters + ---------- + gaps_hv : tuple + The maximum distance allowed to consider surrounding lines/columns + as part of the same table. + debug_info : array (optional) + Optional parameter array, in which to store extra information + to help later visualization of the table creation. + """ + # First, determine the textline that has the most combined + # alignments across horizontal and vertical axis. + # It will serve both as a starting point for the table boundary + # search, and as a way to estimate the average spacing between + # rows/cols. + most_aligned_tl = self._most_connected_textline() # Calculate the 75th percentile of the horizontal/vertical # gaps between textlines. 
Use this as a reference for a threshold @@ -555,7 +564,7 @@ class TextEdges2(object): # gaps_hv = self._calculate_gaps_thresholds(75) # if (gaps_hv[0] is None or gaps_hv[1] is None): # return None - max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3 + max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1] if debug_info is not None: # Store debug info @@ -571,6 +580,11 @@ class TextEdges2(object): MINIMUM_TEXTLINES_IN_TABLE = 6 bbox = (most_aligned_tl.x0, most_aligned_tl.y0, most_aligned_tl.x1, most_aligned_tl.y1) + + # For the body of the table, we only consider cells with alignments + # on both axis. + tls_search_space = list(self._textlines_alignments.keys()) + # tls_search_space = [] tls_search_space.remove(most_aligned_tl) tls_in_bbox = [most_aligned_tl] last_bbox = None @@ -639,57 +653,6 @@ class TextEdges2(object): color="black" ) - def plotFRHTableSearch(self, plot, debug_info): - if debug_info is None: - return - # Display a bbox per region - for region_str in debug_info["table_regions"] or []: - plot_annotated_bbox( - plot, bbox_from_str(region_str), - "region: ({region_str})".format(region_str=region_str), - "purple" - ) - # Display a bbox per area - for area_str in debug_info["table_areas"] or []: - plot_annotated_bbox( - plot, bbox_from_str(area_str), - "area: ({area_str})".format(area_str=area_str), "pink" - ) - for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]): - max_h_gap = bbox_search["max_h_gap"] - max_v_gap = bbox_search["max_v_gap"] - iterations = bbox_search["iterations"] - for iteration, bbox in enumerate(iterations): - final = iteration == len(iterations) - 1 - plot.add_patch( - patches.Rectangle( - (bbox[0], bbox[1]), - bbox[2] - bbox[0], bbox[3] - bbox[1], - color="red", - linewidth=5 if final else 2, - fill=False - ) - ) - plot.text( - bbox[0], - bbox[1], - f"box #{box_id+1} / iter #{iteration}", - fontsize=12, - color="black", - verticalalignment="top", - bbox=dict(facecolor="orange", alpha=0.5) - ) - - plot.add_patch( - 
patches.Rectangle( - (bbox[0]-max_h_gap, bbox[1]-max_v_gap), - bbox[2] - bbox[0] + 2 * max_h_gap, - bbox[3] - bbox[1] + 2 * max_v_gap, - color="orange", - fill=False - ) - ) - class Hybrid(BaseParser): """Hybrid method of parsing looks for spaces between text @@ -738,7 +701,7 @@ class Hybrid(BaseParser): flag_size=False, split_text=False, strip_text="", - edge_tol=50, + edge_tol=None, row_tol=2, column_tol=0, debug=False, @@ -754,6 +717,8 @@ class Hybrid(BaseParser): debug=debug ) self.columns = columns + self.textedges = None + self._validate_columns() self.edge_tol = edge_tol self.row_tol = row_tol @@ -973,7 +938,11 @@ class Hybrid(BaseParser): self.table_bbox = table_bbox return - all_textlines = self.horizontal_text + self.vertical_text + # Take all the textlines that are not just spaces + all_textlines = [ + t for t in self.horizontal_text + self.vertical_text + if len(t.get_text().strip()) > 0 + ] textlines = self._apply_regions_filter(all_textlines) textlines_processed = {} @@ -996,8 +965,15 @@ class Hybrid(BaseParser): debug_info_edges_searches.append( copy.deepcopy(self.textedges) ) + gaps_hv = self.textedges._compute_plausible_gaps() + if gaps_hv is None: + return None + if self.edge_tol is not None: + # edge_tol instructions override the calculated vertical gap + gaps_hv = (gaps_hv[0], self.edge_tol) bbox = self.textedges._build_bbox_candidate( - debug_info_bboxes_searches + gaps_hv, + debug_info=debug_info_bboxes_searches ) if bbox is None: break @@ -1028,7 +1004,7 @@ class Hybrid(BaseParser): bbox, textlines, cols_anchors, - average_tl_height + gaps_hv[1] # average_tl_height ) if self.debug_info is not None: diff --git a/camelot/plotting.py b/camelot/plotting.py index a7f249f..9a7f552 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -11,20 +11,50 @@ else: from .utils import bbox_from_str -def draw_labeled_bbox(ax, bbox, text, rect_color): +def draw_labeled_bbox( + ax, bbox, text, + color="black", linewidth=3, + linestyle="solid", + 
label_pos="top,left" +): ax.add_patch( patches.Rectangle( (bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1], - color="purple", linewidth=3, + color=color, + linewidth=linewidth, linestyle=linestyle, fill=False ) ) + + vlabel, hlabel = label_pos.split(",") + if (vlabel == "top"): + y = max(bbox[1], bbox[3]) + elif (vlabel == "bottom"): + y = min(bbox[1], bbox[3]) + else: + y = 0.5 * (bbox[1] + bbox[3]) + + # We want to draw the label outside the box (above or below) + label_align_swap = { + "top": "bottom", + "bottom": "top", + "center": "center" + } + vlabel_out_of_box = label_align_swap[vlabel] + if (hlabel == "right"): + x = max(bbox[0], bbox[2]) + elif (hlabel == "left"): + x = min(bbox[0], bbox[2]) + else: + x = 0.5 * (bbox[0] + bbox[2]) ax.text( - bbox[0], bbox[1], + x, y, text, - fontsize=12, color="black", verticalalignment="top", - bbox=dict(facecolor="purple", alpha=0.5) + fontsize=12, color="black", + verticalalignment=vlabel_out_of_box, + horizontalalignment=hlabel, + bbox=dict(facecolor=color, alpha=0.3) ) @@ -46,21 +76,6 @@ def draw_pdf(table, ax, to_pdf_scale=True): else: ax.imshow(img) - if table.debug_info: - # Display a bbox per region - for region_str in table.debug_info["table_regions"] or []: - draw_labeled_bbox( - ax, bbox_from_str(region_str), - "region: ({region_str})".format(region_str=region_str), - "purple" - ) - # Display a bbox per area - for area_str in table.debug_info["table_areas"] or []: - draw_labeled_bbox( - ax, bbox_from_str(area_str), - "area: ({area_str})".format(area_str=area_str), "pink" - ) - def draw_parse_constraints(table, ax): """Draw any user provided constraints (area, region, columns, etc) @@ -78,13 +93,20 @@ def draw_parse_constraints(table, ax): draw_labeled_bbox( ax, bbox_from_str(region_str), "region: ({region_str})".format(region_str=region_str), - "purple" + color="purple", + linestyle="dotted", + linewidth=1, + label_pos="bottom,right" ) # Display a bbox per area for area_str in 
table.debug_info["table_areas"] or []: draw_labeled_bbox( ax, bbox_from_str(area_str), - "area: ({area_str})".format(area_str=area_str), "pink" + "area: ({area_str})".format(area_str=area_str), + color="pink", + linestyle="dotted", + linewidth=1, + label_pos="bottom,right" ) @@ -220,7 +242,9 @@ class PlotMethods(object): ys.extend([t[1], t[3]]) ax.add_patch( patches.Rectangle( - (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue" + (t[0], t[1]), t[2] - t[0], t[3] - t[1], + color="blue", + alpha=0.5 ) ) @@ -329,3 +353,79 @@ class PlotMethods(object): for h in horizontal: ax.plot([h[0], h[2]], [h[1], h[3]]) return fig + + @staticmethod + def hybrid_table_search(table): + """Generates a plot illustrating the steps of the hybrid table search. + + Parameters + ---------- + table : camelot.core.Table + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect="equal") + draw_pdf(table, ax) + draw_parse_constraints(table, ax) + + if table.debug_info is None: + return fig + debug_info = table.debug_info + for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]): + max_h_gap = bbox_search["max_h_gap"] + max_v_gap = bbox_search["max_v_gap"] + iterations = bbox_search["iterations"] + for iteration, bbox in enumerate(iterations): + final = iteration == len(iterations) - 1 + + draw_labeled_bbox( + ax, bbox, + "box #{box_id} / iter #{iteration}".format( + box_id=box_id, + iteration=iteration + ), + color="red", + linewidth=5 if final else 2, + label_pos="bottom,left" + ) + + ax.add_patch( + patches.Rectangle( + (bbox[0]-max_h_gap, bbox[1]-max_v_gap), + bbox[2] - bbox[0] + 2 * max_h_gap, + bbox[3] - bbox[1] + 2 * max_v_gap, + color="orange", + fill=False + ) + ) + + for box_id, col_search in enumerate(debug_info["col_searches"]): + draw_labeled_bbox( + ax, col_search["expanded_bbox"], + "box body + header #{box_id}".format( + box_id=box_id + ), + color="red", + linewidth=4, + label_pos="top,left" + ) + 
draw_labeled_bbox( + ax, col_search["core_bbox"], + "box body #{box_id}".format( + box_id=box_id + ), + color="orange", + linewidth=2, + label_pos="bottom,left" + ) + # self.debug_info["col_searches"].append({ + # "core_bbox": bbox, + # "cols_anchors": cols_anchors, + # "expanded_bbox": expanded_bbox + # }) + + return fig diff --git a/camelot/utils.py b/camelot/utils.py index 883040b..8672f3a 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1115,10 +1115,10 @@ def compare_tables(left, right): differences_str = " and ".join(differences) print( "Right has {differences_str} than left " - "{shape_right} vs {shape_left}".format( + "{shape_left} vs {shape_right}".format( differences_str=differences_str, + shape_left=[left.shape[0], left.shape[1]], shape_right=[right.shape[0], right.shape[1]], - shape_left=[left.shape[0], left.shape[1]] ) ) diff --git a/tests/data.py b/tests/data.py index a1ec2e7..207686f 100755 --- a/tests/data.py +++ b/tests/data.py @@ -2442,6 +2442,10 @@ data_stream_edge_tol = [ ["period.", ""], ] +# The stream algorithm ends up including a footer, which hybrid correctly +# skips. 
+data_hybrid_edge_tol = data_stream_edge_tol[:-3] + data_lattice = [ [ "Cycle \nName", diff --git a/tests/files/baseline_plots/test_hybrid_contour_plot.png b/tests/files/baseline_plots/test_hybrid_contour_plot.png index 2757c33..121147e 100644 Binary files a/tests/files/baseline_plots/test_hybrid_contour_plot.png and b/tests/files/baseline_plots/test_hybrid_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png b/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png index 90874ff..5e67f83 100644 Binary files a/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png and b/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png index cac8334..fffd520 100644 Binary files a/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png and b/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_textedge_plot.png index 47e3c52..4f858e5 100644 Binary files a/tests/files/baseline_plots/test_hybrid_textedge_plot.png and b/tests/files/baseline_plots/test_hybrid_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_contour_plot.png b/tests/files/baseline_plots/test_stream_contour_plot.png index d781439..9cc7f36 100644 Binary files a/tests/files/baseline_plots/test_stream_contour_plot.png and b/tests/files/baseline_plots/test_stream_contour_plot.png differ diff --git a/tests/test_common.py b/tests/test_common.py index 38cf92a..53d3f44 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -7,6 +7,7 @@ from pandas.testing import assert_frame_equal import camelot from camelot.core import Table, TableList +from camelot.utils import compare_tables from camelot.__version__ 
import generate_version from .data import * @@ -193,7 +194,7 @@ def test_hybrid_table_regions(): # The "stream" test looks for a region in ["320,460,573,335"], which # should exclude the header. tables = camelot.read_pdf( - filename, flavor="hybrid", table_regions=["320,505,573,330"] + filename, flavor="hybrid", table_regions=["320,335,573,505"] ) assert_frame_equal(df, tables[0].df) @@ -248,7 +249,7 @@ def test_hybrid_strip_text(): def test_hybrid_edge_tol(): - df = pd.DataFrame(data_stream_edge_tol) + df = pd.DataFrame(data_hybrid_edge_tol) filename = os.path.join(testdir, "edge_tol.pdf") tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)