From e0e3ff4e07137d8a8c3ac9aa364c4f8e768e45f5 Mon Sep 17 00:00:00 2001 From: Frh Date: Mon, 20 Apr 2020 11:20:59 -0700 Subject: [PATCH] Add support for region/area for hybrid --- camelot/parsers/base.py | 7 ++- camelot/parsers/hybrid.py | 121 ++++++++++++++++++++++++++++++++++---- camelot/parsers/stream.py | 21 ++----- camelot/utils.py | 28 +++++++++ tests/data.py | 8 +++ tests/test_common.py | 8 ++- 6 files changed, 164 insertions(+), 29 deletions(-) diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 921a118..7aa35ad 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -23,6 +23,7 @@ class BaseParser(object): strip_text="", shift_text=None, flag_size=False, + debug=False ): self.id = parser_id self.table_regions = table_regions @@ -39,7 +40,7 @@ class BaseParser(object): self.t_bbox = None # For plotting details of parsing algorithms - self.debug_info = {} + self.debug_info = {} if debug else None def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs): @@ -60,6 +61,10 @@ class BaseParser(object): self.pdf_width, self.pdf_height = self.dimensions self.rootname, __ = os.path.splitext(self.filename) + if self.debug_info is not None: + self.debug_info["table_regions"] = self.table_regions + self.debug_info["table_areas"] = self.table_areas + def _document_has_no_text(self): if not self.horizontal_text: rootname = os.path.basename(self.rootname) diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 0efc053..3964624 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -7,6 +7,7 @@ import warnings from .base import BaseParser from ..utils import ( + bbox_from_str, text_in_bbox, text_in_bbox_per_axis, bbox_from_text, @@ -21,6 +22,23 @@ from matplotlib import patches as patches MAX_COL_SPREAD_IN_HEADER = 3 +def plot_annotated_bbox(plot, bbox, text, rect_color): + plot.add_patch( + patches.Rectangle( + (bbox[0], bbox[1]), + bbox[2] - bbox[0], bbox[3] - bbox[1], + 
color=rect_color, linewidth=3, + fill=False + ) + ) + plot.text( + bbox[0], bbox[1], + text, + fontsize=12, color="black", verticalalignment="top", + bbox=dict(facecolor=rect_color, alpha=0.5) + ) + + def todo_move_me_expand_area_for_header(area, textlines, col_anchors, average_row_height): """The core algorithm is based on fairly strict alignment of text. @@ -273,6 +291,7 @@ class TextEdges2(object): "center": (textline.y0 + textline.y1) / 2.0, } + # FRHTODO: Move to utils and use generic name @staticmethod def _get_index_closest_point(coord, edge_array): """Returns the index of the closest point @@ -481,12 +500,63 @@ class TextEdges2(object): default=None ) + # First, determine the textline that has the most combined alignments + # across horizontal and vertical axes. + # It will serve both as a starting point for the table boundary search, + # and as a way to estimate the average spacing between rows/cols. + most_aligned_tl = get_best_textline(tls_search_space) + most_aligned_coords = TextEdges2.get_textline_coords(most_aligned_tl) + + # Retrieve the list of textlines it's aligned with, across both axes + best_alignment = self._textlines_alignments[most_aligned_tl] + ref_h_edge_name = best_alignment.max_h_edge_name() + ref_v_edge_name = best_alignment.max_v_edge_name() + best_h_textedges = self._textedges[ref_h_edge_name] + best_v_textedges = self._textedges[ref_v_edge_name] + h_coord = most_aligned_coords[ref_h_edge_name] + v_coord = most_aligned_coords[ref_v_edge_name] + h_textlines = sorted( + best_h_textedges[ + TextEdges2._get_index_closest_point( + h_coord, + best_h_textedges + ) + ].textlines, + key=lambda tl: tl.x0, + reverse=True + ) + v_textlines = sorted( + best_v_textedges[ + TextEdges2._get_index_closest_point( + v_coord, + best_v_textedges + ) + ].textlines, + key=lambda tl: tl.y0, + reverse=True + ) + + h_gaps, v_gaps = [], [] + for i in range(1, len(v_textlines)): + v_gaps.append(v_textlines[i-1].y0 - v_textlines[i].y0) + for i in range(1, 
len(h_textlines)): + h_gaps.append(h_textlines[i-1].x0 - h_textlines[i].x0) + + if (not h_gaps or not v_gaps): + return None + percentile = 75 + gaps_hv = ( + np.percentile(h_gaps, percentile), + np.percentile(v_gaps, percentile) + ) + # Calculate the 75th percentile of the horizontal/vertical # gaps between textlines. Use this as a reference for a threshold # to not exceed while looking for table boundaries. - gaps_hv = self._calculate_gaps_thresholds(75) - if (gaps_hv[0] is None or gaps_hv[1] is None): - return None + # FRHTODO: Clean this up + # gaps_hv = self._calculate_gaps_thresholds(75) + # if (gaps_hv[0] is None or gaps_hv[1] is None): + # return None max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3 if debug_info is not None: @@ -501,11 +571,10 @@ class TextEdges2(object): debug_info_search = None MINIMUM_TEXTLINES_IN_TABLE = 6 - tl_most_aligned = get_best_textline(tls_search_space) - bbox = (tl_most_aligned.x0, tl_most_aligned.y0, - tl_most_aligned.x1, tl_most_aligned.y1) - tls_search_space.remove(tl_most_aligned) - tls_in_bbox = [tl_most_aligned] + bbox = (most_aligned_tl.x0, most_aligned_tl.y0, + most_aligned_tl.x1, most_aligned_tl.y1) + tls_search_space.remove(most_aligned_tl) + tls_in_bbox = [most_aligned_tl] last_bbox = None while last_bbox != bbox: if debug_info_search is not None: @@ -581,6 +650,19 @@ class TextEdges2(object): def plotFRHTableSearch(self, plot, debug_info): if debug_info is None: return + # Display a bbox per region + for region_str in debug_info["table_regions"] or []: + plot_annotated_bbox( + plot, bbox_from_str(region_str), + "region: ({region_str})".format(region_str=region_str), + "purple" + ) + # Display a bbox per area + for area_str in debug_info["table_areas"] or []: + plot_annotated_bbox( + plot, bbox_from_str(area_str), + "area: ({area_str})".format(area_str=area_str), "pink" + ) for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]): max_h_gap = bbox_search["max_h_gap"] max_v_gap = 
bbox_search["max_v_gap"] @@ -891,7 +973,26 @@ class Hybrid(BaseParser): # FRHTODO: get debug_info to work again def _generate_table_bbox(self, debug_info=None): - textlines = self.horizontal_text + self.vertical_text + if self.table_areas is not None: + table_bbox = {} + for area_str in self.table_areas: + table_bbox[bbox_from_str(area_str)] = None + self.table_bbox = table_bbox + return + + all_textlines = self.horizontal_text + self.vertical_text + textlines = [] + if self.table_regions is None: + textlines = all_textlines + else: + # filter text + for region_str in self.table_regions: + region_text = text_in_bbox( + bbox_from_str(region_str), + all_textlines + ) + textlines.extend(region_text) + textlines_processed = {} self.table_bbox = {} if debug_info is not None: @@ -1053,7 +1154,7 @@ class Hybrid(BaseParser): # Identify plausible areas within the doc where tables lie, # populate table_bbox keys with these areas. - self._generate_table_bbox() + self._generate_table_bbox(debug_info) _tables = [] # sort tables based on y-coord diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 536195e..eb3479c 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -7,7 +7,7 @@ import numpy as np from .base import BaseParser from ..core import TextEdges -from ..utils import (text_in_bbox, text_in_bbox_per_axis) +from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis) class Stream(BaseParser): @@ -307,26 +307,17 @@ class Stream(BaseParser): if self.table_regions is not None: # filter horizontal text hor_text = [] - for region in self.table_regions: - x1, y1, x2, y2 = region.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) + for region_str in self.table_regions: region_text = text_in_bbox( - (x1, y2, x2, y1), self.horizontal_text) + bbox_from_str(region_str), + self.horizontal_text) hor_text.extend(region_text) # find tables based on nurminen's detection algorithm table_bbox = 
self._nurminen_table_detection(hor_text) else: table_bbox = {} - for area in self.table_areas: - x1, y1, x2, y2 = area.split(",") - x1 = float(x1) - y1 = float(y1) - x2 = float(x2) - y2 = float(y2) - table_bbox[(x1, y2, x2, y1)] = None + for area_str in self.table_areas: + table_bbox[bbox_from_str(area_str)] = None self.table_bbox = table_bbox def _generate_columns_and_rows(self, table_idx, tk): diff --git a/camelot/utils.py b/camelot/utils.py index 7e789b2..883040b 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -389,6 +389,34 @@ def segments_in_bbox(bbox, v_segments, h_segments): return v_s, h_s +def bbox_from_str(bbox_str): + """Deserialize bbox from string form "x1,y1,x2,y2" to tuple (x1, y1, x2, y2) + + Parameters + ---------- + bbox_str : str + Serialized bbox with comma separated coordinates, "x1,y1,x2,y2". + + Returns + ------- + bbox : tuple + Tuple of floats (min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)), i.e. the corners are normalized regardless of the order given in the string. + + """ + x1, y1, x2, y2 = bbox_str.split(",") + x1 = float(x1) + y1 = float(y1) + x2 = float(x2) + y2 = float(y2) + # FRHTODO: do things still work if I do x1, y1, x2, y2? + return ( + min(x1, x2), + min(y1, y2), + max(x1, x2), + max(y1, y2) + ) + + def text_in_bbox(bbox, text): """Returns all text objects present inside a bounding box. diff --git a/tests/data.py b/tests/data.py index 5b66d33..a1ec2e7 100755 --- a/tests/data.py +++ b/tests/data.py @@ -1297,6 +1297,10 @@ data_stream_two_tables_1 = [ ], ] +# The streaming algorithm incorrectly includes a header and a footer. +# Trimming the table for the test of hybrid, which doesn't include them. +data_hybrid_two_tables_1 = data_stream_two_tables_1[3:-1] + data_stream_two_tables_2 = [ ["Table 325. Arrests by Race: 2009", "", "", "", "", ""], [ @@ -1605,6 +1609,10 @@ data_stream_two_tables_2 = [ ["1 Except forcible rape and prostitution.", "", "", "", "", ""], ] +# The streaming algorithm incorrectly includes a header and a footer. +# Trimming the table for the test of hybrid, which doesn't include them. 
+data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1] + data_stream_table_areas = [ ["", "One Withholding"], ["Payroll Period", "Allowance"], diff --git a/tests/test_common.py b/tests/test_common.py index ceb71a5..38cf92a 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -175,8 +175,8 @@ def test_hybrid_table_rotated(): def test_hybrid_two_tables(): - df1 = pd.DataFrame(data_stream_two_tables_1) - df2 = pd.DataFrame(data_stream_two_tables_2) + df1 = pd.DataFrame(data_hybrid_two_tables_1) + df2 = pd.DataFrame(data_hybrid_two_tables_2) filename = os.path.join(testdir, "tabula/12s0324.pdf") tables = camelot.read_pdf(filename, flavor="hybrid") @@ -190,8 +190,10 @@ def test_hybrid_table_regions(): df = pd.DataFrame(data_stream_table_areas) filename = os.path.join(testdir, "tabula/us-007.pdf") + # The "stream" test looks for a region in ["320,460,573,335"], which + # should exclude the header. tables = camelot.read_pdf( - filename, flavor="hybrid", table_regions=["320,460,573,335"] + filename, flavor="hybrid", table_regions=["320,505,573,330"] ) assert_frame_equal(df, tables[0].df)