From e0e3ff4e07137d8a8c3ac9aa364c4f8e768e45f5 Mon Sep 17 00:00:00 2001
From: Frh <francois.huet+github@gmail.com>
Date: Mon, 20 Apr 2020 11:20:59 -0700
Subject: [PATCH] Add support for region/area for hybrid

---
 camelot/parsers/base.py   |   7 ++-
 camelot/parsers/hybrid.py | 121 ++++++++++++++++++++++++++++++++++----
 camelot/parsers/stream.py |  21 ++-----
 camelot/utils.py          |  28 +++++++++
 tests/data.py             |   8 +++
 tests/test_common.py      |   8 ++-
 6 files changed, 164 insertions(+), 29 deletions(-)

diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index 921a118..7aa35ad 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -23,6 +23,7 @@ class BaseParser(object):
         strip_text="",
         shift_text=None,
         flag_size=False,
+        debug=False
     ):
         self.id = parser_id
         self.table_regions = table_regions
@@ -39,7 +40,7 @@ class BaseParser(object):
         self.t_bbox = None
 
         # For plotting details of parsing algorithms
-        self.debug_info = {}
+        self.debug_info = {} if debug else None
 
     def prepare_page_parse(self, filename, layout, dimensions,
                            page_idx, layout_kwargs):
@@ -60,6 +61,10 @@ class BaseParser(object):
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
 
+        if self.debug_info is not None:
+            self.debug_info["table_regions"] = self.table_regions
+            self.debug_info["table_areas"] = self.table_areas
+
     def _document_has_no_text(self):
         if not self.horizontal_text:
             rootname = os.path.basename(self.rootname)
diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py
index 0efc053..3964624 100644
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@@ -7,6 +7,7 @@ import warnings
 
 from .base import BaseParser
 from ..utils import (
+    bbox_from_str,
     text_in_bbox,
     text_in_bbox_per_axis,
     bbox_from_text,
@@ -21,6 +22,23 @@ from matplotlib import patches as patches
 MAX_COL_SPREAD_IN_HEADER = 3
 
 
+def plot_annotated_bbox(plot, bbox, text, rect_color):
+    """Outline bbox (x0, y0, x1, y1) on plot in rect_color, labelled text."""
+    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
+    # rect_color drives both the outline and the label background, so
+    # callers can draw regions and areas in distinct colors.
+    plot.add_patch(
+        patches.Rectangle(
+            (x0, y0), x1 - x0, y1 - y0,
+            color=rect_color, linewidth=3, fill=False
+        )
+    )
+    plot.text(
+        x0, y0, text, fontsize=12, color="black", verticalalignment="top",
+        bbox=dict(facecolor=rect_color, alpha=0.5)
+    )
+
+
 def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
                                         average_row_height):
     """The core algorithm is based on fairly strict alignment of text.
@@ -273,6 +291,7 @@ class TextEdges2(object):
             "center": (textline.y0 + textline.y1) / 2.0,
         }
 
+    # FRHTODO: Move to utils and use generic name
     @staticmethod
     def _get_index_closest_point(coord, edge_array):
         """Returns the index of the closest point
@@ -481,12 +500,63 @@ class TextEdges2(object):
                 default=None
             )
 
+        # First, determine the textline that has the most combined alignments
+        # across horizontal and vertical axis.
+        # It will serve both as a starting point for the table boundary search,
+        # and as a way to estimate the average spacing between rows/cols.
+        most_aligned_tl = get_best_textline(tls_search_space)
+        most_aligned_coords = TextEdges2.get_textline_coords(most_aligned_tl)
+
+        # Retrieve the list of textlines it's aligned with, across both axis
+        best_alignment = self._textlines_alignments[most_aligned_tl]
+        ref_h_edge_name = best_alignment.max_h_edge_name()
+        ref_v_edge_name = best_alignment.max_v_edge_name()
+        best_h_textedges = self._textedges[ref_h_edge_name]
+        best_v_textedges = self._textedges[ref_v_edge_name]
+        h_coord = most_aligned_coords[ref_h_edge_name]
+        v_coord = most_aligned_coords[ref_v_edge_name]
+        h_textlines = sorted(
+            best_h_textedges[
+                TextEdges2._get_index_closest_point(
+                    h_coord,
+                    best_h_textedges
+                )
+            ].textlines,
+            key=lambda tl: tl.x0,
+            reverse=True
+        )
+        v_textlines = sorted(
+            best_v_textedges[
+                TextEdges2._get_index_closest_point(
+                    v_coord,
+                    best_v_textedges
+                )
+            ].textlines,
+            key=lambda tl: tl.y0,
+            reverse=True
+        )
+
+        h_gaps, v_gaps = [], []
+        for i in range(1, len(v_textlines)):
+            v_gaps.append(v_textlines[i-1].y0 - v_textlines[i].y0)
+        for i in range(1, len(h_textlines)):
+            h_gaps.append(h_textlines[i-1].x0 - h_textlines[i].x0)
+
+        if (not h_gaps or not v_gaps):
+            return None
+        percentile = 75
+        gaps_hv = (
+            np.percentile(h_gaps, percentile),
+            np.percentile(v_gaps, percentile)
+        )
+
         # Calculate the 75th percentile of the horizontal/vertical
         # gaps between textlines.  Use this as a reference for a threshold
         # to not exceed while looking for table boundaries.
-        gaps_hv = self._calculate_gaps_thresholds(75)
-        if (gaps_hv[0] is None or gaps_hv[1] is None):
-            return None
+        # FRHTODO: Clean this up
+        # gaps_hv = self._calculate_gaps_thresholds(75)
+        # if (gaps_hv[0] is None or gaps_hv[1] is None):
+        #    return None
         max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3
 
         if debug_info is not None:
@@ -501,11 +571,10 @@ class TextEdges2(object):
             debug_info_search = None
 
         MINIMUM_TEXTLINES_IN_TABLE = 6
-        tl_most_aligned = get_best_textline(tls_search_space)
-        bbox = (tl_most_aligned.x0, tl_most_aligned.y0,
-                tl_most_aligned.x1, tl_most_aligned.y1)
-        tls_search_space.remove(tl_most_aligned)
-        tls_in_bbox = [tl_most_aligned]
+        bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
+                most_aligned_tl.x1, most_aligned_tl.y1)
+        tls_search_space.remove(most_aligned_tl)
+        tls_in_bbox = [most_aligned_tl]
         last_bbox = None
         while last_bbox != bbox:
             if debug_info_search is not None:
@@ -581,6 +650,19 @@ class TextEdges2(object):
     def plotFRHTableSearch(self, plot, debug_info):
         if debug_info is None:
             return
+        # Display a bbox per region
+        for region_str in debug_info["table_regions"] or []:
+            plot_annotated_bbox(
+                plot, bbox_from_str(region_str),
+                "region: ({region_str})".format(region_str=region_str),
+                "purple"
+            )
+        # Display a bbox per area
+        for area_str in debug_info["table_areas"] or []:
+            plot_annotated_bbox(
+                plot, bbox_from_str(area_str),
+                "area: ({area_str})".format(area_str=area_str), "pink"
+            )
         for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
             max_h_gap = bbox_search["max_h_gap"]
             max_v_gap = bbox_search["max_v_gap"]
@@ -891,7 +973,26 @@ class Hybrid(BaseParser):
 
     # FRHTODO: get debug_info to work again
     def _generate_table_bbox(self, debug_info=None):
-        textlines = self.horizontal_text + self.vertical_text
+        if self.table_areas is not None:
+            table_bbox = {}
+            for area_str in self.table_areas:
+                table_bbox[bbox_from_str(area_str)] = None
+            self.table_bbox = table_bbox
+            return
+
+        all_textlines = self.horizontal_text + self.vertical_text
+        textlines = []
+        if self.table_regions is None:
+            textlines = all_textlines
+        else:
+            # filter text
+            for region_str in self.table_regions:
+                region_text = text_in_bbox(
+                    bbox_from_str(region_str),
+                    all_textlines
+                )
+                textlines.extend(region_text)
+
         textlines_processed = {}
         self.table_bbox = {}
         if debug_info is not None:
@@ -1053,7 +1154,7 @@ class Hybrid(BaseParser):
 
         # Identify plausible areas within the doc where tables lie,
         # populate table_bbox keys with these areas.
-        self._generate_table_bbox()
+        self._generate_table_bbox(debug_info)
 
         _tables = []
         # sort tables based on y-coord
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 536195e..eb3479c 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -7,7 +7,7 @@ import numpy as np
 
 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (text_in_bbox, text_in_bbox_per_axis)
+from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis)
 
 
 class Stream(BaseParser):
@@ -307,26 +307,17 @@ class Stream(BaseParser):
             if self.table_regions is not None:
                 # filter horizontal text
                 hor_text = []
-                for region in self.table_regions:
-                    x1, y1, x2, y2 = region.split(",")
-                    x1 = float(x1)
-                    y1 = float(y1)
-                    x2 = float(x2)
-                    y2 = float(y2)
+                for region_str in self.table_regions:
                     region_text = text_in_bbox(
-                        (x1, y2, x2, y1), self.horizontal_text)
+                        bbox_from_str(region_str),
+                        self.horizontal_text)
                     hor_text.extend(region_text)
             # find tables based on nurminen's detection algorithm
             table_bbox = self._nurminen_table_detection(hor_text)
         else:
             table_bbox = {}
-            for area in self.table_areas:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = float(x1)
-                y1 = float(y1)
-                x2 = float(x2)
-                y2 = float(y2)
-                table_bbox[(x1, y2, x2, y1)] = None
+            for area_str in self.table_areas:
+                table_bbox[bbox_from_str(area_str)] = None
         self.table_bbox = table_bbox
 
     def _generate_columns_and_rows(self, table_idx, tk):
diff --git a/camelot/utils.py b/camelot/utils.py
index 7e789b2..883040b 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -389,6 +389,34 @@ def segments_in_bbox(bbox, v_segments, h_segments):
     return v_s, h_s
 
 
+def bbox_from_str(bbox_str):
+    """Parse a serialized "x1,y1,x2,y2" bbox into a normalized tuple.
+
+    Parameters
+    ----------
+    bbox_str : str
+        Serialized bbox with comma separated coordinates, "x1,y1,x2,y2".
+        The two opposite corners may be listed in either order.
+
+    Returns
+    -------
+    bbox : tuple
+        Floats ordered as (min x, min y, max x, max y).
+
+    Raises
+    ------
+    ValueError
+        If there are not exactly four fields, or one is not a number.
+
+    """
+    raw_xa, raw_ya, raw_xb, raw_yb = bbox_str.split(",")
+    xa, ya = float(raw_xa), float(raw_ya)
+    xb, yb = float(raw_xb), float(raw_yb)
+    # Sort each axis so the result does not depend on which corner
+    # came first in the string.
+    return (min(xa, xb), min(ya, yb), max(xa, xb), max(ya, yb))
+
+
 def text_in_bbox(bbox, text):
     """Returns all text objects present inside a bounding box.
 
diff --git a/tests/data.py b/tests/data.py
index 5b66d33..a1ec2e7 100755
--- a/tests/data.py
+++ b/tests/data.py
@@ -1297,6 +1297,10 @@ data_stream_two_tables_1 = [
     ],
 ]
 
+# The stream algorithm incorrectly includes a header and a footer.
+# Trim them for the hybrid test, since hybrid doesn't include them.
+data_hybrid_two_tables_1 = data_stream_two_tables_1[3:-1]
+
 data_stream_two_tables_2 = [
     ["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
     [
@@ -1605,6 +1609,10 @@ data_stream_two_tables_2 = [
     ["1 Except forcible rape and prostitution.", "", "", "", "", ""],
 ]
 
+# The stream algorithm incorrectly includes a header and a footer.
+# Trim them for the hybrid test, since hybrid doesn't include them.
+data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
+
 data_stream_table_areas = [
     ["", "One Withholding"],
     ["Payroll Period", "Allowance"],
diff --git a/tests/test_common.py b/tests/test_common.py
index ceb71a5..38cf92a 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -175,8 +175,8 @@ def test_hybrid_table_rotated():
 
 
 def test_hybrid_two_tables():
-    df1 = pd.DataFrame(data_stream_two_tables_1)
-    df2 = pd.DataFrame(data_stream_two_tables_2)
+    df1 = pd.DataFrame(data_hybrid_two_tables_1)
+    df2 = pd.DataFrame(data_hybrid_two_tables_2)
 
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
     tables = camelot.read_pdf(filename, flavor="hybrid")
@@ -190,8 +190,10 @@ def test_hybrid_table_regions():
     df = pd.DataFrame(data_stream_table_areas)
 
     filename = os.path.join(testdir, "tabula/us-007.pdf")
+    # The "stream" test looks for a region in ["320,460,573,335"], which
+    # should exclude the header.
     tables = camelot.read_pdf(
-        filename, flavor="hybrid", table_regions=["320,460,573,335"]
+        filename, flavor="hybrid", table_regions=["320,505,573,330"]
     )
     assert_frame_equal(df, tables[0].df)