From 9abdd00cec8a21505bd496a99d8899d4cdc114db Mon Sep 17 00:00:00 2001
From: Frh <francois.huet+github@gmail.com>
Date: Fri, 8 May 2020 15:08:12 -0700
Subject: [PATCH] Enable process_background option for hybrid

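Trim completely empty columns and rows from tables generated by the
hybrid parser. Pass the table bbox explicitly to _generate_table and
_initialize_new_table instead of looking it up by table index, and
validate user kwargs against a per-flavor list of accepted options so
that lattice options such as process_background work with hybrid.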
---
 camelot/cli.py             |  2 +-
 camelot/parsers/base.py    | 14 +++++----
 camelot/parsers/hybrid.py  | 14 +++++----
 camelot/parsers/lattice.py |  4 +--
 camelot/utils.py           | 58 +++++++++++++++++++++++---------------
 tests/data.py              | 46 ++++++++++++++++++++++++++++++
 tests/test_common.py       |  9 ++++++
 7 files changed, 109 insertions(+), 38 deletions(-)

diff --git a/camelot/cli.py b/camelot/cli.py
index e3948e0..86f5db8 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -58,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
     "-flag",
     "--flag_size",
     is_flag=True,
-    help="Flag text based on" " font size. Useful to detect super/subscripts.",
+    help="Flag text based on font size. Useful to detect super/subscripts.",
 )
 @click.option(
     "-strip",
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index f6c9ea6..d37daf0 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -131,13 +131,15 @@ class BaseParser():
             return True
         return False
 
-    def _initialize_new_table(self, table_idx, cols, rows):
+    def _initialize_new_table(self, table_idx, bbox, cols, rows):
         """Initialize new table object, ready to be populated
 
         Parameters
         ----------
         table_idx : int
             Index of this table within the pdf page analyzed
+        bbox : tuple
+            bounding box of this table within the pdf page analyzed
         cols : list
             list of coordinate boundaries tuples (left, right)
         rows : list
@@ -151,7 +153,7 @@ class BaseParser():
         table = Table(cols, rows)
         table.page = self.page
         table.order = table_idx + 1
-        table._bbox = self.table_bboxes()[table_idx]
+        table._bbox = bbox
         return table
 
     @staticmethod
@@ -191,7 +193,7 @@ class BaseParser():
         # Pure virtual, must be defined by the derived parser
         raise NotImplementedError()
 
-    def _generate_table(self, table_idx, cols, rows, **kwargs):
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
         # Pure virtual, must be defined by the derived parser
         raise NotImplementedError()
 
@@ -225,7 +227,7 @@ class BaseParser():
                 user_cols
             )
             table = self._generate_table(
-                table_idx, cols, rows, v_s=v_s, h_s=h_s)
+                table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
             _tables.append(table)
 
         return _tables
@@ -467,8 +469,8 @@ class TextBaseParser(BaseParser):
                 raise ValueError("Length of table_areas and columns"
                                  " should be equal")
 
-    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        table = self._initialize_new_table(table_idx, cols, rows)
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
+        table = self._initialize_new_table(table_idx, bbox, cols, rows)
         table = table.set_all_edges()
         self.record_parse_metadata(table)
 
diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py
index d6fe13a..c18ef29 100644
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@@ -104,10 +104,15 @@ class Hybrid(BaseParser):
         parser = self.table_bbox_parses[bbox]
         return parser._generate_columns_and_rows(bbox, table_idx)
 
-    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        bbox = self.table_bboxes()[table_idx]
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
         parser = self.table_bbox_parses[bbox]
-        return parser._generate_table(table_idx, cols, rows, **kwargs)
+        table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
+        # Hybrid can inject extraneous splits from both lattice and network,
+        # so keep only rows / cols holding at least one non-empty cell.
+        filled = table.df.astype(bool)
+        table.df = table.df.loc[filled.any(axis=1), filled.any(axis=0)]
+        table.shape = table.df.shape
+        return table
 
     @staticmethod
     def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
@@ -223,6 +228,3 @@ class Hybrid(BaseParser):
         # Add the bboxes from network that haven't been merged
         for network_bbox in _network_bboxes:
             self.table_bbox_parses[network_bbox] = self.network_parser
-
-    def record_parse_metadata(self, table):
-        super().record_parse_metadata(table)
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index 05bd649..1e6ed30 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -322,13 +322,13 @@ class Lattice(BaseParser):
         ]
         return cols, rows, v_s, h_s
 
-    def _generate_table(self, table_idx, cols, rows, **kwargs):
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
         v_s = kwargs.get("v_s")
         h_s = kwargs.get("h_s")
         if v_s is None or h_s is None:
             raise ValueError("No segments found on {}".format(self.rootname))
 
-        table = self._initialize_new_table(table_idx, cols, rows)
+        table = self._initialize_new_table(table_idx, bbox, cols, rows)
         # set table edges to True using ver+hor lines
         table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
         # set table border edges to True
diff --git a/camelot/utils.py b/camelot/utils.py
index d24230d..d91d556 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -110,9 +110,21 @@ def download_url(url):
     shutil.move(f.name, filepath)
     return filepath
 
-
-stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
-lattice_kwargs = [
+common_kwargs = [
+    "flag_size",
+    "margins",
+    "split_text",
+    "strip_text",
+    "table_areas",
+    "table_regions"
+]
+text_kwargs = common_kwargs + [
+    "columns",
+    "edge_tol",
+    "row_tol",
+    "column_tol"
+]
+lattice_kwargs = common_kwargs+ [
     "process_background",
     "line_scale",
     "copy_text",
@@ -124,33 +136,33 @@ lattice_kwargs = [
     "iterations",
     "resolution",
 ]
+flavor_to_kwargs = {
+    "stream": text_kwargs,
+    "network": text_kwargs,
+    "lattice": lattice_kwargs,
+    "hybrid": text_kwargs + lattice_kwargs,
+}
 
 
 def validate_input(kwargs, flavor="lattice"):
-    def check_intersection(parser_kwargs, input_kwargs):
-        isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
-        if isec:
-            raise ValueError(
-                "{} cannot be used with flavor='{}'".format(
-                    ",".join(sorted(isec)), flavor
-                )
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # s.difference(t): new set with elements in s but not in t
+    isec = set(kwargs.keys()).difference(set(parser_kwargs))
+    if isec:
+        raise ValueError(
+            "{} cannot be used with flavor='{}'".format(
+                ",".join(sorted(isec)), flavor
             )
-
-    if flavor == "lattice":
-        check_intersection(stream_kwargs, kwargs)
-    else:
-        check_intersection(lattice_kwargs, kwargs)
+        )
 
 
 def remove_extra(kwargs, flavor="lattice"):
-    if flavor == "lattice":
-        for key in kwargs.keys():
-            if key in stream_kwargs:
-                kwargs.pop(key)
-    else:
-        for key in kwargs.keys():
-            if key in lattice_kwargs:
-                kwargs.pop(key)
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # Avoid "dictionary changed size during iteration"
+    kwargs_keys = list(kwargs.keys())
+    for key in kwargs_keys:
+        if key not in parser_kwargs:
+            kwargs.pop(key)
     return kwargs
 
 
diff --git a/tests/data.py b/tests/data.py
index c32ee28..844fc70 100755
--- a/tests/data.py
+++ b/tests/data.py
@@ -3701,6 +3701,52 @@ data_lattice_process_background = [
     ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
 ]
 
+data_hybrid_process_background = [
+    [
+        "State",
+        "Date",
+        "Halt",
+        "Halt",
+        "Persons",
+        "Persons",
+        "Persons",
+        "Persons",
+    ],
+    ["", "", "stations", "days", "directly", "trained", "counseled", "tested"],
+    ["", "", "", "", "reached", "", "", "for HIV"],
+    ["", "", "", "", "(in lakh)", "", "", ""],
+    ["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
+    ["Rajasthan", "2.12.2009 to", "", "", "", "", "", ""],
+    ["", "19.12.2009", "", "", "", "", "", ""],
+    ["Gujarat", "20.12.2009 to", "6", "13", "6.03", "3,810", "2,317", "1,453"],
+    ["", "3.1.2010", "", "", "", "", "", ""],
+    [
+        "Maharashtra",
+        "4.01.2010 to",
+        "13",
+        "26",
+        "1.27",
+        "5,680",
+        "9,027",
+        "4,153",
+    ],
+    ["", "1.2.2010", "", "", "", "", "", ""],
+    [
+        "Karnataka",
+        "2.2.2010 to",
+        "11",
+        "19",
+        "1.80",
+        "5,741",
+        "3,658",
+        "3,183",
+    ],
+    ["", "22.2.2010", "", "", "", "", "", ""],
+    ["Kerala", "23.2.2010 to", "9", "17", "1.42", "3,559", "2,173", "855"],
+    ["", "11.3.2010", "", "", "", "", "", ""],
+    ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
+]
+
 data_lattice_copy_text = [
     ["Plan Type", "County", "Plan Name", "Totals"],
     ["GMC", "Sacramento", "Anthem Blue Cross", "164,380"],
diff --git a/tests/test_common.py b/tests/test_common.py
index 1260a39..31d3681 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -321,6 +321,15 @@ def test_hybrid_vertical_header():
     assert_frame_equal(df, tables[0].df)
 
 
+def test_hybrid_process_background():
+    df = pd.DataFrame(data_hybrid_process_background)
+
+    filename = os.path.join(testdir, "background_lines_1.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", process_background=True)
+    assert_frame_equal(df, tables[1].df)
+
+
 # Lattice parser tests
 def test_lattice():
     df = pd.DataFrame(data_lattice)