Enable process_background option for hybrid

Trim empty cols and lines
2020-05-08 15:08:12 -07:00 · 2020-05-08 15:08:12 -07:00 · 9abdd00cec
parent 63adfd5468
commit 9abdd00cec
7 changed files with 109 additions and 38 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -58,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
    "-flag",
    "--flag_size",
    is_flag=True,
-    help="Flag text based on" " font size. Useful to detect super/subscripts.",
+    help="Flag text based on font size. Useful to detect super/subscripts.",
 )
@click.option(
    "-strip",
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -131,13 +131,15 @@ class BaseParser():
            return True
        return False

-    def _initialize_new_table(self, table_idx, cols, rows):
+    def _initialize_new_table(self, table_idx, bbox, cols, rows):
        """Initialize new table object, ready to be populated

        Parameters
        ----------
        table_idx : int
            Index of this table within the pdf page analyzed
+        bbox : tuple
+            Bounding box of this table within the pdf page analyzed
        cols : list
            list of coordinate boundaries tuples (left, right)
        rows : list
@ -151,7 +153,7 @@ class BaseParser():
        table = Table(cols, rows)
        table.page = self.page
        table.order = table_idx + 1
-        table._bbox = self.table_bboxes()[table_idx]
+        table._bbox = bbox
        return table

    @staticmethod
@ -191,7 +193,7 @@ class BaseParser():
        # Pure virtual, must be defined by the derived parser
        raise NotImplementedError()

-    def _generate_table(self, table_idx, cols, rows, **kwargs):
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
        # Pure virtual, must be defined by the derived parser
        raise NotImplementedError()

@ -225,7 +227,7 @@ class BaseParser():
                user_cols
            )
            table = self._generate_table(
-                table_idx, cols, rows, v_s=v_s, h_s=h_s)
+                table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
            _tables.append(table)

        return _tables
@ -467,8 +469,8 @@ class TextBaseParser(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

-    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        table = self._initialize_new_table(table_idx, cols, rows)
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
+        table = self._initialize_new_table(table_idx, bbox, cols, rows)
        table = table.set_all_edges()
        self.record_parse_metadata(table)

--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -104,10 +104,15 @@ class Hybrid(BaseParser):
        parser = self.table_bbox_parses[bbox]
        return parser._generate_columns_and_rows(bbox, table_idx)

-    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        bbox = self.table_bboxes()[table_idx]
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
        parser = self.table_bbox_parses[bbox]
-        return parser._generate_table(table_idx, cols, rows, **kwargs)
+        table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
+        # Hybrid can add extraneous lattice/network splits; intent: drop fully
+        # empty rows/cols. NOTE(review): dropna acts on a copy, not table.df.
+        df = table.df
+        df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
+        df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
+        return table

    @staticmethod
    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
@ -223,6 +228,3 @@ class Hybrid(BaseParser):
        # Add the bboxes from network that haven't been merged
        for network_bbox in _network_bboxes:
            self.table_bbox_parses[network_bbox] = self.network_parser
-
-    def record_parse_metadata(self, table):
-        super().record_parse_metadata(table)
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -322,13 +322,13 @@ class Lattice(BaseParser):
        ]
        return cols, rows, v_s, h_s

-    def _generate_table(self, table_idx, cols, rows, **kwargs):
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
        v_s = kwargs.get("v_s")
        h_s = kwargs.get("h_s")
        if v_s is None or h_s is None:
            raise ValueError("No segments found on {}".format(self.rootname))

-        table = self._initialize_new_table(table_idx, cols, rows)
+        table = self._initialize_new_table(table_idx, bbox, cols, rows)
        # set table edges to True using ver+hor lines
        table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
        # set table border edges to True
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -110,9 +110,21 @@ def download_url(url):
    shutil.move(f.name, filepath)
    return filepath

-
-stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
-lattice_kwargs = [
+common_kwargs = [
+    "flag_size",
+    "margins",
+    "split_text",
+    "strip_text",
+    "table_areas",
+    "table_regions"
+]
+text_kwargs = common_kwargs + [
+    "columns",
+    "edge_tol",
+    "row_tol",
+    "column_tol"
+]
+lattice_kwargs = common_kwargs+ [
    "process_background",
    "line_scale",
    "copy_text",
@ -124,33 +136,33 @@ lattice_kwargs = [
    "iterations",
    "resolution",
 ]
+flavor_to_kwargs = {
+    "stream": text_kwargs,
+    "network": text_kwargs,
+    "lattice": lattice_kwargs,
+    "hybrid": text_kwargs + lattice_kwargs,
+}


 def validate_input(kwargs, flavor="lattice"):
-    def check_intersection(parser_kwargs, input_kwargs):
-        isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
-        if isec:
-            raise ValueError(
-                "{} cannot be used with flavor='{}'".format(
-                    ",".join(sorted(isec)), flavor
-                )
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # s.difference(t): new set with elements in s but not in t
+    isec = set(kwargs.keys()).difference(set(parser_kwargs))
+    if isec:
+        raise ValueError(
+            "{} cannot be used with flavor='{}'".format(
+                ",".join(sorted(isec)), flavor
            )
-
-    if flavor == "lattice":
-        check_intersection(stream_kwargs, kwargs)
-    else:
-        check_intersection(lattice_kwargs, kwargs)
+        )


 def remove_extra(kwargs, flavor="lattice"):
-    if flavor == "lattice":
-        for key in kwargs.keys():
-            if key in stream_kwargs:
-                kwargs.pop(key)
-    else:
-        for key in kwargs.keys():
-            if key in lattice_kwargs:
-                kwargs.pop(key)
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # Avoid "dictionary changed size during iteration"
+    kwargs_keys = list(kwargs.keys())
+    for key in kwargs_keys:
+        if key not in parser_kwargs:
+            kwargs.pop(key)
    return kwargs


--- a/tests/data.py
+++ b/tests/data.py
@ -3701,6 +3701,52 @@ data_lattice_process_background = [
    ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
 ]

+data_hybrid_process_background = [
+    [
+        "State",
+        "Date",
+        "Halt",
+        "Halt",
+        "Persons",
+        "Persons",
+        "Persons",
+        "Persons",
+    ],
+    ["", "", "stations", "days", "directly", "trained", "counseled", "tested"],
+    ["", "", "", "", "reached", "", "", "for HIV"],
+    ["", "", "", "", "(in lakh)", "", "", ""],
+    ["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
+    ["Rajasthan", "2.12.2009 to", "", "", "", "", "", ""],
+    ["", "19.12.2009", "", "", "", "", "", ""],
+    ["Gujarat", "20.12.2009 to", "6", "13", "6.03", "3,810", "2,317", "1,453"],
+    ["", "3.1.2010", "", "", "", "", "", ""],
+    [
+        "Maharashtra",
+        "4.01.2010 to",
+        "13",
+        "26",
+        "1.27",
+        "5,680",
+        "9,027",
+        "4,153",
+    ],
+    ["", "1.2.2010", "", "", "", "", "", ""],
+    [
+        "Karnataka",
+        "2.2.2010 to",
+        "11",
+        "19",
+        "1.80",
+        "5,741",
+        "3,658",
+        "3,183",
+    ],
+    ["", "22.2.2010", "", "", "", "", "", ""],
+    ["Kerala", "23.2.2010 to", "9", "17", "1.42", "3,559", "2,173", "855"],
+    ["", "11.3.2010", "", "", "", "", "", ""],
+    ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
+]
+
 data_lattice_copy_text = [
    ["Plan Type", "County", "Plan Name", "Totals"],
    ["GMC", "Sacramento", "Anthem Blue Cross", "164,380"],
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -321,6 +321,15 @@ def test_hybrid_vertical_header():
    assert_frame_equal(df, tables[0].df)


+def test_hybrid_process_background():
+    df = pd.DataFrame(data_hybrid_process_background)
+
+    filename = os.path.join(testdir, "background_lines_1.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", process_background=True)
+    assert_frame_equal(df, tables[1].df)
+
+
 # Lattice parser tests
 def test_lattice():
    df = pd.DataFrame(data_lattice)