From 9abdd00cec8a21505bd496a99d8899d4cdc114db Mon Sep 17 00:00:00 2001 From: Frh Date: Fri, 8 May 2020 15:08:12 -0700 Subject: [PATCH] Enable process_background option for hybrid Trim empty cols and lines --- camelot/cli.py | 2 +- camelot/parsers/base.py | 14 +++++---- camelot/parsers/hybrid.py | 14 +++++---- camelot/parsers/lattice.py | 4 +-- camelot/utils.py | 58 +++++++++++++++++++++++--------------- tests/data.py | 46 ++++++++++++++++++++++++++++++ tests/test_common.py | 9 ++++++ 7 files changed, 109 insertions(+), 38 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index e3948e0..86f5db8 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -58,7 +58,7 @@ pass_config = click.make_pass_decorator(Config) "-flag", "--flag_size", is_flag=True, - help="Flag text based on" " font size. Useful to detect super/subscripts.", + help="Flag text based on font size. Useful to detect super/subscripts.", ) @click.option( "-strip", diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index f6c9ea6..d37daf0 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -131,13 +131,15 @@ class BaseParser(): return True return False - def _initialize_new_table(self, table_idx, cols, rows): + def _initialize_new_table(self, table_idx, bbox, cols, rows): """Initialize new table object, ready to be populated Parameters ---------- table_idx : int Index of this table within the pdf page analyzed + bbox : set + bounding box of this table within the pdf page analyzed cols : list list of coordinate boundaries tuples (left, right) rows : list @@ -151,7 +153,7 @@ class BaseParser(): table = Table(cols, rows) table.page = self.page table.order = table_idx + 1 - table._bbox = self.table_bboxes()[table_idx] + table._bbox = bbox return table @staticmethod @@ -191,7 +193,7 @@ class BaseParser(): # Pure virtual, must be defined by the derived parser raise NotImplementedError() - def _generate_table(self, table_idx, cols, rows, **kwargs): + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): # Pure virtual, must be defined by the derived parser raise NotImplementedError() @@ -225,7 +227,7 @@ class BaseParser(): user_cols ) table = self._generate_table( - table_idx, cols, rows, v_s=v_s, h_s=h_s) + table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s) _tables.append(table) return _tables @@ -467,8 +469,8 @@ class TextBaseParser(BaseParser): raise ValueError("Length of table_areas and columns" " should be equal") - def _generate_table(self, table_idx, cols, rows, **kwargs): - table = self._initialize_new_table(table_idx, cols, rows) + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): + table = self._initialize_new_table(table_idx, bbox, cols, rows) table = table.set_all_edges() self.record_parse_metadata(table) diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index d6fe13a..c18ef29 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -104,10 +104,15 @@ class Hybrid(BaseParser): parser = self.table_bbox_parses[bbox] return parser._generate_columns_and_rows(bbox, table_idx) - def _generate_table(self, table_idx, cols, rows, **kwargs): - bbox = self.table_bboxes()[table_idx] + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): parser = self.table_bbox_parses[bbox] - return parser._generate_table(table_idx, cols, rows, **kwargs) + table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs) + # Because hybrid can inject extraneous splits from both lattice and + # network, remove lines / cols that are completely empty. + df = table.df + df[df.astype(bool)].dropna(axis=0, how="all", inplace=True) + df[df.astype(bool)].dropna(axis=1, how="all", inplace=True) + return table @staticmethod def _augment_boundaries_with_splits(boundaries, splits, tolerance=0): @@ -223,6 +228,3 @@ class Hybrid(BaseParser): # Add the bboxes from network that haven't been merged for network_bbox in _network_bboxes: self.table_bbox_parses[network_bbox] = self.network_parser - - def record_parse_metadata(self, table): - super().record_parse_metadata(table) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 05bd649..1e6ed30 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -322,13 +322,13 @@ class Lattice(BaseParser): ] return cols, rows, v_s, h_s - def _generate_table(self, table_idx, cols, rows, **kwargs): + def _generate_table(self, table_idx, bbox, cols, rows, **kwargs): v_s = kwargs.get("v_s") h_s = kwargs.get("h_s") if v_s is None or h_s is None: raise ValueError("No segments found on {}".format(self.rootname)) - table = self._initialize_new_table(table_idx, cols, rows) + table = self._initialize_new_table(table_idx, bbox, cols, rows) # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol) # set table border edges to True diff --git a/camelot/utils.py b/camelot/utils.py index d24230d..d91d556 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -110,9 +110,21 @@ def download_url(url): shutil.move(f.name, filepath) return filepath - -stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] -lattice_kwargs = [ +common_kwargs = [ + "flag_size", + "margins", + "split_text", + "strip_text", + "table_areas", + "table_regions" +] +text_kwargs = common_kwargs + [ + "columns", + "edge_tol", + "row_tol", + "column_tol" +] +lattice_kwargs = common_kwargs+ [ "process_background", "line_scale", "copy_text", @@ -124,33 +136,33 @@ lattice_kwargs = [ "iterations", "resolution", ] +flavor_to_kwargs = { + "stream": text_kwargs, + "network": text_kwargs, + "lattice": lattice_kwargs, + "hybrid": text_kwargs + lattice_kwargs, +} def validate_input(kwargs, flavor="lattice"): - def check_intersection(parser_kwargs, input_kwargs): - isec = set(parser_kwargs).intersection(set(input_kwargs.keys())) - if isec: - raise ValueError( - "{} cannot be used with flavor='{}'".format( - ",".join(sorted(isec)), flavor - ) + parser_kwargs = flavor_to_kwargs[flavor] + # s.difference(t): new set with elements in s but not in t + isec = set(kwargs.keys()).difference(set(parser_kwargs)) + if isec: + raise ValueError( + "{} cannot be used with flavor='{}'".format( + ",".join(sorted(isec)), flavor ) - - if flavor == "lattice": - check_intersection(stream_kwargs, kwargs) - else: - check_intersection(lattice_kwargs, kwargs) + ) def remove_extra(kwargs, flavor="lattice"): - if flavor == "lattice": - for key in kwargs.keys(): - if key in stream_kwargs: - kwargs.pop(key) - else: - for key in kwargs.keys(): - if key in lattice_kwargs: - kwargs.pop(key) + parser_kwargs = flavor_to_kwargs[flavor] + # Avoid "dictionary changed size during iteration" + kwargs_keys = list(kwargs.keys()) + for key in kwargs_keys: + if key not in parser_kwargs: + kwargs.pop(key) return kwargs diff --git a/tests/data.py b/tests/data.py index c32ee28..844fc70 100755 --- a/tests/data.py +++ b/tests/data.py @@ -3701,6 +3701,52 @@ data_lattice_process_background = [ ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"], ] +data_hybrid_process_background = [ + [ + "State", + "Date", + "Halt", + "Halt", + "Persons", + "Persons", + "Persons", + "Persons", + ], + ["", "", "stations", "days", "directly", "trained", "counseled", "tested"], + ["", "", "", "", "reached", "", "", "for HIV"], + ["", "", "", "", "(in lakh)", "", "", ""], + ["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"], + ["Rajasthan", "2.12.2009 to", "", "", "", "", "", ""], + ["", "19.12.2009", "", "", "", "", "", ""], + ["Gujarat", "20.12.2009 to", "6", "13", "6.03", "3,810", "2,317", "1,453"], + ["", "3.1.2010", "", "", "", "", "", ""], + [ + "Maharashtra", + "4.01.2010 to", + "13", + "26", + "1.27", + "5,680", + "9,027", + "4,153", + ], + ["", "1.2.2010", "", "", "", "", "", ""], + [ + "Karnataka", + "2.2.2010 to", + "11", + "19", + "1.80", + "5,741", + "3,658", + "3,183", + ], + ["", "22.2.2010", "", "", "", "", "", ""], + ["Kerala", "23.2.2010 to", "9", "17", "1.42", "3,559", "2,173", "855"], + ["", "11.3.2010", "", "", "", "", "", ""], + ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"], +] + data_lattice_copy_text = [ ["Plan Type", "County", "Plan Name", "Totals"], ["GMC", "Sacramento", "Anthem Blue Cross", "164,380"], diff --git a/tests/test_common.py b/tests/test_common.py index 1260a39..31d3681 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -321,6 +321,15 @@ def test_hybrid_vertical_header(): assert_frame_equal(df, tables[0].df) +def test_hybrid_process_background(): + df = pd.DataFrame(data_hybrid_process_background) + + filename = os.path.join(testdir, "background_lines_1.pdf") + tables = camelot.read_pdf( + filename, flavor="hybrid", process_background=True) + assert_frame_equal(df, tables[1].df) + + # Lattice parser tests def test_lattice(): df = pd.DataFrame(data_lattice)