From dbabc5b1c12fea7a8869c018319655dfb6b2dff9 Mon Sep 17 00:00:00 2001 From: Idan David Date: Sun, 5 Apr 2020 13:38:10 +0300 Subject: [PATCH 1/5] Add option to define "rows" in Stream Similarly to the "columns" parameter, this commit also makes it possible to define "rows" visual dividers. --- camelot/parsers/stream.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 33f2fe5..98a8a83 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -66,6 +66,7 @@ class Stream(BaseParser): edge_tol=50, row_tol=2, column_tol=0, + rows=None, **kwargs ): self.table_regions = table_regions @@ -78,6 +79,7 @@ class Stream(BaseParser): self.edge_tol = edge_tol self.row_tol = row_tol self.column_tol = column_tol + self.rows = rows @staticmethod def _text_bbox(t_bbox): @@ -330,7 +332,14 @@ class Stream(BaseParser): text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol) - rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + if self.rows is not None and self.rows[table_idx] != "": + rows = self.rows[table_idx].split(",") + rows = [float(c) for c in rows] + rows.insert(0, text_y_max) + rows.append(text_y_min) + rows = [(rows[i], rows[i+1]) for i in range(0, len(rows) - 1)] + else: + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] if self.columns is not None and self.columns[table_idx] != "": From 2a8c24e672c97400b8aa88e0e2787ec51403aa4a Mon Sep 17 00:00:00 2001 From: Idan David Date: Sun, 5 Apr 2020 13:49:44 +0300 Subject: [PATCH 2/5] add documentation --- camelot/io.py | 3 +++ camelot/parsers/stream.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/camelot/io.py b/camelot/io.py index a27a7c6..00542d1 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -43,6 +43,9 @@ def read_pdf( columns^ : list, optional 
(default: None) List of column x-coordinates strings where the coordinates are comma-separated. + rows^ : list, optional (default: None) + List of row y-coordinates strings where the coordinates + are comma-separated. split_text : bool, optional (default: False) Split text that spans across multiple cells. flag_size : bool, optional (default: False) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 98a8a83..deb61eb 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -20,7 +20,7 @@ class Stream(BaseParser): """Stream method of parsing looks for spaces between text to parse the table. - If you want to specify columns when specifying multiple table + If you want to specify rows or columns when specifying multiple table areas, make sure that the length of both lists are equal. Parameters @@ -52,6 +52,9 @@ class Stream(BaseParser): column_tol : int, optional (default: 0) Tolerance parameter used to combine text horizontally, to generate columns. + rows : list, optional (default: None) + List of row y-coordinates strings where the coordinates + are comma-separated. """ From 23b40fc9950ebe16c109f0693ca6a69248d13e54 Mon Sep 17 00:00:00 2001 From: Idan David Date: Sun, 5 Apr 2020 13:50:01 +0300 Subject: [PATCH 3/5] Add command line options --- camelot/cli.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/camelot/cli.py b/camelot/cli.py index 0298992..1a7c0be 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -237,6 +237,13 @@ def lattice(c, *args, **kwargs): multiple=True, help="X coordinates of column separators.", ) +@click.option( + "-R", + "--rows", + default=[], + multiple=True, + help="Y coordinates of rows separators.", +) @click.option( "-e", "--edge_tol", @@ -282,6 +289,8 @@ def stream(c, *args, **kwargs): kwargs["table_areas"] = None if not table_areas else table_areas columns = list(kwargs["columns"]) kwargs["columns"] = None if not columns else columns + rows = list(kwargs["rows"]) + kwargs["rows"] = 
None if not rows else rows if plot_type is not None: if not _HAS_MPL: From 71e6a4cb0688366e5488c386214c9cb8111c0d90 Mon Sep 17 00:00:00 2001 From: Idan David Date: Sun, 5 Apr 2020 13:51:36 +0300 Subject: [PATCH 4/5] Add rows parameter to validate input --- camelot/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/camelot/utils.py b/camelot/utils.py index e7ad848..fc3e773 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -103,7 +103,7 @@ def download_url(url): return filepath -stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] +stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol", "rows"] lattice_kwargs = [ "process_background", "line_scale", From ba1604ee401aa133a940cd338381fefa316ade85 Mon Sep 17 00:00:00 2001 From: Idan David Date: Sun, 5 Apr 2020 20:01:29 +0300 Subject: [PATCH 5/5] Fix deepsource problems --- camelot/parsers/stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index deb61eb..ac1f774 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -289,7 +289,7 @@ class Stream(BaseParser): # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table area if no table areas found - if not len(table_bbox): + if not table_bbox: table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} return table_bbox @@ -366,7 +366,7 @@ class Stream(BaseParser): # see if the list contains elements, if yes, then use # the mode after removing 1s elements = list(filter(lambda x: x != 1, elements)) - if len(elements): + if elements: ncols = max(set(elements), key=elements.count) else: warnings.warn(