diff --git a/camelot/cli.py b/camelot/cli.py index 1715e6f..4fc68df 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -237,6 +237,13 @@ def lattice(c, *args, **kwargs): multiple=True, help="X coordinates of column separators.", ) +@click.option( + "-R", + "--rows", + default=[], + multiple=True, + help="Y coordinates of rows separators.", +) @click.option( "-e", "--edge_tol", @@ -282,6 +289,8 @@ def stream(c, *args, **kwargs): kwargs["table_areas"] = None if not table_areas else table_areas columns = list(kwargs["columns"]) kwargs["columns"] = None if not columns else columns + rows = list(kwargs["rows"]) + kwargs["rows"] = None if not rows else rows if plot_type is not None: if not _HAS_MPL: diff --git a/camelot/io.py b/camelot/io.py index a27a7c6..00542d1 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -43,6 +43,9 @@ def read_pdf( columns^ : list, optional (default: None) List of column x-coordinates strings where the coordinates are comma-separated. + rows^ : list, optional (default: None) + List of rows y-coordinates strings where the coordinates + are comma-separated. split_text : bool, optional (default: False) Split text that spans across multiple cells. flag_size : bool, optional (default: False) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 3749028..12c7c3b 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -19,7 +19,7 @@ class Stream(BaseParser): """Stream method of parsing looks for spaces between text to parse the table. - If you want to specify columns when specifying multiple table + If you want to specify rows or columns when specifying multiple table areas, make sure that the length of both lists are equal. Parameters @@ -51,6 +51,9 @@ class Stream(BaseParser): column_tol : int, optional (default: 0) Tolerance parameter used to combine text horizontally, to generate columns. + rows : list, optional (default: None) + List of rows y-coordinates strings where the coordinates + are comma-separated """ @@ -65,6 +68,7 @@ class Stream(BaseParser): edge_tol=50, row_tol=2, column_tol=0, + rows=None, **kwargs ): self.table_regions = table_regions @@ -77,6 +81,7 @@ class Stream(BaseParser): self.edge_tol = edge_tol self.row_tol = row_tol self.column_tol = column_tol + self.rows = rows @staticmethod def _text_bbox(t_bbox): @@ -283,7 +288,7 @@ class Stream(BaseParser): # guess table areas using textlines and relevant edges table_bbox = textedges.get_table_areas(textlines, relevant_textedges) # treat whole page as table area if no table areas found - if not len(table_bbox): + if not table_bbox: table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} return table_bbox @@ -329,7 +334,14 @@ class Stream(BaseParser): text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol) - rows = self._join_rows(rows_grouped, text_y_max, text_y_min) + if self.rows is not None and self.rows[table_idx] != "": + rows = self.rows[table_idx].split(",") + rows = [float(c) for c in rows] + rows.insert(0, text_y_max) + rows.append(text_y_min) + rows = [(rows[i], rows[i+1]) for i in range(0, len(rows) - 1)] + else: + rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] if self.columns is not None and self.columns[table_idx] != "": @@ -353,7 +365,7 @@ class Stream(BaseParser): # see if the list contains elements, if yes, then use # the mode after removing 1s elements = list(filter(lambda x: x != 1, elements)) - if len(elements): + if elements: ncols = max(set(elements), key=elements.count) else: warnings.warn( diff --git a/camelot/utils.py b/camelot/utils.py index 83974f9..9d96ae5 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -93,7 +93,7 @@ def download_url(url): return filepath -stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] +stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol", "rows"] lattice_kwargs = [ "process_background", "line_scale",