From 52adbbd79660a962d4d19b00954e0f19d375b407 Mon Sep 17 00:00:00 2001 From: Jose Vargas Date: Sun, 26 Jan 2020 20:12:06 -0500 Subject: [PATCH] [parsers.stream] - Use fall back column coordinates. The Stream class would raise an IndexError when the 'columns' argument was specified and the number of tables identified was larger than the number of items in the 'columns' argument. With this commit, when more tables are identified than there are items in 'columns', the last item in 'columns' is used as the fallback for the remaining tables. The IndexError made the Stream parser rather cumbersome to use on a PDF composed mainly of known, consistent table structures of interest to the caller whose height, starting position, or number may vary. This is especially true within an automated or programmatic context. The caller had to either call 'camelot.read_pdf' once per page or manipulate the 'columns' argument so as to avoid the IndexError. The former isn't guaranteed to work, as a single page can contain multiple tables; in such a situation, the caller must resort to the latter even when extracting tables from a single page. The Stream class continues to function exactly the same when the 'table_areas' argument is provided; this commit only changes the behavior of the Stream parser when 'table_areas' is not provided. This commit allows all tables to be easily extracted by specifying 'pages=all' and providing the appropriate 'columns' argument value to 'camelot.read_pdf'. Extracting all tables from such a PDF is already possible with the Lattice parser; this commit makes it possible with the Stream parser as well. Callers are responsible for filtering out any extraneous tables. 
--- camelot/parsers/stream.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 33f2fe5..b6046aa 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -333,16 +333,18 @@ class Stream(BaseParser): rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] - if self.columns is not None and self.columns[table_idx] != "": - # user has to input boundary columns too - # take (0, pdf_width) by default - # similar to else condition - # len can't be 1 - cols = self.columns[table_idx].split(",") - cols = [float(c) for c in cols] - cols.insert(0, text_x_min) - cols.append(text_x_max) - cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] + if self.columns is not None: + column_idx = table_idx if table_idx < len(self.columns) else -1 + if self.columns[column_idx] != "": + # user has to input boundary columns too + # take (0, pdf_width) by default + # similar to else condition + # len can't be 1 + cols = self.columns[column_idx].split(",") + cols = [float(c) for c in cols] + cols.insert(0, text_x_min) + cols.append(text_x_max) + cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] else: # calculate mode of the list of number of elements in # each row to guess the number of columns