diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 3749028..39a0464 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -121,6 +121,7 @@ class Stream(BaseParser): row_y = 0 rows = [] temp = [] + for t in text: # is checking for upright necessary? # if t.get_text().strip() and all([obj.upright for obj in t._objs if @@ -131,8 +132,10 @@ class Stream(BaseParser): temp = [] row_y = t.y0 temp.append(t) + rows.append(sorted(temp, key=lambda t: t.x0)) - __ = rows.pop(0) # TODO: hacky + if len(rows) > 1: + __ = rows.pop(0) # TODO: hacky return rows @staticmethod @@ -345,43 +348,46 @@ class Stream(BaseParser): else: # calculate mode of the list of number of elements in # each row to guess the number of columns - ncols = max(set(elements), key=elements.count) - if ncols == 1: - # if mode is 1, the page usually contains not tables - # but there can be cases where the list can be skewed, - # try to remove all 1s from list in this case and - # see if the list contains elements, if yes, then use - # the mode after removing 1s - elements = list(filter(lambda x: x != 1, elements)) - if len(elements): - ncols = max(set(elements), key=elements.count) - else: - warnings.warn( - f"No tables found in table area {table_idx + 1}" + if not len(elements): + cols = [(text_x_min, text_x_max)] + else: + ncols = max(set(elements), key=elements.count) + if ncols == 1: + # if mode is 1, the page usually contains not tables + # but there can be cases where the list can be skewed, + # try to remove all 1s from list in this case and + # see if the list contains elements, if yes, then use + # the mode after removing 1s + elements = list(filter(lambda x: x != 1, elements)) + if len(elements): + ncols = max(set(elements), key=elements.count) + else: + warnings.warn( + f"No tables found in table area {table_idx + 1}" + ) + cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] + cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend( + [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > left and t.x1 < right + ] ) - cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] - cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) - inner_text = [] - for i in range(1, len(cols)): - left = cols[i - 1][1] - right = cols[i][0] - inner_text.extend( - [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > left and t.x1 < right - ] - ) - outer_text = [ - t - for direction in self.t_bbox - for t in self.t_bbox[direction] - if t.x0 > cols[-1][1] or t.x1 < cols[0][0] - ] - inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.row_tol) - cols = self._join_columns(cols, text_x_min, text_x_max) + outer_text = [ + t + for direction in self.t_bbox + for t in self.t_bbox[direction] + if t.x0 > cols[-1][1] or t.x1 < cols[0][0] + ] + inner_text.extend(outer_text) + cols = self._add_columns(cols, inner_text, self.row_tol) + cols = self._join_columns(cols, text_x_min, text_x_max) return cols, rows diff --git a/tests/files/blank.pdf b/tests/files/blank.pdf deleted file mode 100755 index 99540f1..0000000 Binary files a/tests/files/blank.pdf and /dev/null differ diff --git a/tests/files/empty.pdf b/tests/files/empty.pdf new file mode 100644 index 0000000..52aeefb Binary files /dev/null and b/tests/files/empty.pdf differ diff --git a/tests/files/only_page_number.pdf b/tests/files/only_page_number.pdf new file mode 100644 index 0000000..7b4ecfe Binary files /dev/null and b/tests/files/only_page_number.pdf differ diff --git a/tests/test_cli.py b/tests/test_cli.py index cddc9a2..f897315 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -160,8 +160,8 @@ def test_cli_output_format(): def test_cli_quiet(): with TemporaryDirectory() as tempdir: - infile = os.path.join(testdir, "blank.pdf") - outfile = os.path.join(tempdir, "blank.csv") + infile = os.path.join(testdir, "empty.pdf") + outfile = os.path.join(tempdir, "empty.csv") runner = CliRunner() result = runner.invoke( diff --git a/tests/test_errors.py b/tests/test_errors.py index 2849110..595c54b 100755 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -55,15 +55,33 @@ def test_image_warning(): ) -def test_no_tables_found(): - filename = os.path.join(testdir, "blank.pdf") +def test_lattice_no_tables_on_page(): + filename = os.path.join(testdir, "empty.pdf") with warnings.catch_warnings(): warnings.simplefilter("error") with pytest.raises(UserWarning) as e: - tables = camelot.read_pdf(filename) + tables = camelot.read_pdf(filename, flavor="lattice") assert str(e.value) == "No tables found on page-1" +def test_stream_no_tables_on_page(): + filename = os.path.join(testdir, "empty.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename, flavor="stream") + assert str(e.value) == "No tables found on page-1" + + +def test_stream_no_tables_in_area(): + filename = os.path.join(testdir, "only_page_number.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename, flavor="stream") + assert str(e.value) == "No tables found in table area 1" + + def test_no_tables_found_logs_suppressed(): filename = os.path.join(testdir, "foo.pdf") with warnings.catch_warnings(): @@ -77,7 +95,7 @@ def test_no_tables_found_logs_suppressed(): def test_no_tables_found_warnings_suppressed(): - filename = os.path.join(testdir, "blank.pdf") + filename = os.path.join(testdir, "empty.pdf") with warnings.catch_warnings(): # the test should fail if any warning is thrown warnings.simplefilter("error")