Prevent taking max of an empty set

pull/189/head
Vinayak Mehta 2020-08-25 22:50:31 +05:30
parent 9087429501
commit 5d20d56e48
No known key found for this signature in database
GPG Key ID: 2170CDB940114C1D
6 changed files with 67 additions and 43 deletions

View File

@ -121,6 +121,7 @@ class Stream(BaseParser):
row_y = 0 row_y = 0
rows = [] rows = []
temp = [] temp = []
for t in text: for t in text:
# is checking for upright necessary? # is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright for obj in t._objs if # if t.get_text().strip() and all([obj.upright for obj in t._objs if
@ -131,8 +132,10 @@ class Stream(BaseParser):
temp = [] temp = []
row_y = t.y0 row_y = t.y0
temp.append(t) temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0)) rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # TODO: hacky if len(rows) > 1:
__ = rows.pop(0) # TODO: hacky
return rows return rows
@staticmethod @staticmethod
@ -345,43 +348,46 @@ class Stream(BaseParser):
else: else:
# calculate mode of the list of number of elements in # calculate mode of the list of number of elements in
# each row to guess the number of columns # each row to guess the number of columns
ncols = max(set(elements), key=elements.count) if not len(elements):
if ncols == 1: cols = [(text_x_min, text_x_max)]
# if mode is 1, the page usually contains not tables else:
# but there can be cases where the list can be skewed, ncols = max(set(elements), key=elements.count)
# try to remove all 1s from list in this case and if ncols == 1:
# see if the list contains elements, if yes, then use # if mode is 1, the page usually contains not tables
# the mode after removing 1s # but there can be cases where the list can be skewed,
elements = list(filter(lambda x: x != 1, elements)) # try to remove all 1s from list in this case and
if len(elements): # see if the list contains elements, if yes, then use
ncols = max(set(elements), key=elements.count) # the mode after removing 1s
else: elements = list(filter(lambda x: x != 1, elements))
warnings.warn( if len(elements):
f"No tables found in table area {table_idx + 1}" ncols = max(set(elements), key=elements.count)
else:
warnings.warn(
f"No tables found in table area {table_idx + 1}"
)
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend(
[
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right
]
) )
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] outer_text = [
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol) t
inner_text = [] for direction in self.t_bbox
for i in range(1, len(cols)): for t in self.t_bbox[direction]
left = cols[i - 1][1] if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
right = cols[i][0] ]
inner_text.extend( inner_text.extend(outer_text)
[ cols = self._add_columns(cols, inner_text, self.row_tol)
t cols = self._join_columns(cols, text_x_min, text_x_max)
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right
]
)
outer_text = [
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
]
inner_text.extend(outer_text)
cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows return cols, rows

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -160,8 +160,8 @@ def test_cli_output_format():
def test_cli_quiet(): def test_cli_quiet():
with TemporaryDirectory() as tempdir: with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, "blank.pdf") infile = os.path.join(testdir, "empty.pdf")
outfile = os.path.join(tempdir, "blank.csv") outfile = os.path.join(tempdir, "empty.csv")
runner = CliRunner() runner = CliRunner()
result = runner.invoke( result = runner.invoke(

View File

@ -55,15 +55,33 @@ def test_image_warning():
) )
def test_no_tables_found(): def test_lattice_no_tables_on_page():
filename = os.path.join(testdir, "blank.pdf") filename = os.path.join(testdir, "empty.pdf")
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("error") warnings.simplefilter("error")
with pytest.raises(UserWarning) as e: with pytest.raises(UserWarning) as e:
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename, flavor="lattice")
assert str(e.value) == "No tables found on page-1" assert str(e.value) == "No tables found on page-1"
def test_stream_no_tables_on_page():
    """Check the stream flavor warns "No tables found on page-1" for ``empty.pdf``."""
    pdf_path = os.path.join(testdir, "empty.pdf")
    expected_message = "No tables found on page-1"
    with warnings.catch_warnings():
        # Escalate warnings to exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="stream")
        # Checked after the raises block; code following the raising call
        # inside that block would never run.
        assert str(excinfo.value) == expected_message
def test_stream_no_tables_in_area():
    """Check the stream flavor warns "No tables found in table area 1" for ``only_page_number.pdf``."""
    pdf_path = os.path.join(testdir, "only_page_number.pdf")
    expected_message = "No tables found in table area 1"
    with warnings.catch_warnings():
        # Escalate warnings to exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="stream")
        # Checked after the raises block; code following the raising call
        # inside that block would never run.
        assert str(excinfo.value) == expected_message
def test_no_tables_found_logs_suppressed(): def test_no_tables_found_logs_suppressed():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
with warnings.catch_warnings(): with warnings.catch_warnings():
@ -77,7 +95,7 @@ def test_no_tables_found_logs_suppressed():
def test_no_tables_found_warnings_suppressed(): def test_no_tables_found_warnings_suppressed():
filename = os.path.join(testdir, "blank.pdf") filename = os.path.join(testdir, "empty.pdf")
with warnings.catch_warnings(): with warnings.catch_warnings():
# the test should fail if any warning is thrown # the test should fail if any warning is thrown
warnings.simplefilter("error") warnings.simplefilter("error")