Prevent taking max of an empty set
parent
9087429501
commit
5d20d56e48
|
|
@ -121,6 +121,7 @@ class Stream(BaseParser):
|
||||||
row_y = 0
|
row_y = 0
|
||||||
rows = []
|
rows = []
|
||||||
temp = []
|
temp = []
|
||||||
|
|
||||||
for t in text:
|
for t in text:
|
||||||
# is checking for upright necessary?
|
# is checking for upright necessary?
|
||||||
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
|
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
|
||||||
|
|
@ -131,8 +132,10 @@ class Stream(BaseParser):
|
||||||
temp = []
|
temp = []
|
||||||
row_y = t.y0
|
row_y = t.y0
|
||||||
temp.append(t)
|
temp.append(t)
|
||||||
|
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
__ = rows.pop(0) # TODO: hacky
|
if len(rows) > 1:
|
||||||
|
__ = rows.pop(0) # TODO: hacky
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -345,43 +348,46 @@ class Stream(BaseParser):
|
||||||
else:
|
else:
|
||||||
# calculate mode of the list of number of elements in
|
# calculate mode of the list of number of elements in
|
||||||
# each row to guess the number of columns
|
# each row to guess the number of columns
|
||||||
ncols = max(set(elements), key=elements.count)
|
if not len(elements):
|
||||||
if ncols == 1:
|
cols = [(text_x_min, text_x_max)]
|
||||||
# if mode is 1, the page usually contains no tables
|
else:
|
||||||
# but there can be cases where the list can be skewed,
|
ncols = max(set(elements), key=elements.count)
|
||||||
# try to remove all 1s from list in this case and
|
if ncols == 1:
|
||||||
# see if the list contains elements, if yes, then use
|
# if mode is 1, the page usually contains no tables
|
||||||
# the mode after removing 1s
|
# but there can be cases where the list can be skewed,
|
||||||
elements = list(filter(lambda x: x != 1, elements))
|
# try to remove all 1s from list in this case and
|
||||||
if len(elements):
|
# see if the list contains elements, if yes, then use
|
||||||
ncols = max(set(elements), key=elements.count)
|
# the mode after removing 1s
|
||||||
else:
|
elements = list(filter(lambda x: x != 1, elements))
|
||||||
warnings.warn(
|
if len(elements):
|
||||||
f"No tables found in table area {table_idx + 1}"
|
ncols = max(set(elements), key=elements.count)
|
||||||
|
else:
|
||||||
|
warnings.warn(
|
||||||
|
f"No tables found in table area {table_idx + 1}"
|
||||||
|
)
|
||||||
|
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
||||||
|
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
|
||||||
|
inner_text = []
|
||||||
|
for i in range(1, len(cols)):
|
||||||
|
left = cols[i - 1][1]
|
||||||
|
right = cols[i][0]
|
||||||
|
inner_text.extend(
|
||||||
|
[
|
||||||
|
t
|
||||||
|
for direction in self.t_bbox
|
||||||
|
for t in self.t_bbox[direction]
|
||||||
|
if t.x0 > left and t.x1 < right
|
||||||
|
]
|
||||||
)
|
)
|
||||||
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
|
outer_text = [
|
||||||
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
|
t
|
||||||
inner_text = []
|
for direction in self.t_bbox
|
||||||
for i in range(1, len(cols)):
|
for t in self.t_bbox[direction]
|
||||||
left = cols[i - 1][1]
|
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
|
||||||
right = cols[i][0]
|
]
|
||||||
inner_text.extend(
|
inner_text.extend(outer_text)
|
||||||
[
|
cols = self._add_columns(cols, inner_text, self.row_tol)
|
||||||
t
|
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||||
for direction in self.t_bbox
|
|
||||||
for t in self.t_bbox[direction]
|
|
||||||
if t.x0 > left and t.x1 < right
|
|
||||||
]
|
|
||||||
)
|
|
||||||
outer_text = [
|
|
||||||
t
|
|
||||||
for direction in self.t_bbox
|
|
||||||
for t in self.t_bbox[direction]
|
|
||||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
|
|
||||||
]
|
|
||||||
inner_text.extend(outer_text)
|
|
||||||
cols = self._add_columns(cols, inner_text, self.row_tol)
|
|
||||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
|
||||||
|
|
||||||
return cols, rows
|
return cols, rows
|
||||||
|
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -160,8 +160,8 @@ def test_cli_output_format():
|
||||||
|
|
||||||
def test_cli_quiet():
|
def test_cli_quiet():
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
infile = os.path.join(testdir, "blank.pdf")
|
infile = os.path.join(testdir, "empty.pdf")
|
||||||
outfile = os.path.join(tempdir, "blank.csv")
|
outfile = os.path.join(tempdir, "empty.csv")
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
|
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
|
|
|
||||||
|
|
@ -55,15 +55,33 @@ def test_image_warning():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found():
|
def test_lattice_no_tables_on_page():
|
||||||
filename = os.path.join(testdir, "blank.pdf")
|
filename = os.path.join(testdir, "empty.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("error")
|
warnings.simplefilter("error")
|
||||||
with pytest.raises(UserWarning) as e:
|
with pytest.raises(UserWarning) as e:
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename, flavor="lattice")
|
||||||
assert str(e.value) == "No tables found on page-1"
|
assert str(e.value) == "No tables found on page-1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_no_tables_on_page():
|
||||||
|
filename = os.path.join(testdir, "empty.pdf")
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
|
assert str(e.value) == "No tables found on page-1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_no_tables_in_area():
|
||||||
|
filename = os.path.join(testdir, "only_page_number.pdf")
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
|
assert str(e.value) == "No tables found in table area 1"
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found_logs_suppressed():
|
def test_no_tables_found_logs_suppressed():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
|
|
@ -77,7 +95,7 @@ def test_no_tables_found_logs_suppressed():
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found_warnings_suppressed():
|
def test_no_tables_found_warnings_suppressed():
|
||||||
filename = os.path.join(testdir, "blank.pdf")
|
filename = os.path.join(testdir, "empty.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
# the test should fail if any warning is thrown
|
# the test should fail if any warning is thrown
|
||||||
warnings.simplefilter("error")
|
warnings.simplefilter("error")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue