Hybrid parser fixes

Improve the parser comparison notebook to flag identical parses and
display multiple tables correctly.
Fix tolerance parameter inclusion for the hybrid parser.
pull/153/head
Frh 2020-05-04 18:52:11 -07:00
parent 7fae107560
commit 63adfd5468
3 changed files with 250 additions and 46 deletions

View File

@ -179,7 +179,9 @@ class Hybrid(BaseParser):
self.table_bbox_parses[lattice_bbox] = self.lattice_parser self.table_bbox_parses[lattice_bbox] = self.lattice_parser
else: else:
network_cols_boundaries = self._augment_boundaries_with_splits( network_cols_boundaries = self._augment_boundaries_with_splits(
network_cols_boundaries, lattice_cols) # self.column_tol??? network_cols_boundaries,
lattice_cols,
self.lattice_parser.joint_tol)
augmented_bbox = ( augmented_bbox = (
network_cols_boundaries[0][0], network_bbox[1], network_cols_boundaries[0][0], network_bbox[1],
network_cols_boundaries[-1][1], network_bbox[3], network_cols_boundaries[-1][1], network_bbox[3],

File diff suppressed because one or more lines are too long

View File

@ -299,6 +299,17 @@ def test_hybrid():
tables = camelot.read_pdf(filename, flavor="hybrid") tables = camelot.read_pdf(filename, flavor="hybrid")
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
def test_hybrid_two_tables():
    """Check that the hybrid flavor yields two tables for tabula/12s0324.pdf.

    The expected frames are built from ``data_network_two_tables_1`` and
    ``data_network_two_tables_2``; each parsed table's ``df`` must equal
    the expected frame at the same position.
    """
    expected_frames = (
        pd.DataFrame(data_network_two_tables_1),
        pd.DataFrame(data_network_two_tables_2),
    )
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")

    # Verify the count first so a wrong number of tables is reported
    # directly rather than surfacing as an index error below.
    assert len(parsed) == 2
    for position, expected in enumerate(expected_frames):
        assert expected.equals(parsed[position].df)
def test_hybrid_vertical_header(): def test_hybrid_vertical_header():
"""Tests a complex table with a vertically text header. """Tests a complex table with a vertically text header.
""" """