Update tests with new API

* Update Lattice tests with new API

* Update Stream tests with new API, fix CLI

* Add table_area test, Stream fixes
pull/2/head
Vinayak Mehta 2016-09-09 16:56:25 +05:30 committed by GitHub
parent a94c350a7b
commit 439059817d
4 changed files with 85 additions and 90 deletions

View File

@ -77,7 +77,7 @@ def _merge_columns(l, mtol=0):
merged.append(higher) merged.append(higher)
else: else:
lower = merged[-1] lower = merged[-1]
if mtol > 0: if mtol >= 0:
if (higher[0] <= lower[1] or if (higher[0] <= lower[1] or
np.isclose(higher[0], lower[1], atol=mtol)): np.isclose(higher[0], lower[1], atol=mtol)):
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
@ -306,8 +306,8 @@ class Stream:
for i in range(1, len(cols)): for i in range(1, len(cols)):
left = cols[i - 1][1] left = cols[i - 1][1]
right = cols[i][0] right = cols[i][0]
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right]) inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right])
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text) inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol[table_no]) cols = _add_columns(cols, inner_text, self.ytol[table_no])
cols = _join_columns(cols, k[0], k[2]) cols = _join_columns(cols, k[0], k[2])

View File

@ -23,10 +23,10 @@ def test_lattice_basic():
] ]
pdfname = os.path.join(testdir, pdfname = os.path.join(testdir,
"tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf") "tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf")
extractor = Lattice(Pdf(pdfname, manager = Pdf(Lattice(), pdfname, pagenos=[{'start': 2, 'end': 2}],
pagenos=[{'start': 2, 'end': 2}], clean=True)) clean=True)
tables = extractor.get_tables() tables = manager.extract()
assert_equal(tables['page-2'][0], data) assert_equal(tables['page-2']['table-1']['data'], data)
def test_lattice_fill(): def test_lattice_fill():
@ -74,9 +74,9 @@ def test_lattice_fill():
["Source: Data Warehouse 12/14/15","","",""] ["Source: Data Warehouse 12/14/15","","",""]
] ]
pdfname = os.path.join(testdir, 'row_span_1.pdf') pdfname = os.path.join(testdir, 'row_span_1.pdf')
extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40) manager = Pdf(Lattice(fill='v', scale=40), pdfname, clean=True)
tables = extractor.get_tables() tables = manager.extract()
assert_equal(tables['pagea-1'][0], data) assert_equal(tables['page-1']['table-1']['data'], data)
def test_lattice_invert(): def test_lattice_invert():
@ -92,6 +92,6 @@ def test_lattice_invert():
["Total","","47","92","11.81","22,455","19,584","10,644"] ["Total","","47","92","11.81","22,455","19,584","10,644"]
] ]
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
extractor = Lattice(Pdf(pdfname, clean=True), invert=True) manager = Pdf(Lattice(invert=True), pdfname, clean=True)
tables = extractor.get_tables() tables = manager.extract()
assert_equal(tables['page-1'][1], data) assert_equal(tables['page-1']['table-2']['data'], data)

View File

@ -13,59 +13,30 @@ testdir = os.path.dirname(os.path.abspath(__file__))
def test_stream_basic(): def test_stream_basic():
data = [ data = [
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], ["", "Table 6.", ""],
["Entidad","","Municipio","","Localidad",""], ["", "U.S. Production, Imports, Exports, and Net Supply of Conventional Pesticides", ""],
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], ["", "at Producer Level, 1994/95 Estimates.", ""],
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], ["", "Active Ingredient", "Sales Value"],
["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], ["", "(in billions of lbs.)", "(in billions of dollars)"],
["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], ["Category", "1994/95", "1994/95"],
["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], ["U.S. Production", "1.3", "7.0"],
["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], ["U.S. Imports", "0.2", "2.2"],
["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], ["Total Supply", "1.5", "9.2"],
["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], ["U.S. Exports", "0.5", "2.6"],
["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], ["Net Supply/Usage", "1.0", "6.6"],
["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], ["SOURCE:", "EPA estimates based on ACPA Surveys, Department of Commerce Publications, tabulations and other", ""],
["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], ["sources.", "", ""],
["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], ["16\xe2\x80\x9494/95 Pesticides Industry Sales And Usage", "", ""]
["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
] ]
pdfname = os.path.join(testdir, 'mexican_towns.pdf') pdfname = os.path.join(testdir, "tabula_test_pdfs/us-024.pdf")
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}], manager = Pdf(Stream(), pdfname, pagenos=[{"start": 1, "end": 1}],
clean=True)) clean=True)
tables = extractor.get_tables() tables = manager.extract()
assert_equal(tables['page-1'][0], data) assert_equal(tables["page-1"]["table-1"]["data"], data)
def test_stream_ncolumns(): def test_stream_missing_value():
data = [ data = [
["Bhandara - Key Indicators","","","",""], ["Bhandara - Key Indicators","","","",""],
@ -110,14 +81,36 @@ def test_stream_ncolumns():
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""], ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""], ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""],
["","4","","",""] ["4","","","",""]
] ]
pdfname = os.path.join(testdir, 'missing_values.pdf') pdfname = os.path.join(testdir, "missing_values.pdf")
extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True), manager = Pdf(Stream(margins=(1.0, 0.5, 0.1)), pdfname, clean=True)
ncolumns=5) tables = manager.extract()
tables = extractor.get_tables() assert_equal(tables["page-1"]["table-1"]["data"], data)
assert_equal(tables['page-1'][0], data)
def test_stream_single_table_area():
    """Check Stream extraction when a single table_area is supplied.

    Runs a Stream extractor (one table_area, ytol of 10 and explicit
    margins) through Pdf on page 1 of us-007.pdf and compares the
    "data" stored under "page-1" / "table-1" with the expected rows.
    """
    expected_rows = [
        ["", "One Withholding"],
        ["Payroll Period", "Allowance"],
        ["Weekly", "$71.15"],
        ["Biweekly", "142.31"],
        ["Semimonthly", "154.17"],
        ["Monthly", "308.33"],
        ["Quarterly", "925.00"],
        ["Semiannually", "1,850.00"],
        ["Annually", "3,700.00"],
        ["Daily or Miscellaneous", "14.23"],
        ["(each day of the payroll period)", ""],
    ]
    pdf_path = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf")
    # Configure the extractor first so the Pdf call stays readable.
    stream = Stream(
        table_area=["320,500,573,335"],
        ytol=[10],
        margins=(1.0, 0.5, 0.1),
    )
    pdf = Pdf(stream, pdf_path, pagenos=[{"start": 1, "end": 1}], clean=True)
    extracted = pdf.extract()
    assert_equal(extracted["page-1"]["table-1"]["data"], expected_rows)
def test_stream_columns(): def test_stream_columns():
@ -167,8 +160,8 @@ def test_stream_columns():
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
] ]
pdfname = os.path.join(testdir, 'mexican_towns.pdf') pdfname = os.path.join(testdir, "mexican_towns.pdf")
extractor = Stream(Pdf(pdfname, clean=True), manager = Pdf(Stream(columns=["28,67,180,230,425,475,700"], ytol=[10]), pdfname,
columns='28,67,180,230,425,475,700') clean=True)
tables = extractor.get_tables() tables = manager.extract()
assert_equal(tables['page-1'][0], data) assert_equal(tables["page-1"]["table-1"]["data"], data)

View File

@ -339,14 +339,13 @@ if __name__ == '__main__':
float(args['--wmargin'])) float(args['--wmargin']))
if args['<method>'] == 'lattice': if args['<method>'] == 'lattice':
try: try:
manager = Pdf(Lattice( tarea = args['--tarea'] if args['--tarea'] else None
table_area=args['--tarea'], fill = args['--fill'] if args['--fill'] else None
fill=args['--fill'], jtol = [int(j) for j in args['--jtol']]
jtol=[int(j) for j in args['--jtol']], mtol = [int(m) for m in args['--mtol']]
mtol=[int(m) for m in args['--mtol']], manager = Pdf(Lattice(table_area=tarea, fill=fill, jtol=jtol,
scale=int(args['--scale']), mtol=mtol, scale=int(args['--scale']),
invert=args['--invert'], invert=args['--invert'], margins=margins,
margins=margins,
debug=args['--debug']), debug=args['--debug']),
filename, filename,
pagenos=p, pagenos=p,
@ -406,14 +405,17 @@ if __name__ == '__main__':
sys.exit() sys.exit()
elif args['<method>'] == 'stream': elif args['<method>'] == 'stream':
try: try:
manager = Pdf(Stream( tarea = args['--tarea'] if args['--tarea'] else None
table_area=args['--tarea'], columns = args['--columns'] if args['--columns'] else None
columns=args['--columns'], if args['--ncols']:
ncolumns=[int(nc) for nc in args['--ncols']], ncolumns = [int(nc) for nc in args['--ncols']]
ytol=[int(y) for y in args['--ytol']], else:
mtol=[int(m) for m in args['--mtol']], ncolumns = None
margins=margins, ytol = [int(y) for y in args['--ytol']]
debug=args['--debug']), mtol = [int(m) for m in args['--mtol']]
manager = Pdf(Stream(table_area=tarea, columns=columns,
ncolumns=ncolumns, ytol=ytol, mtol=mtol,
margins=margins, debug=args['--debug']),
filename, filename,
pagenos=p, pagenos=p,
parallel=args['--parallel'], parallel=args['--parallel'],