Update tests with new API
* Update Lattice tests with new API
* Update Stream tests with new API, fix CLI
* Add table_area test, Stream fixes
pull/2/head
parent
a94c350a7b
commit
439059817d
|
|
@ -77,7 +77,7 @@ def _merge_columns(l, mtol=0):
|
||||||
merged.append(higher)
|
merged.append(higher)
|
||||||
else:
|
else:
|
||||||
lower = merged[-1]
|
lower = merged[-1]
|
||||||
if mtol > 0:
|
if mtol >= 0:
|
||||||
if (higher[0] <= lower[1] or
|
if (higher[0] <= lower[1] or
|
||||||
np.isclose(higher[0], lower[1], atol=mtol)):
|
np.isclose(higher[0], lower[1], atol=mtol)):
|
||||||
upper_bound = max(lower[1], higher[1])
|
upper_bound = max(lower[1], higher[1])
|
||||||
|
|
@ -306,8 +306,8 @@ class Stream:
|
||||||
for i in range(1, len(cols)):
|
for i in range(1, len(cols)):
|
||||||
left = cols[i - 1][1]
|
left = cols[i - 1][1]
|
||||||
right = cols[i][0]
|
right = cols[i][0]
|
||||||
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
|
inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right])
|
||||||
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
|
||||||
inner_text.extend(outer_text)
|
inner_text.extend(outer_text)
|
||||||
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
cols = _add_columns(cols, inner_text, self.ytol[table_no])
|
||||||
cols = _join_columns(cols, k[0], k[2])
|
cols = _join_columns(cols, k[0], k[2])
|
||||||
|
|
|
||||||
|
|
@ -23,10 +23,10 @@ def test_lattice_basic():
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir,
|
pdfname = os.path.join(testdir,
|
||||||
"tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf")
|
"tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf")
|
||||||
extractor = Lattice(Pdf(pdfname,
|
manager = Pdf(Lattice(), pdfname, pagenos=[{'start': 2, 'end': 2}],
|
||||||
pagenos=[{'start': 2, 'end': 2}], clean=True))
|
clean=True)
|
||||||
tables = extractor.get_tables()
|
tables = manager.extract()
|
||||||
assert_equal(tables['page-2'][0], data)
|
assert_equal(tables['page-2']['table-1']['data'], data)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_fill():
|
def test_lattice_fill():
|
||||||
|
|
@ -74,9 +74,9 @@ def test_lattice_fill():
|
||||||
["Source: Data Warehouse 12/14/15","","",""]
|
["Source: Data Warehouse 12/14/15","","",""]
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, 'row_span_1.pdf')
|
pdfname = os.path.join(testdir, 'row_span_1.pdf')
|
||||||
extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
|
manager = Pdf(Lattice(fill='v', scale=40), pdfname, clean=True)
|
||||||
tables = extractor.get_tables()
|
tables = manager.extract()
|
||||||
assert_equal(tables['pagea-1'][0], data)
|
assert_equal(tables['page-1']['table-1']['data'], data)
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_invert():
|
def test_lattice_invert():
|
||||||
|
|
@ -92,6 +92,6 @@ def test_lattice_invert():
|
||||||
["Total","","47","92","11.81","22,455","19,584","10,644"]
|
["Total","","47","92","11.81","22,455","19,584","10,644"]
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
|
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
|
||||||
extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
|
manager = Pdf(Lattice(invert=True), pdfname, clean=True)
|
||||||
tables = extractor.get_tables()
|
tables = manager.extract()
|
||||||
assert_equal(tables['page-1'][1], data)
|
assert_equal(tables['page-1']['table-2']['data'], data)
|
||||||
|
|
@ -13,59 +13,30 @@ testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
def test_stream_basic():
|
def test_stream_basic():
|
||||||
|
|
||||||
data = [
|
data = [
|
||||||
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
|
["", "Table 6.", ""],
|
||||||
["Entidad","","Municipio","","Localidad",""],
|
["", "U.S. Production, Imports, Exports, and Net Supply of Conventional Pesticides", ""],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
|
["", "at Producer Level, 1994/95 Estimates.", ""],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
|
["", "Active Ingredient", "Sales Value"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
|
["", "(in billions of lbs.)", "(in billions of dollars)"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
|
["Category", "1994/95", "1994/95"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
|
["U.S. Production", "1.3", "7.0"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
|
["U.S. Imports", "0.2", "2.2"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
|
["Total Supply", "1.5", "9.2"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
|
["U.S. Exports", "0.5", "2.6"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
|
["Net Supply/Usage", "1.0", "6.6"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
|
["SOURCE:", "EPA estimates based on ACPA Surveys, Department of Commerce Publications, tabulations and other", ""],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
|
["sources.", "", ""],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
|
["16\xe2\x80\x9494/95 Pesticides Industry Sales And Usage", "", ""]
|
||||||
["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
|
|
||||||
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
|
|
||||||
]
|
]
|
||||||
|
|
||||||
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
|
pdfname = os.path.join(testdir, "tabula_test_pdfs/us-024.pdf")
|
||||||
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}],
|
manager = Pdf(Stream(), pdfname, pagenos=[{"start": 1, "end": 1}],
|
||||||
clean=True))
|
clean=True)
|
||||||
tables = extractor.get_tables()
|
tables = manager.extract()
|
||||||
assert_equal(tables['page-1'][0], data)
|
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_ncolumns():
|
def test_stream_missing_value():
|
||||||
|
|
||||||
data = [
|
data = [
|
||||||
["Bhandara - Key Indicators","","","",""],
|
["Bhandara - Key Indicators","","","",""],
|
||||||
|
|
@ -110,14 +81,36 @@ def test_stream_ncolumns():
|
||||||
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
|
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
|
||||||
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
|
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
|
||||||
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
|
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
|
||||||
["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
|
["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""],
|
||||||
["","4","","",""]
|
["4","","","",""]
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, 'missing_values.pdf')
|
pdfname = os.path.join(testdir, "missing_values.pdf")
|
||||||
extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
|
manager = Pdf(Stream(margins=(1.0, 0.5, 0.1)), pdfname, clean=True)
|
||||||
ncolumns=5)
|
tables = manager.extract()
|
||||||
tables = extractor.get_tables()
|
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||||
assert_equal(tables['page-1'][0], data)
|
|
||||||
|
|
||||||
|
def test_stream_single_table_area():
|
||||||
|
|
||||||
|
data = [
|
||||||
|
["","One Withholding"],
|
||||||
|
["Payroll Period","Allowance"],
|
||||||
|
["Weekly","$71.15"],
|
||||||
|
["Biweekly","142.31"],
|
||||||
|
["Semimonthly","154.17"],
|
||||||
|
["Monthly","308.33"],
|
||||||
|
["Quarterly","925.00"],
|
||||||
|
["Semiannually","1,850.00"],
|
||||||
|
["Annually","3,700.00"],
|
||||||
|
["Daily or Miscellaneous","14.23"],
|
||||||
|
["(each day of the payroll period)",""]
|
||||||
|
]
|
||||||
|
pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf")
|
||||||
|
manager = Pdf(Stream(table_area=["320,500,573,335"], ytol=[10],
|
||||||
|
margins=(1.0, 0.5, 0.1)),
|
||||||
|
pdfname, pagenos=[{"start": 1, "end": 1}], clean=True)
|
||||||
|
tables = manager.extract()
|
||||||
|
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_columns():
|
def test_stream_columns():
|
||||||
|
|
@ -167,8 +160,8 @@ def test_stream_columns():
|
||||||
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
|
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
|
||||||
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
|
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
|
pdfname = os.path.join(testdir, "mexican_towns.pdf")
|
||||||
extractor = Stream(Pdf(pdfname, clean=True),
|
manager = Pdf(Stream(columns=["28,67,180,230,425,475,700"], ytol=[10]), pdfname,
|
||||||
columns='28,67,180,230,425,475,700')
|
clean=True)
|
||||||
tables = extractor.get_tables()
|
tables = manager.extract()
|
||||||
assert_equal(tables['page-1'][0], data)
|
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||||
|
|
@ -339,14 +339,13 @@ if __name__ == '__main__':
|
||||||
float(args['--wmargin']))
|
float(args['--wmargin']))
|
||||||
if args['<method>'] == 'lattice':
|
if args['<method>'] == 'lattice':
|
||||||
try:
|
try:
|
||||||
manager = Pdf(Lattice(
|
tarea = args['--tarea'] if args['--tarea'] else None
|
||||||
table_area=args['--tarea'],
|
fill = args['--fill'] if args['--fill'] else None
|
||||||
fill=args['--fill'],
|
jtol = [int(j) for j in args['--jtol']]
|
||||||
jtol=[int(j) for j in args['--jtol']],
|
mtol = [int(m) for m in args['--mtol']]
|
||||||
mtol=[int(m) for m in args['--mtol']],
|
manager = Pdf(Lattice(table_area=tarea, fill=fill, jtol=jtol,
|
||||||
scale=int(args['--scale']),
|
mtol=mtol, scale=int(args['--scale']),
|
||||||
invert=args['--invert'],
|
invert=args['--invert'], margins=margins,
|
||||||
margins=margins,
|
|
||||||
debug=args['--debug']),
|
debug=args['--debug']),
|
||||||
filename,
|
filename,
|
||||||
pagenos=p,
|
pagenos=p,
|
||||||
|
|
@ -406,14 +405,17 @@ if __name__ == '__main__':
|
||||||
sys.exit()
|
sys.exit()
|
||||||
elif args['<method>'] == 'stream':
|
elif args['<method>'] == 'stream':
|
||||||
try:
|
try:
|
||||||
manager = Pdf(Stream(
|
tarea = args['--tarea'] if args['--tarea'] else None
|
||||||
table_area=args['--tarea'],
|
columns = args['--columns'] if args['--columns'] else None
|
||||||
columns=args['--columns'],
|
if args['--ncols']:
|
||||||
ncolumns=[int(nc) for nc in args['--ncols']],
|
ncolumns = [int(nc) for nc in args['--ncols']]
|
||||||
ytol=[int(y) for y in args['--ytol']],
|
else:
|
||||||
mtol=[int(m) for m in args['--mtol']],
|
ncolumns = None
|
||||||
margins=margins,
|
ytol = [int(y) for y in args['--ytol']]
|
||||||
debug=args['--debug']),
|
mtol = [int(m) for m in args['--mtol']]
|
||||||
|
manager = Pdf(Stream(table_area=tarea, columns=columns,
|
||||||
|
ncolumns=ncolumns, ytol=ytol, mtol=mtol,
|
||||||
|
margins=margins, debug=args['--debug']),
|
||||||
filename,
|
filename,
|
||||||
pagenos=p,
|
pagenos=p,
|
||||||
parallel=args['--parallel'],
|
parallel=args['--parallel'],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue