diff --git a/camelot/stream.py b/camelot/stream.py index 0effb08..fb06c38 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -77,7 +77,7 @@ def _merge_columns(l, mtol=0): merged.append(higher) else: lower = merged[-1] - if mtol > 0: + if mtol >= 0: if (higher[0] <= lower[1] or np.isclose(higher[0], lower[1], atol=mtol)): upper_bound = max(lower[1], higher[1]) @@ -306,8 +306,8 @@ class Stream: for i in range(1, len(cols)): left = cols[i - 1][1] right = cols[i][0] - inner_text.extend([t for t in text if t.x0 > left and t.x1 < right]) - outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] + inner_text.extend([t for t in t_bbox if t.x0 > left and t.x1 < right]) + outer_text = [t for t in t_bbox if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] inner_text.extend(outer_text) cols = _add_columns(cols, inner_text, self.ytol[table_no]) cols = _join_columns(cols, k[0], k[2]) diff --git a/tests/test_lattice.py b/tests/test_lattice.py index 55e3086..419114b 100644 --- a/tests/test_lattice.py +++ b/tests/test_lattice.py @@ -23,10 +23,10 @@ def test_lattice_basic(): ] pdfname = os.path.join(testdir, "tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf") - extractor = Lattice(Pdf(pdfname, - pagenos=[{'start': 2, 'end': 2}], clean=True)) - tables = extractor.get_tables() - assert_equal(tables['page-2'][0], data) + manager = Pdf(Lattice(), pdfname, pagenos=[{'start': 2, 'end': 2}], + clean=True) + tables = manager.extract() + assert_equal(tables['page-2']['table-1']['data'], data) def test_lattice_fill(): @@ -74,9 +74,9 @@ def test_lattice_fill(): ["Source: Data Warehouse 12/14/15","","",""] ] pdfname = os.path.join(testdir, 'row_span_1.pdf') - extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40) - tables = extractor.get_tables() - assert_equal(tables['pagea-1'][0], data) + manager = Pdf(Lattice(fill='v', scale=40), pdfname, clean=True) + tables = manager.extract() + assert_equal(tables['page-1']['table-1']['data'], data) def test_lattice_invert(): @@ -92,6 +92,6 @@ def test_lattice_invert(): ["Total","","47","92","11.81","22,455","19,584","10,644"] ] pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') - extractor = Lattice(Pdf(pdfname, clean=True), invert=True) - tables = extractor.get_tables() - assert_equal(tables['page-1'][1], data) \ No newline at end of file + manager = Pdf(Lattice(invert=True), pdfname, clean=True) + tables = manager.extract() + assert_equal(tables['page-1']['table-2']['data'], data) \ No newline at end of file diff --git a/tests/test_stream.py b/tests/test_stream.py index 2a3d05e..85eca91 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -13,59 +13,30 @@ testdir = os.path.dirname(os.path.abspath(__file__)) def test_stream_basic(): data = [ - ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], - ["Entidad","","Municipio","","Localidad",""], - ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], - ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], - ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], - ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], - ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], - ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], - ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], - ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], - ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], - ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], - ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], - ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], - ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"], - ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"], - ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"], - ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"], - ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"], - ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"], - ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"], - ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"], - ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"], - ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"], - ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"], - ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"], - ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"], - ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"], - ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"], - ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"], - ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"], - ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"], - ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"], - ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"], - ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"], - ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"], - ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"], - ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"], - ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], - ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], - ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], - ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], - ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], + ["", "Table 6.", ""], + ["", "U.S. Production, Imports, Exports, and Net Supply of Conventional Pesticides", ""], + ["", "at Producer Level, 1994/95 Estimates.", ""], + ["", "Active Ingredient", "Sales Value"], + ["", "(in billions of lbs.)", "(in billions of dollars)"], + ["Category", "1994/95", "1994/95"], + ["U.S. Production", "1.3", "7.0"], + ["U.S. Imports", "0.2", "2.2"], + ["Total Supply", "1.5", "9.2"], + ["U.S. Exports", "0.5", "2.6"], + ["Net Supply/Usage", "1.0", "6.6"], + ["SOURCE:", "EPA estimates based on ACPA Surveys, Department of Commerce Publications, tabulations and other", ""], + ["sources.", "", ""], + ["16\xe2\x80\x9494/95 Pesticides Industry Sales And Usage", "", ""] ] - pdfname = os.path.join(testdir, 'mexican_towns.pdf') - extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}], - clean=True)) - tables = extractor.get_tables() - assert_equal(tables['page-1'][0], data) + pdfname = os.path.join(testdir, "tabula_test_pdfs/us-024.pdf") + manager = Pdf(Stream(), pdfname, pagenos=[{"start": 1, "end": 1}], + clean=True) + tables = manager.extract() + assert_equal(tables["page-1"]["table-1"]["data"], data) -def test_stream_ncolumns(): +def test_stream_missing_value(): data = [ ["Bhandara - Key Indicators","","","",""], @@ -110,14 +81,36 @@ def test_stream_ncolumns(): ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""], - ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""], - ["","4","","",""] + ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""], + ["4","","","",""] ] - pdfname = os.path.join(testdir, 'missing_values.pdf') - extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True), - ncolumns=5) - tables = extractor.get_tables() - assert_equal(tables['page-1'][0], data) + pdfname = os.path.join(testdir, "missing_values.pdf") + manager = Pdf(Stream(margins=(1.0, 0.5, 0.1)), pdfname, clean=True) + tables = manager.extract() + assert_equal(tables["page-1"]["table-1"]["data"], data) + + +def test_stream_single_table_area(): + + data = [ + ["","One Withholding"], + ["Payroll Period","Allowance"], + ["Weekly","$71.15"], + ["Biweekly","142.31"], + ["Semimonthly","154.17"], + ["Monthly","308.33"], + ["Quarterly","925.00"], + ["Semiannually","1,850.00"], + ["Annually","3,700.00"], + ["Daily or Miscellaneous","14.23"], + ["(each day of the payroll period)",""] + ] + pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf") + manager = Pdf(Stream(table_area=["320,500,573,335"], ytol=[10], + margins=(1.0, 0.5, 0.1)), + pdfname, pagenos=[{"start": 1, "end": 1}], clean=True) + tables = manager.extract() + assert_equal(tables["page-1"]["table-1"]["data"], data) def test_stream_columns(): @@ -167,8 +160,8 @@ def test_stream_columns(): ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], ] - pdfname = os.path.join(testdir, 'mexican_towns.pdf') - extractor = Stream(Pdf(pdfname, clean=True), - columns='28,67,180,230,425,475,700') - tables = extractor.get_tables() - assert_equal(tables['page-1'][0], data) \ No newline at end of file + pdfname = os.path.join(testdir, "mexican_towns.pdf") + manager = Pdf(Stream(columns=["28,67,180,230,425,475,700"], ytol=[10]), pdfname, + clean=True) + tables = manager.extract() + assert_equal(tables["page-1"]["table-1"]["data"], data) \ No newline at end of file diff --git a/tools/camelot b/tools/camelot index ee43b65..1565856 100755 --- a/tools/camelot +++ b/tools/camelot @@ -339,14 +339,13 @@ if __name__ == '__main__': float(args['--wmargin'])) if args[''] == 'lattice': try: - manager = Pdf(Lattice( - table_area=args['--tarea'], - fill=args['--fill'], - jtol=[int(j) for j in args['--jtol']], - mtol=[int(m) for m in args['--mtol']], - scale=int(args['--scale']), - invert=args['--invert'], - margins=margins, + tarea = args['--tarea'] if args['--tarea'] else None + fill = args['--fill'] if args['--fill'] else None + jtol = [int(j) for j in args['--jtol']] + mtol = [int(m) for m in args['--mtol']] + manager = Pdf(Lattice(table_area=tarea, fill=fill, jtol=jtol, + mtol=mtol, scale=int(args['--scale']), + invert=args['--invert'], margins=margins, debug=args['--debug']), filename, pagenos=p, @@ -406,14 +405,17 @@ if __name__ == '__main__': sys.exit() elif args[''] == 'stream': try: - manager = Pdf(Stream( - table_area=args['--tarea'], - columns=args['--columns'], - ncolumns=[int(nc) for nc in args['--ncols']], - ytol=[int(y) for y in args['--ytol']], - mtol=[int(m) for m in args['--mtol']], - margins=margins, - debug=args['--debug']), + tarea = args['--tarea'] if args['--tarea'] else None + columns = args['--columns'] if args['--columns'] else None + if args['--ncols']: + ncolumns = [int(nc) for nc in args['--ncols']] + else: + ncolumns = None + ytol = [int(y) for y in args['--ytol']] + mtol = [int(m) for m in args['--mtol']] + manager = Pdf(Stream(table_area=tarea, columns=columns, + ncolumns=ncolumns, ytol=ytol, mtol=mtol, + margins=margins, debug=args['--debug']), filename, pagenos=p, parallel=args['--parallel'],