diff --git a/.gitignore b/.gitignore index 14fc340..5351894 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ build/ dist/ *.egg-info/ .coverage + +.pytest_cache/ diff --git a/camelot/core.py b/camelot/core.py index e3f9bb3..2417291 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -29,7 +29,7 @@ class Cell(object): self._text = '' def __repr__(self): - return ''.format( self.x1, self.y1, self.x2, self.y2) @property @@ -80,7 +80,7 @@ class Table(object): self.page = None def __repr__(self): - return '<{} shape={}>'.format(self.__class__.__name__, self._shape) + return '<{} shape={}>'.format(self.__class__.__name__, self.shape) @property def data(self): diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index ba79230..15663dc 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -25,8 +25,8 @@ class Lattice(BaseParser): """ def __init__(self, table_area=None, process_background=False, line_size_scaling=15, copy_text=None, shift_text=['l', 't'], - split_text=False, flag_size=True, line_close_tol=2, - joint_close_tol=2, blocksize=15, threshold_constant=-2, + split_text=False, flag_size=False, line_close_tol=2, + joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, iterations=0, margins=(1.0, 0.5, 0.1), debug=None): self.table_area = table_area self.process_background = process_background @@ -37,7 +37,7 @@ class Lattice(BaseParser): self.flag_size = flag_size self.line_close_tol = line_close_tol self.joint_close_tol = joint_close_tol - self.blocksize = blocksize + self.threshold_blocksize = threshold_blocksize self.threshold_constant = threshold_constant self.iterations = iterations self.char_margin, self.line_margin, self.word_margin = margins @@ -98,7 +98,7 @@ class Lattice(BaseParser): def _generate_table_bbox(self): self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background, - blocksize=self.blocksize, c=self.threshold_constant) + blocksize=self.threshold_blocksize, c=self.threshold_constant) image_width = self.image.shape[1] image_height = self.image.shape[0] image_width_scaler = image_width / float(self.pdf_width) @@ -173,10 +173,10 @@ class Lattice(BaseParser): table = Table(cols, rows) # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol) - # set spanning cells to True - table = table.set_span() # set table border edges to True table = table.set_border() + # set spanning cells to True + table = table.set_span() pos_errors = [] for direction in self.t_bbox: diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 12f4b6b..e1a4980 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -19,7 +19,7 @@ class Stream(BaseParser): """ def __init__(self, table_area=None, columns=None, split_text=False, - flag_size=True, row_close_tol=2, col_close_tol=0, + flag_size=False, row_close_tol=2, col_close_tol=0, margins=(1.0, 0.5, 0.1), debug=None): self.table_area = table_area self.columns = columns diff --git a/setup.py b/setup.py index 1d1d4ec..b496b51 100644 --- a/setup.py +++ b/setup.py @@ -48,10 +48,8 @@ def setup_package(): author=AUTHOR, author_email=AUTHOR_EMAIL, license=LICENSE, - keywords='parse scrape pdf table', packages=['camelot'], - install_requires=reqs, - scripts=['tools/camelot']) + install_requires=reqs) try: from setuptools import setup diff --git a/tests/files/left_rotated_table_1.pdf b/tests/files/anticlockwise_table_1.pdf similarity index 100% rename from tests/files/left_rotated_table_1.pdf rename to tests/files/anticlockwise_table_1.pdf diff --git a/tests/files/left_rotated_table_2.pdf b/tests/files/anticlockwise_table_2.pdf similarity index 100% rename from tests/files/left_rotated_table_2.pdf rename to tests/files/anticlockwise_table_2.pdf diff --git a/tests/files/lines_in_background_1.pdf b/tests/files/background_lines_1.pdf similarity index 100% rename from tests/files/lines_in_background_1.pdf rename to tests/files/background_lines_1.pdf diff --git a/tests/files/lines_in_background_2.pdf b/tests/files/background_lines_2.pdf similarity index 100% rename from tests/files/lines_in_background_2.pdf rename to tests/files/background_lines_2.pdf diff --git a/tests/files/right_rotated_table_1.pdf b/tests/files/clockwise_table_1.pdf similarity index 100% rename from tests/files/right_rotated_table_1.pdf rename to tests/files/clockwise_table_1.pdf diff --git a/tests/files/right_rotated_table_2.pdf b/tests/files/clockwise_table_2.pdf similarity index 100% rename from tests/files/right_rotated_table_2.pdf rename to tests/files/clockwise_table_2.pdf diff --git a/tests/test_camelot.py b/tests/test_camelot.py deleted file mode 100644 index 14253a8..0000000 --- a/tests/test_camelot.py +++ /dev/null @@ -1,334 +0,0 @@ -import os - -from nose.tools import assert_equal - -from camelot.pdf import Pdf -from camelot.lattice import Lattice - - -testdir = os.path.dirname(os.path.abspath(__file__)) - - -def test_stream_basic(): - - data = [ - ["", "Table 6.", ""], - ["", "U.S. Production, Imports, Exports, and Net Supply of Conventional Pesticides", ""], - ["", "at Producer Level, 1994/95 Estimates.", ""], - ["", "Active Ingredient", "Sales Value"], - ["", "(in billions of lbs.)", "(in billions of dollars)"], - ["Category", "1994/95", "1994/95"], - ["U.S. Production", "1.3", "7.0"], - ["U.S. Imports", "0.2", "2.2"], - ["Total Supply", "1.5", "9.2"], - ["U.S. Exports", "0.5", "2.6"], - ["Net Supply/Usage", "1.0", "6.6"], - ["SOURCE:", "EPA estimates based on ACPA Surveys, Department of Commerce Publications, tabulations and other", ""], - ["sources.", "", ""], - ["16\xe2\x80\x9494/95 Pesticides Industry Sales And Usage", "", ""] - ] - - pdfname = os.path.join(testdir, "tabula_test_pdfs/us-024.pdf") - manager = Pdf(Stream(), pdfname, pagenos=[{"start": 1, "end": 1}], - clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_missing_value(): - - data = [ - ["Bhandara - Key Indicators","","","",""], - ["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""], - ["Indicators","TOTAL","RURAL","TOTAL","RURAL"], - ["Reported Prevalence of Morbidity","","","",""], - ["Any Injury .....................................................................................................................................","1.9","2.1","",""], - ["Acute Illness .................................................................................................................................","4.5","5.6","",""], - ["Chronic Illness ..............................................................................................................................","5.1","4.1","",""], - ["Reported Prevalence of Chronic Illness during last one year (%)","","","",""], - ["Disease of respiratory system ......................................................................................................","11.7","15.0","",""], - ["Disease of cardiovascular system ................................................................................................","8.9","9.3","",""], - ["Persons suffering from tuberculosis .............................................................................................","2.2","1.5","",""], - ["Anaemia Status by Haemoglobin Level14 (%)","","","",""], - ["Children (6-59 months) having anaemia ......................................................................................","68.5","71.9","",""], - ["Children (6-59 months) having severe anaemia ..........................................................................","6.7","9.4","",""], - ["Children (6-9 Years) having anaemia - Male ................................................................................","67.1","71.4","",""], - ["Children (6-9 Years) having severe anaemia - Male ....................................................................","4.4","2.4","",""], - ["Children (6-9 Years) having anaemia - Female ...........................................................................","52.4","48.8","",""], - ["Children (6-9 Years) having severe anaemia - Female ................................................................","1.2","0.0","",""], - ["Children (6-14 years) having anaemia - Male .............................................................................","50.8","62.5","",""], - ["Children (6-14 years) having severe anaemia - Male ..................................................................","3.7","3.6","",""], - ["Children (6-14 years) having anaemia - Female .........................................................................","48.3","50.0","",""], - ["Children (6-14 years) having severe anaemia - Female ..............................................................","4.3","6.1","",""], - ["Children (10-19 Years15) having anaemia - Male .........................................................................","37.9","51.2","",""], - ["Children (10-19 Years15) having severe anaemia - Male .............................................................","3.5","4.0","",""], - ["Children (10-19 Years15) having anaemia - Female .....................................................................","46.6","52.1","",""], - ["Children (10-19 Years15) having severe anaemia - Female .........................................................","6.4","6.5","",""], - ["Adolescents (15-19 years) having anaemia ................................................................................","39.4","46.5","",""], - ["Adolescents (15-19 years) having severe anaemia .....................................................................","5.4","5.1","",""], - ["Pregnant women (15-49 aged) having anaemia ..........................................................................","48.8","51.5","",""], - ["Pregnant women (15-49 aged) having severe anaemia ..............................................................","7.1","8.8","",""], - ["Women (15-49 aged) having anaemia .........................................................................................","45.2","51.7","",""], - ["Women (15-49 aged) having severe anaemia .............................................................................","4.8","5.9","",""], - ["Persons (20 years and above) having anaemia ...........................................................................","37.8","42.1","",""], - ["Persons (20 years and above) having Severe anaemia ..............................................................","4.6","4.8","",""], - ["Blood Sugar Level (age 18 years and above) (%)","","","",""], - ["Blood Sugar Level >140 mg/dl (high) ...........................................................................................","12.9","11.1","",""], - ["Blood Sugar Level >160 mg/dl (very high) ...................................................................................","7.0","5.1","",""], - ["Hypertension (age 18 years and above) (%)","","","",""], - ["Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg ) ..............................","23.8","22.8","",""], - ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], - ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], - ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""], - ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""], - ["4","","","",""] - ] - pdfname = os.path.join(testdir, "missing_values.pdf") - manager = Pdf(Stream(flag_size=False), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_single_table_area(): - - data = [ - ["","One Withholding"], - ["Payroll Period","Allowance"], - ["Weekly","$71.15"], - ["Biweekly","142.31"], - ["Semimonthly","154.17"], - ["Monthly","308.33"], - ["Quarterly","925.00"], - ["Semiannually","1,850.00"], - ["Annually","3,700.00"], - ["Daily or Miscellaneous","14.23"], - ["(each day of the payroll period)",""] - ] - pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf") - manager = Pdf(Stream(table_area=["320,500,573,335"]), - pdfname, pagenos=[{"start": 1, "end": 1}], clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_columns(): - - data = [ - ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], - ["Entidad","","Municipio","","Localidad",""], - ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], - ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], - ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], - ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], - ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], - ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], - ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], - ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], - ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], - ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], - ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], - ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], - ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"], - ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"], - ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"], - ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"], - ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"], - ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"], - ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"], - ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"], - ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"], - ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"], - ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"], - ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"], - ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"], - ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"], - ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"], - ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"], - ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"], - ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"], - ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"], - ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"], - ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"], - ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"], - ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"], - ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"], - ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], - ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], - ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], - ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], - ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], - ] - pdfname = os.path.join(testdir, "mexican_towns.pdf") - manager = Pdf(Stream(columns=["67,180,230,425,475"], ytol=[10]), pdfname, - clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_stream_table_rotation(): - - data = [ - ["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""], - ["","","","","","","Modern method","","","","","","","Traditional method","","","",""], - ["","","","Any","","","","","","","Other","Any","","","","Not","","Number"], - ["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"], - ["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"], - ["","Caste/tribe","","","","","","","","","","","","","","","",""], - ["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"], - ["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"], - ["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"], - ["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"], - ["","Wealth index","","","","","","","","","","","","","","","",""], - ["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"], - ["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"], - ["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"], - ["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"], - ["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"], - ["","Number of living children","","","","","","","","","","","","","","","",""], - ["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"], - ["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"], - ["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"], - ["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"], - ["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"], - ["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"], - ["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"], - ["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"], - ["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"], - ["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"], - ["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"], - ["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"], - ["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"], - ["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"], - ["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"], - ["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"], - ["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""], - ["","not shown separately.","","","","","","","","","","","","","","","",""], - ["","na = Not available","","","","","","","","","","","","","","","",""], - ["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""], - ["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""], - ["","","","","","","","","54","","","","","","","","",""] - ] - pdfname = os.path.join(testdir, "left_rotated_table_2.pdf") - manager = Pdf(Stream(flag_size=False), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - pdfname = os.path.join(testdir, "right_rotated_table_2.pdf") - manager = Pdf(Stream(flag_size=False), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables["page-1"]["table-1"]["data"], data) - - -def test_lattice_basic(): - - data = [ - ["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""], - ["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"], - ["2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%"], - ["2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%"], - ["4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%"], - ["2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"], - ["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"] - ] - pdfname = os.path.join(testdir, - "tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf") - manager = Pdf(Lattice(), pdfname, pagenos=[{'start': 2, 'end': 2}], - clean=True) - tables = manager.extract() - assert_equal(tables['page-2']['table-1']['data'], data) - - -def test_lattice_fill(): - - data = [ - ["Plan Type","County","Plan Name","Totals"], - ["GMC","Sacramento","Anthem Blue Cross","164,380"], - ["GMC","Sacramento","Health Net","126,547"], - ["GMC","Sacramento","Kaiser Foundation","74,620"], - ["GMC","Sacramento","Molina Healthcare","59,989"], - ["GMC","San Diego","Care 1st Health Plan","71,831"], - ["GMC","San Diego","Community Health Group","264,639"], - ["GMC","San Diego","Health Net","72,404"], - ["GMC","San Diego","Kaiser","50,415"], - ["GMC","San Diego","Molina Healthcare","206,430"], - ["GMC","Total GMC Enrollment","","1,091,255"], - ["COHS","Marin","Partnership Health Plan of CA","36,006"], - ["COHS","Mendocino","Partnership Health Plan of CA","37,243"], - ["COHS","Napa","Partnership Health Plan of CA","28,398"], - ["COHS","Solano","Partnership Health Plan of CA","113,220"], - ["COHS","Sonoma","Partnership Health Plan of CA","112,271"], - ["COHS","Yolo","Partnership Health Plan of CA","52,674"], - ["COHS","Del Norte","Partnership Health Plan of CA","11,242"], - ["COHS","Humboldt","Partnership Health Plan of CA","49,911"], - ["COHS","Lake","Partnership Health Plan of CA","29,149"], - ["COHS","Lassen","Partnership Health Plan of CA","7,360"], - ["COHS","Modoc","Partnership Health Plan of CA","2,940"], - ["COHS","Shasta","Partnership Health Plan of CA","61,763"], - ["COHS","Siskiyou","Partnership Health Plan of CA","16,715"], - ["COHS","Trinity","Partnership Health Plan of CA","4,542"], - ["COHS","Merced","Central California Alliance for Health","123,907"], - ["COHS","Monterey","Central California Alliance for Health","147,397"], - ["COHS","Santa Cruz","Central California Alliance for Health","69,458"], - ["COHS","Santa Barbara","CenCal","117,609"], - ["COHS","San Luis Obispo","CenCal","55,761"], - ["COHS","Orange","CalOptima","783,079"], - ["COHS","San Mateo","Health Plan of San Mateo","113,202"], - ["COHS","Ventura","Gold Coast Health Plan","202,217"], - ["COHS","Total COHS Enrollment","","2,176,064"], - ["Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022"], - ["PCCM","Los Angeles","AIDS Healthcare Foundation","828"], - ["PCCM","San Francisco","Family Mosaic","25"], - ["PCCM","Total PHP Enrollment","","853"], - ["All Models Total Enrollments","","","10,132,875"], - ["Source: Data Warehouse 12/14/15","","",""] - ] - pdfname = os.path.join(testdir, 'row_span_1.pdf') - manager = Pdf(Lattice(fill='v', scale=40), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-1']['data'], data) - - -def test_lattice_invert(): - - data = [ - ["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"], - ["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"], - ["Rajasthan","2.12.2009 to 19.12.2009","","","","","",""], - ["Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453"], - ["Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153"], - ["Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183"], - ["Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855"], - ["Total","","47","92","11.81","22,455","19,584","10,644"] - ] - pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') - manager = Pdf(Lattice(invert=True), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-2']['data'], data) - - -def test_lattice_table_rotation(): - - data = [ - ["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""], - ["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"], - ["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"], - ["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"], - ["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"], - ["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"], - ["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"], - ["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"], - ["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"], - ["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"], - ["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"], - ["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"], - ["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"] - ] - pdfname = os.path.join(testdir, 'left_rotated_table_1.pdf') - manager = Pdf(Lattice(), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-1']['data'], data) - - pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf') - manager = Pdf(Lattice(), pdfname, clean=True) - tables = manager.extract() - assert_equal(tables['page-1']['table-1']['data'], data) \ No newline at end of file diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000..52f966a --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,80 @@ +import os + +import pandas as pd + +import camelot + +from test_data import * + +testdir = os.path.dirname(os.path.abspath(__file__)) +testdir = os.path.join(testdir, "files") + + +def test_stream(): + pass + + +def test_stream_table_rotated(): + df = pd.DataFrame(data_stream_table_rotated) + + filename = os.path.join(testdir, "clockwise_table_2.pdf") + tables = camelot.read_pdf(filename) + assert df.equals(tables[0].df) + + filename = os.path.join(testdir, "anticlockwise_table_2.pdf") + tables = camelot.read_pdf(filename) + assert df.equals(tables[0].df) + + +def test_stream_table_area(): + df = pd.DataFrame(data_stream_table_area_single) + + filename = os.path.join(testdir, "tabula/us-007.pdf") + tables = camelot.read_pdf(filename, table_area=["320,500,573,335"]) + assert df.equals(tables[0].df) + + +def test_stream_columns(): + df = pd.DataFrame(data_stream_columns) + + filename = os.path.join(testdir, "mexican_towns.pdf") + tables = camelot.read_pdf( + filename, columns=["67,180,230,425,475"], row_close_tol=10) + assert df.equals(tables[0].df) + + +def test_lattice(): + df = pd.DataFrame(data_lattice) + + filename = os.path.join(testdir, + "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf") + tables = camelot.read_pdf(filename, pages="2", mesh=True) + assert df.equals(tables[0].df) + + +def test_lattice_table_rotated(): + df = pd.DataFrame(data_lattice_table_rotated) + + filename = os.path.join(testdir, "clockwise_table_1.pdf") + tables = camelot.read_pdf(filename, mesh=True) + assert df.equals(tables[0].df) + + filename = os.path.join(testdir, "anticlockwise_table_1.pdf") + tables = camelot.read_pdf(filename, mesh=True) + assert df.equals(tables[0].df) + + +def test_lattice_process_background(): + df = pd.DataFrame(data_lattice_process_background) + + filename = os.path.join(testdir, "background_lines_1.pdf") + tables = camelot.read_pdf(filename, mesh=True, process_background=True) + assert df.equals(tables[1].df) + + +def test_lattice_copy_text(): + df = pd.DataFrame(data_lattice_copy_text) + + filename = os.path.join(testdir, "row_span_1.pdf") + tables = camelot.read_pdf(filename, mesh=True, line_size_scaling=60, copy_text="v") + assert df.equals(tables[0].df) \ No newline at end of file diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..3582937 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +data_stream_table_rotated = [ + ["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""], + ["","","","","","","Modern method","","","","","","","Traditional method","","","",""], + ["","","","Any","","","","","","","Other","Any","","","","Not","","Number"], + ["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"], + ["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"], + ["","Caste/tribe","","","","","","","","","","","","","","","",""], + ["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"], + ["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"], + ["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"], + ["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"], + ["","Wealth index","","","","","","","","","","","","","","","",""], + ["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"], + ["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"], + ["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"], + ["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"], + ["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"], + ["","Number of living children","","","","","","","","","","","","","","","",""], + ["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"], + ["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"], + ["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"], + ["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"], + ["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"], + ["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"], + ["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"], + ["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"], + ["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"], + ["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"], + ["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"], + ["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"], + ["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"], + ["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"], + ["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"], + ["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"], + ["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""], + ["","not shown separately.","","","","","","","","","","","","","","","",""], + ["","na = Not available","","","","","","","","","","","","","","","",""], + ["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""], + ["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""], + ["","","","","","","","","54","","","","","","","","",""] +] + + +data_stream_table_area_single = [ + ["","One Withholding"], + ["Payroll Period","Allowance"], + ["Weekly","$71.15"], + ["Biweekly","142.31"], + ["Semimonthly","154.17"], + ["Monthly","308.33"], + ["Quarterly","925.00"], + ["Semiannually","1,850.00"], + ["Annually","3,700.00"], + ["Daily or Miscellaneous","14.23"], + ["(each day of the payroll period)",""] +] + + +data_stream_columns = [ + ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], + ["Entidad","","Municipio","","Localidad",""], + ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], + ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], + ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], + ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], + ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], + ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], + ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], + ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], + ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], + ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], + ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], + ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"], + ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"], + ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"], + ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"], + ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"], + ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"], + ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"], + ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"], + ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"], + ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"], + ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"], + ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"], + ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"], + ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"], + ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"], + ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"], + ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"], + ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"], + ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"], + ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"], + ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"], + ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"], + ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"], + ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"], + ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], + ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], + ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], + ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], +] + + +data_lattice = [ + ["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""], + ["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"], + ["2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%"], + ["2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%"], + ["4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%"], + ["2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"], + ["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"] +] + + +data_lattice_table_rotated = [ + ["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""], + ["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"], + ["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"], + ["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"], + ["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"], + ["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"], + ["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"], + ["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"], + ["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"], + ["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"], + ["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"], + ["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"], + ["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"] +] + + +data_lattice_process_background = [ + ["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"], + ["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"], + ["Rajasthan","2.12.2009 to 19.12.2009","","","","","",""], + ["Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453"], + ["Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153"], + ["Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183"], + ["Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855"], + ["Total","","47","92","11.81","22,455","19,584","10,644"] +] + + +data_lattice_copy_text = [ + ["Plan Type","County","Plan Name","Totals"], + ["GMC","Sacramento","Anthem Blue Cross","164,380"], + ["GMC","Sacramento","Health Net","126,547"], + ["GMC","Sacramento","Kaiser Foundation","74,620"], + ["GMC","Sacramento","Molina Healthcare","59,989"], + ["GMC","San Diego","Care 1st Health Plan","71,831"], + ["GMC","San Diego","Community Health Group","264,639"], + ["GMC","San Diego","Health Net","72,404"], + ["GMC","San Diego","Kaiser","50,415"], + ["GMC","San Diego","Molina Healthcare","206,430"], + ["GMC","Total GMC Enrollment","","1,091,255"], + ["COHS","Marin","Partnership Health Plan of CA","36,006"], + ["COHS","Mendocino","Partnership Health Plan of CA","37,243"], + ["COHS","Napa","Partnership Health Plan of CA","28,398"], + ["COHS","Solano","Partnership Health Plan of CA","113,220"], + ["COHS","Sonoma","Partnership Health Plan of CA","112,271"], + ["COHS","Yolo","Partnership Health Plan of CA","52,674"], + ["COHS","Del Norte","Partnership Health Plan of CA","11,242"], + ["COHS","Humboldt","Partnership Health Plan of CA","49,911"], + ["COHS","Lake","Partnership Health Plan of CA","29,149"], + ["COHS","Lassen","Partnership Health Plan of CA","7,360"], + ["COHS","Modoc","Partnership Health Plan of CA","2,940"], + ["COHS","Shasta","Partnership Health Plan of CA","61,763"], + ["COHS","Siskiyou","Partnership Health Plan of CA","16,715"], + ["COHS","Trinity","Partnership Health Plan of CA","4,542"], + ["COHS","Merced","Central California Alliance for Health","123,907"], + ["COHS","Monterey","Central California Alliance for Health","147,397"], + ["COHS","Santa Cruz","Central California Alliance for Health","69,458"], + ["COHS","Santa Barbara","CenCal","117,609"], + ["COHS","San Luis Obispo","CenCal","55,761"], + ["COHS","Orange","CalOptima","783,079"], + ["COHS","San Mateo","Health Plan of San Mateo","113,202"], + ["COHS","Ventura","Gold Coast Health Plan","202,217"], + ["COHS","Total COHS Enrollment","","2,176,064"], + ["Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022"], + ["PCCM","Los Angeles","AIDS Healthcare Foundation","828"], + ["PCCM","San Francisco","Family Mosaic","25"], + ["PCCM","Total PHP Enrollment","","853"], + ["All Models Total Enrollments","","","10,132,875"], + ["Source: Data Warehouse 12/14/15","","",""] +] \ No newline at end of file