Port tests

pull/2/head
Vinayak Mehta 2018-09-09 05:29:24 +05:30
parent c4d3cac4fb
commit 7c3e531b07
14 changed files with 281 additions and 346 deletions

2
.gitignore vendored
View File

@ -6,3 +6,5 @@ build/
dist/
*.egg-info/
.coverage
.pytest_cache/

View File

@ -29,7 +29,7 @@ class Cell(object):
self._text = ''
def __repr__(self):
return '<Cell x1={} y1={} x2={} y2={}'.format(
return '<Cell x1={} y1={} x2={} y2={}>'.format(
self.x1, self.y1, self.x2, self.y2)
@property
@ -80,7 +80,7 @@ class Table(object):
self.page = None
def __repr__(self):
return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
return '<{} shape={}>'.format(self.__class__.__name__, self.shape)
@property
def data(self):

View File

@ -25,8 +25,8 @@ class Lattice(BaseParser):
"""
def __init__(self, table_area=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=True, line_close_tol=2,
joint_close_tol=2, blocksize=15, threshold_constant=-2,
split_text=False, flag_size=False, line_close_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, margins=(1.0, 0.5, 0.1), debug=None):
self.table_area = table_area
self.process_background = process_background
@ -37,7 +37,7 @@ class Lattice(BaseParser):
self.flag_size = flag_size
self.line_close_tol = line_close_tol
self.joint_close_tol = joint_close_tol
self.blocksize = blocksize
self.threshold_blocksize = threshold_blocksize
self.threshold_constant = threshold_constant
self.iterations = iterations
self.char_margin, self.line_margin, self.word_margin = margins
@ -98,7 +98,7 @@ class Lattice(BaseParser):
def _generate_table_bbox(self):
self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background,
blocksize=self.blocksize, c=self.threshold_constant)
blocksize=self.threshold_blocksize, c=self.threshold_constant)
image_width = self.image.shape[1]
image_height = self.image.shape[0]
image_width_scaler = image_width / float(self.pdf_width)
@ -173,10 +173,10 @@ class Lattice(BaseParser):
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol)
# set spanning cells to True
table = table.set_span()
# set table border edges to True
table = table.set_border()
# set spanning cells to True
table = table.set_span()
pos_errors = []
for direction in self.t_bbox:

View File

@ -19,7 +19,7 @@ class Stream(BaseParser):
"""
def __init__(self, table_area=None, columns=None, split_text=False,
flag_size=True, row_close_tol=2, col_close_tol=0,
flag_size=False, row_close_tol=2, col_close_tol=0,
margins=(1.0, 0.5, 0.1), debug=None):
self.table_area = table_area
self.columns = columns

View File

@ -48,10 +48,8 @@ def setup_package():
author=AUTHOR,
author_email=AUTHOR_EMAIL,
license=LICENSE,
keywords='parse scrape pdf table',
packages=['camelot'],
install_requires=reqs,
scripts=['tools/camelot'])
install_requires=reqs)
try:
from setuptools import setup

View File

@ -1,334 +0,0 @@
import os
from nose.tools import assert_equal
from camelot.pdf import Pdf
from camelot.lattice import Lattice
testdir = os.path.dirname(os.path.abspath(__file__))
def test_stream_basic():
data = [
["", "Table 6.", ""],
["", "U.S. Production, Imports, Exports, and Net Supply of Conventional Pesticides", ""],
["", "at Producer Level, 1994/95 Estimates.", ""],
["", "Active Ingredient", "Sales Value"],
["", "(in billions of lbs.)", "(in billions of dollars)"],
["Category", "1994/95", "1994/95"],
["U.S. Production", "1.3", "7.0"],
["U.S. Imports", "0.2", "2.2"],
["Total Supply", "1.5", "9.2"],
["U.S. Exports", "0.5", "2.6"],
["Net Supply/Usage", "1.0", "6.6"],
["SOURCE:", "EPA estimates based on ACPA Surveys, Department of Commerce Publications, tabulations and other", ""],
["sources.", "", ""],
["16\xe2\x80\x9494/95 Pesticides Industry Sales And Usage", "", ""]
]
pdfname = os.path.join(testdir, "tabula_test_pdfs/us-024.pdf")
manager = Pdf(Stream(), pdfname, pagenos=[{"start": 1, "end": 1}],
clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
def test_stream_missing_value():
data = [
["Bhandara - Key Indicators","","","",""],
["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
["Reported Prevalence of Morbidity","","","",""],
["Any Injury .....................................................................................................................................","1.9","2.1","",""],
["Acute Illness .................................................................................................................................","4.5","5.6","",""],
["Chronic Illness ..............................................................................................................................","5.1","4.1","",""],
["Reported Prevalence of Chronic Illness during last one year (%)","","","",""],
["Disease of respiratory system ......................................................................................................","11.7","15.0","",""],
["Disease of cardiovascular system ................................................................................................","8.9","9.3","",""],
["Persons suffering from tuberculosis .............................................................................................","2.2","1.5","",""],
["Anaemia Status by Haemoglobin Level14 (%)","","","",""],
["Children (6-59 months) having anaemia ......................................................................................","68.5","71.9","",""],
["Children (6-59 months) having severe anaemia ..........................................................................","6.7","9.4","",""],
["Children (6-9 Years) having anaemia - Male ................................................................................","67.1","71.4","",""],
["Children (6-9 Years) having severe anaemia - Male ....................................................................","4.4","2.4","",""],
["Children (6-9 Years) having anaemia - Female ...........................................................................","52.4","48.8","",""],
["Children (6-9 Years) having severe anaemia - Female ................................................................","1.2","0.0","",""],
["Children (6-14 years) having anaemia - Male .............................................................................","50.8","62.5","",""],
["Children (6-14 years) having severe anaemia - Male ..................................................................","3.7","3.6","",""],
["Children (6-14 years) having anaemia - Female .........................................................................","48.3","50.0","",""],
["Children (6-14 years) having severe anaemia - Female ..............................................................","4.3","6.1","",""],
["Children (10-19 Years15) having anaemia - Male .........................................................................","37.9","51.2","",""],
["Children (10-19 Years15) having severe anaemia - Male .............................................................","3.5","4.0","",""],
["Children (10-19 Years15) having anaemia - Female .....................................................................","46.6","52.1","",""],
["Children (10-19 Years15) having severe anaemia - Female .........................................................","6.4","6.5","",""],
["Adolescents (15-19 years) having anaemia ................................................................................","39.4","46.5","",""],
["Adolescents (15-19 years) having severe anaemia .....................................................................","5.4","5.1","",""],
["Pregnant women (15-49 aged) having anaemia ..........................................................................","48.8","51.5","",""],
["Pregnant women (15-49 aged) having severe anaemia ..............................................................","7.1","8.8","",""],
["Women (15-49 aged) having anaemia .........................................................................................","45.2","51.7","",""],
["Women (15-49 aged) having severe anaemia .............................................................................","4.8","5.9","",""],
["Persons (20 years and above) having anaemia ...........................................................................","37.8","42.1","",""],
["Persons (20 years and above) having Severe anaemia ..............................................................","4.6","4.8","",""],
["Blood Sugar Level (age 18 years and above) (%)","","","",""],
["Blood Sugar Level >140 mg/dl (high) ...........................................................................................","12.9","11.1","",""],
["Blood Sugar Level >160 mg/dl (very high) ...................................................................................","7.0","5.1","",""],
["Hypertension (age 18 years and above) (%)","","","",""],
["Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg ) ..............................","23.8","22.8","",""],
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
["4","","","",""]
]
pdfname = os.path.join(testdir, "missing_values.pdf")
manager = Pdf(Stream(flag_size=False), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
def test_stream_single_table_area():
data = [
["","One Withholding"],
["Payroll Period","Allowance"],
["Weekly","$71.15"],
["Biweekly","142.31"],
["Semimonthly","154.17"],
["Monthly","308.33"],
["Quarterly","925.00"],
["Semiannually","1,850.00"],
["Annually","3,700.00"],
["Daily or Miscellaneous","14.23"],
["(each day of the payroll period)",""]
]
pdfname = os.path.join(testdir, "tabula_test_pdfs/us-007.pdf")
manager = Pdf(Stream(table_area=["320,500,573,335"]),
pdfname, pagenos=[{"start": 1, "end": 1}], clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
def test_stream_columns():
data = [
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
["Entidad","","Municipio","","Localidad",""],
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
]
pdfname = os.path.join(testdir, "mexican_towns.pdf")
manager = Pdf(Stream(columns=["67,180,230,425,475"], ytol=[10]), pdfname,
clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
def test_stream_table_rotation():
data = [
["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""],
["","","","","","","Modern method","","","","","","","Traditional method","","","",""],
["","","","Any","","","","","","","Other","Any","","","","Not","","Number"],
["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
["","Caste/tribe","","","","","","","","","","","","","","","",""],
["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
["","Wealth index","","","","","","","","","","","","","","","",""],
["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
["","Number of living children","","","","","","","","","","","","","","","",""],
["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
["","not shown separately.","","","","","","","","","","","","","","","",""],
["","na = Not available","","","","","","","","","","","","","","","",""],
["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
["","","","","","","","","54","","","","","","","","",""]
]
pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
manager = Pdf(Stream(flag_size=False), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
pdfname = os.path.join(testdir, "right_rotated_table_2.pdf")
manager = Pdf(Stream(flag_size=False), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables["page-1"]["table-1"]["data"], data)
def test_lattice_basic():
data = [
["Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""],
["","","","Improved Speed","Decreased Accel","Eliminate Stops","Decreased Idle"],
["2012_2","3.30","1.3","5.9%","9.5%","29.2%","17.4%"],
["2145_1","0.68","11.2","2.4%","0.1%","9.5%","2.7%"],
["4234_1","0.59","58.7","8.5%","1.3%","8.5%","3.3%"],
["2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"],
["4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"]
]
pdfname = os.path.join(testdir,
"tabula_test_pdfs/icdar2013-dataset/competition-dataset-us/us-030.pdf")
manager = Pdf(Lattice(), pdfname, pagenos=[{'start': 2, 'end': 2}],
clean=True)
tables = manager.extract()
assert_equal(tables['page-2']['table-1']['data'], data)
def test_lattice_fill():
data = [
["Plan Type","County","Plan Name","Totals"],
["GMC","Sacramento","Anthem Blue Cross","164,380"],
["GMC","Sacramento","Health Net","126,547"],
["GMC","Sacramento","Kaiser Foundation","74,620"],
["GMC","Sacramento","Molina Healthcare","59,989"],
["GMC","San Diego","Care 1st Health Plan","71,831"],
["GMC","San Diego","Community Health Group","264,639"],
["GMC","San Diego","Health Net","72,404"],
["GMC","San Diego","Kaiser","50,415"],
["GMC","San Diego","Molina Healthcare","206,430"],
["GMC","Total GMC Enrollment","","1,091,255"],
["COHS","Marin","Partnership Health Plan of CA","36,006"],
["COHS","Mendocino","Partnership Health Plan of CA","37,243"],
["COHS","Napa","Partnership Health Plan of CA","28,398"],
["COHS","Solano","Partnership Health Plan of CA","113,220"],
["COHS","Sonoma","Partnership Health Plan of CA","112,271"],
["COHS","Yolo","Partnership Health Plan of CA","52,674"],
["COHS","Del Norte","Partnership Health Plan of CA","11,242"],
["COHS","Humboldt","Partnership Health Plan of CA","49,911"],
["COHS","Lake","Partnership Health Plan of CA","29,149"],
["COHS","Lassen","Partnership Health Plan of CA","7,360"],
["COHS","Modoc","Partnership Health Plan of CA","2,940"],
["COHS","Shasta","Partnership Health Plan of CA","61,763"],
["COHS","Siskiyou","Partnership Health Plan of CA","16,715"],
["COHS","Trinity","Partnership Health Plan of CA","4,542"],
["COHS","Merced","Central California Alliance for Health","123,907"],
["COHS","Monterey","Central California Alliance for Health","147,397"],
["COHS","Santa Cruz","Central California Alliance for Health","69,458"],
["COHS","Santa Barbara","CenCal","117,609"],
["COHS","San Luis Obispo","CenCal","55,761"],
["COHS","Orange","CalOptima","783,079"],
["COHS","San Mateo","Health Plan of San Mateo","113,202"],
["COHS","Ventura","Gold Coast Health Plan","202,217"],
["COHS","Total COHS Enrollment","","2,176,064"],
["Subtotal for Two-Plan, Regional Model, GMC and COHS","","","10,132,022"],
["PCCM","Los Angeles","AIDS Healthcare Foundation","828"],
["PCCM","San Francisco","Family Mosaic","25"],
["PCCM","Total PHP Enrollment","","853"],
["All Models Total Enrollments","","","10,132,875"],
["Source: Data Warehouse 12/14/15","","",""]
]
pdfname = os.path.join(testdir, 'row_span_1.pdf')
manager = Pdf(Lattice(fill='v', scale=40), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-1']['data'], data)
def test_lattice_invert():
data = [
["State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"],
["Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000"],
["Rajasthan","2.12.2009 to 19.12.2009","","","","","",""],
["Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453"],
["Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153"],
["Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183"],
["Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855"],
["Total","","47","92","11.81","22,455","19,584","10,644"]
]
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
manager = Pdf(Lattice(invert=True), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-2']['data'], data)
def test_lattice_table_rotation():
data = [
["State","Nutritional Assessment (No. of individuals)","","","","IYCF Practices (No. of mothers: 2011-12)","Blood Pressure (No. of adults: 2011-12)","","Fasting Blood Sugar (No. of adults:2011-12)",""],
["","1975-79","1988-90","1996-97","2011-12","","Men","Women","Men","Women"],
["Kerala","5738","6633","8864","8297","245","2161","3195","1645","2391"],
["Tamil Nadu","7387","10217","5813","7851","413","2134","2858","1119","1739"],
["Karnataka","6453","8138","12606","8958","428","2467","2894","1628","2028"],
["Andhra Pradesh","5844","9920","9545","8300","557","1899","2493","1111","1529"],
["Maharashtra","5161","7796","6883","9525","467","2368","2648","1417","1599"],
["Gujarat","4403","5374","4866","9645","477","2687","3021","2122","2503"],
["Madhya Pradesh","*","*","*","7942","470","1965","2150","1579","1709"],
["Orissa","3756","5540","12024","8473","398","2040","2624","1093","1628"],
["West Bengal","*","*","*","8047","423","2058","2743","1413","2027"],
["Uttar Pradesh","*","*","*","9860","581","2139","2415","1185","1366"],
["Pooled","38742","53618","60601","86898","4459","21918","27041","14312","18519"]
]
pdfname = os.path.join(testdir, 'left_rotated_table_1.pdf')
manager = Pdf(Lattice(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-1']['data'], data)
pdfname = os.path.join(testdir, 'right_rotated_table_1.pdf')
manager = Pdf(Lattice(), pdfname, clean=True)
tables = manager.extract()
assert_equal(tables['page-1']['table-1']['data'], data)

View File

@ -0,0 +1,80 @@
import os
import pandas as pd
import camelot
from test_data import *
# Absolute path of the "files" directory that sits next to this test module;
# every test below resolves its fixture PDF relative to it.
testdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
def test_stream():
    """Placeholder test: it performs no checks yet and always passes."""
def test_stream_table_rotated():
    """Check the clockwise and anticlockwise "table_2" fixtures.

    Each PDF is read with ``camelot.read_pdf`` using its default arguments,
    and the first table returned must equal the frame built from
    ``data_stream_table_rotated``.
    """
    expected = pd.DataFrame(data_stream_table_rotated)

    # Same expectation for both fixtures; clockwise is checked first.
    for pdf_name in ("clockwise_table_2.pdf", "anticlockwise_table_2.pdf"):
        parsed = camelot.read_pdf(os.path.join(testdir, pdf_name))
        assert expected.equals(parsed[0].df)
def test_stream_table_area():
    """Read ``tabula/us-007.pdf`` restricted to a single ``table_area``.

    The first table returned must equal the frame built from
    ``data_stream_table_area_single``.
    """
    expected = pd.DataFrame(data_stream_table_area_single)
    pdf_path = os.path.join(testdir, "tabula/us-007.pdf")
    area = ["320,500,573,335"]

    parsed = camelot.read_pdf(pdf_path, table_area=area)
    assert expected.equals(parsed[0].df)
def test_stream_columns():
    """Read ``mexican_towns.pdf`` with explicit ``columns`` and ``row_close_tol``.

    The first table returned must equal the frame built from
    ``data_stream_columns``.
    """
    expected = pd.DataFrame(data_stream_columns)
    pdf_path = os.path.join(testdir, "mexican_towns.pdf")

    # Keyword arguments forwarded unchanged to read_pdf.
    options = {"columns": ["67,180,230,425,475"], "row_close_tol": 10}
    parsed = camelot.read_pdf(pdf_path, **options)

    assert expected.equals(parsed[0].df)
def test_lattice():
    """Read page 2 of the ICDAR ``us-030.pdf`` fixture with ``mesh=True``.

    The first table returned must equal the frame built from ``data_lattice``.
    """
    expected = pd.DataFrame(data_lattice)
    relative_pdf = "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
    pdf_path = os.path.join(testdir, relative_pdf)

    parsed = camelot.read_pdf(pdf_path, pages="2", mesh=True)
    assert expected.equals(parsed[0].df)
def test_lattice_table_rotated():
    """Check the clockwise and anticlockwise "table_1" fixtures with ``mesh=True``.

    For each PDF, the first table returned must equal the frame built from
    ``data_lattice_table_rotated``.
    """
    expected = pd.DataFrame(data_lattice_table_rotated)

    # Same expectation for both fixtures; clockwise is checked first.
    for pdf_name in ("clockwise_table_1.pdf", "anticlockwise_table_1.pdf"):
        pdf_path = os.path.join(testdir, pdf_name)
        parsed = camelot.read_pdf(pdf_path, mesh=True)
        assert expected.equals(parsed[0].df)
def test_lattice_process_background():
    """Read ``background_lines_1.pdf`` with ``mesh=True, process_background=True``.

    The table compared is the second one returned (index 1); it must equal
    the frame built from ``data_lattice_process_background``.
    """
    expected = pd.DataFrame(data_lattice_process_background)
    pdf_path = os.path.join(testdir, "background_lines_1.pdf")

    parsed = camelot.read_pdf(pdf_path, mesh=True, process_background=True)
    second_table = parsed[1]
    assert expected.equals(second_table.df)
def test_lattice_copy_text():
    """Read ``row_span_1.pdf`` with ``mesh=True``, ``line_size_scaling=60``, ``copy_text="v"``.

    The first table returned must equal the frame built from
    ``data_lattice_copy_text``.
    """
    expected = pd.DataFrame(data_lattice_copy_text)
    pdf_path = os.path.join(testdir, "row_span_1.pdf")

    parsed = camelot.read_pdf(
        pdf_path,
        mesh=True,
        line_size_scaling=60,
        copy_text="v",
    )
    assert expected.equals(parsed[0].df)

189
tests/test_data.py 100644
View File

@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
# Expected cell contents, row by row, for test_stream_table_rotated: the first
# table read from clockwise_table_2.pdf and from anticlockwise_table_2.pdf is
# compared against a DataFrame built from this list.
# Every cell is a string; empty strings mark cells with no text. The position
# of each empty string matters, so do not reflow or "tidy" these rows.
data_stream_table_rotated = [
["","","Table 21 Current use of contraception by background characteristics—Continued","","","","","","","","","","","","","","",""],
["","","","","","","Modern method","","","","","","","Traditional method","","","",""],
["","","","Any","","","","","","","Other","Any","","","","Not","","Number"],
["","","Any","modern","Female","Male","","","","Condom/","modern","traditional","","With-","Folk","currently","","of"],
["","Background characteristic","method","method","sterilization","sterilization","Pill","IUD","Injectables","Nirodh","method","method","Rhythm","drawal","method","using","Total","women"],
["","Caste/tribe","","","","","","","","","","","","","","","",""],
["","Scheduled caste","74.8","55.8","42.9","0.9","9.7","0.0","0.2","2.2","0.0","19.0","11.2","7.4","0.4","25.2","100.0","1,363"],
["","Scheduled tribe","59.3","39.0","26.8","0.6","6.4","0.6","1.2","3.5","0.0","20.3","10.4","5.8","4.1","40.7","100.0","256"],
["","Other backward class","71.4","51.1","34.9","0.0","8.6","1.4","0.0","6.2","0.0","20.4","12.6","7.8","0.0","28.6","100.0","211"],
["","Other","71.1","48.8","28.2","0.8","13.3","0.9","0.3","5.2","0.1","22.3","12.9","9.1","0.3","28.9","100.0","3,319"],
["","Wealth index","","","","","","","","","","","","","","","",""],
["","Lowest","64.5","48.6","34.3","0.5","10.5","0.6","0.7","2.0","0.0","15.9","9.9","4.6","1.4","35.5","100.0","1,258"],
["","Second","68.5","50.4","36.2","1.1","11.4","0.5","0.1","1.1","0.0","18.1","11.2","6.7","0.2","31.5","100.0","1,317"],
["","Middle","75.5","52.8","33.6","0.6","14.2","0.4","0.5","3.4","0.1","22.7","13.4","8.9","0.4","24.5","100.0","1,018"],
["","Fourth","73.9","52.3","32.0","0.5","12.5","0.6","0.2","6.3","0.2","21.6","11.5","9.9","0.2","26.1","100.0","908"],
["","Highest","78.3","44.4","19.5","1.0","9.7","1.4","0.0","12.7","0.0","33.8","18.2","15.6","0.0","21.7","100.0","733"],
["","Number of living children","","","","","","","","","","","","","","","",""],
["","No children","25.1","7.6","0.3","0.5","2.0","0.0","0.0","4.8","0.0","17.5","9.0","8.5","0.0","74.9","100.0","563"],
["","1 child","66.5","32.1","3.7","0.7","20.1","0.7","0.1","6.9","0.0","34.3","18.9","15.2","0.3","33.5","100.0","1,190"],
["","1 son","66.8","33.2","4.1","0.7","21.1","0.5","0.3","6.6","0.0","33.5","21.2","12.3","0.0","33.2","100.0","672"],
["","No sons","66.1","30.7","3.1","0.6","18.8","0.8","0.0","7.3","0.0","35.4","15.8","19.0","0.6","33.9","100.0","517"],
["","2 children","81.6","60.5","41.8","0.9","11.6","0.8","0.3","4.8","0.2","21.1","12.2","8.3","0.6","18.4","100.0","1,576"],
["","1 or more sons","83.7","64.2","46.4","0.9","10.8","0.8","0.4","4.8","0.1","19.5","11.1","7.6","0.7","16.3","100.0","1,268"],
["","No sons","73.2","45.5","23.2","1.0","15.1","0.9","0.0","4.8","0.5","27.7","16.8","11.0","0.0","26.8","100.0","308"],
["","3 children","83.9","71.2","57.7","0.8","9.8","0.6","0.5","1.8","0.0","12.7","8.7","3.3","0.8","16.1","100.0","961"],
["","1 or more sons","85.0","73.2","60.3","0.9","9.4","0.5","0.5","1.6","0.0","11.8","8.1","3.0","0.7","15.0","100.0","860"],
["","No sons","74.7","53.8","35.3","0.0","13.7","1.6","0.0","3.2","0.0","20.9","13.4","6.1","1.5","25.3","100.0","101"],
["","4+ children","74.3","58.1","45.1","0.6","8.7","0.6","0.7","2.4","0.0","16.1","9.9","5.4","0.8","25.7","100.0","944"],
["","1 or more sons","73.9","58.2","46.0","0.7","8.3","0.7","0.7","1.9","0.0","15.7","9.4","5.5","0.8","26.1","100.0","901"],
["","No sons","(82.1)","(57.3)","(25.6)","(0.0)","(17.8)","(0.0)","(0.0)","(13.9)","(0.0)","(24.8)","(21.3)","(3.5)","(0.0)","(17.9)","100.0","43"],
["","Total","71.2","49.9","32.2","0.7","11.7","0.6","0.3","4.3","0.1","21.3","12.3","8.4","0.5","28.8","100.0","5,234"],
["","NFHS-2 (1998-99)","66.6","47.3","32.0","1.8","9.2","1.4","na","2.9","na","na","8.7","9.8","na","33.4","100.0","4,116"],
["","NFHS-1 (1992-93)","57.7","37.6","26.5","4.3","3.6","1.3","0.1","1.9","na","na","11.3","8.3","na","42.3","100.0","3,970"],
["","","Note: If more than one method is used, only the most effective method is considered in this tabulation. Total includes women for whom caste/tribe was not known or is missing, who are","","","","","","","","","","","","","","",""],
["","not shown separately.","","","","","","","","","","","","","","","",""],
["","na = Not available","","","","","","","","","","","","","","","",""],
["","","ns = Not shown; see table 2b, footnote 1","","","","","","","","","","","","","","",""],
["","( ) Based on 25-49 unweighted cases.","","","","","","","","","","","","","","","",""],
["","","","","","","","","54","","","","","","","","",""]
]
# Expected two-column rows for test_stream_table_area (tabula/us-007.pdf read
# with a single table_area). Every cell is a string; "" marks an empty cell.
data_stream_table_area_single = [
    ["", "One Withholding"],
    ["Payroll Period", "Allowance"],
    ["Weekly", "$71.15"],
    ["Biweekly", "142.31"],
    ["Semimonthly", "154.17"],
    ["Monthly", "308.33"],
    ["Quarterly", "925.00"],
    ["Semiannually", "1,850.00"],
    ["Annually", "3,700.00"],
    ["Daily or Miscellaneous", "14.23"],
    ["(each day of the payroll period)", ""],
]
# Expected six-column rows for test_stream_columns (mexican_towns.pdf read with
# explicit column separators). The first two rows are the two-line header;
# every cell is a string and "" marks an empty cell.
data_stream_columns = [
    ["Clave", "Nombre Entidad", "Clave", "Nombre Municipio", "Clave", "Nombre Localidad"],
    ["Entidad", "", "Municipio", "", "Localidad", ""],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0102", "Los Arbolitos [Rancho]"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0104", "Ardillas de Abajo (Las Ardillas)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0106", "Arellano"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0112", "Bajío los Vázquez"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0113", "Bajío de Montoro"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0114", "Residencial San Nicolás [Baños la Cantera]"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0120", "Buenavista de Peñuelas"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0121", "Cabecita 3 Marías (Rancho Nuevo)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0125", "Cañada Grande de Cotorina"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0126", "Cañada Honda [Estación]"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0127", "Los Caños"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0128", "El Cariñán"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0129", "El Carmen [Granja]"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0135", "El Cedazo (Cedazo de San Antonio)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0138", "Centro de Arriba (El Taray)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0139", "Cieneguilla (La Lumbrera)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0141", "Cobos"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0144", "El Colorado (El Soyatal)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0146", "El Conejal"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0157", "Cotorina de Abajo"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0162", "Coyotes"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0166", "La Huerta (La Cruz)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0170", "Cuauhtémoc (Las Palomas)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0171", "Los Cuervos (Los Ojos de Agua)"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0172", "San José [Granja]"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0176", "La Chiripa"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0182", "Dolores"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0183", "Los Dolores"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0190", "El Duraznillo"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0191", "Los Durón"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0197", "La Escondida"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0201", "Brande Vin [Bodegas]"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0207", "Valle Redondo"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0209", "La Fortuna"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0212", "Lomas del Gachupín"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0213", "El Carmen (Gallinas Güeras) [Rancho]"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0216", "La Gloria"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0226", "Hacienda Nueva"],
]
# Expected seven-column rows for test_lattice (page 2 of us-030.pdf read with
# mesh=True). The first two rows form the header; "" marks an empty cell.
data_lattice = [
    ["Cycle Name", "KI (1/km)", "Distance (mi)", "Percent Fuel Savings", "", "", ""],
    ["", "", "", "Improved Speed", "Decreased Accel", "Eliminate Stops", "Decreased Idle"],
    ["2012_2", "3.30", "1.3", "5.9%", "9.5%", "29.2%", "17.4%"],
    ["2145_1", "0.68", "11.2", "2.4%", "0.1%", "9.5%", "2.7%"],
    ["4234_1", "0.59", "58.7", "8.5%", "1.3%", "8.5%", "3.3%"],
    ["2032_2", "0.17", "57.8", "21.7%", "0.3%", "2.7%", "1.2%"],
    ["4171_1", "0.07", "173.9", "58.1%", "1.6%", "2.1%", "0.5%"],
]
# Expected ten-column rows for test_lattice_table_rotated (clockwise_table_1.pdf
# and anticlockwise_table_1.pdf read with mesh=True). The first two rows form
# the header; every cell is a string and "" marks an empty cell.
data_lattice_table_rotated = [
    [
        "State",
        "Nutritional Assessment (No. of individuals)",
        "",
        "",
        "",
        "IYCF Practices (No. of mothers: 2011-12)",
        "Blood Pressure (No. of adults: 2011-12)",
        "",
        "Fasting Blood Sugar (No. of adults:2011-12)",
        "",
    ],
    ["", "1975-79", "1988-90", "1996-97", "2011-12", "", "Men", "Women", "Men", "Women"],
    ["Kerala", "5738", "6633", "8864", "8297", "245", "2161", "3195", "1645", "2391"],
    ["Tamil Nadu", "7387", "10217", "5813", "7851", "413", "2134", "2858", "1119", "1739"],
    ["Karnataka", "6453", "8138", "12606", "8958", "428", "2467", "2894", "1628", "2028"],
    ["Andhra Pradesh", "5844", "9920", "9545", "8300", "557", "1899", "2493", "1111", "1529"],
    ["Maharashtra", "5161", "7796", "6883", "9525", "467", "2368", "2648", "1417", "1599"],
    ["Gujarat", "4403", "5374", "4866", "9645", "477", "2687", "3021", "2122", "2503"],
    ["Madhya Pradesh", "*", "*", "*", "7942", "470", "1965", "2150", "1579", "1709"],
    ["Orissa", "3756", "5540", "12024", "8473", "398", "2040", "2624", "1093", "1628"],
    ["West Bengal", "*", "*", "*", "8047", "423", "2058", "2743", "1413", "2027"],
    ["Uttar Pradesh", "*", "*", "*", "9860", "581", "2139", "2415", "1185", "1366"],
    ["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"],
]
# Expected eight-column rows for test_lattice_process_background: the table at
# index 1 read from background_lines_1.pdf with mesh=True and
# process_background=True. Header strings are kept exactly as listed here,
# including the missing spaces in "reached(in lakh)" and "testedfor".
data_lattice_process_background = [
    [
        "State",
        "Date",
        "Halt stations",
        "Halt days",
        "Persons directly reached(in lakh)",
        "Persons trained",
        "Persons counseled",
        "Persons testedfor HIV",
    ],
    ["Delhi", "1.12.2009", "8", "17", "1.29", "3,665", "2,409", "1,000"],
    ["Rajasthan", "2.12.2009 to 19.12.2009", "", "", "", "", "", ""],
    ["Gujarat", "20.12.2009 to 3.1.2010", "6", "13", "6.03", "3,810", "2,317", "1,453"],
    ["Maharashtra", "4.01.2010 to 1.2.2010", "13", "26", "1.27", "5,680", "9,027", "4,153"],
    ["Karnataka", "2.2.2010 to 22.2.2010", "11", "19", "1.80", "5,741", "3,658", "3,183"],
    ["Kerala", "23.2.2010 to 11.3.2010", "9", "17", "1.42", "3,559", "2,173", "855"],
    ["Total", "", "47", "92", "11.81", "22,455", "19,584", "10,644"],
]
# Expected four-column rows for test_lattice_copy_text (row_span_1.pdf read
# with mesh=True, line_size_scaling=60 and copy_text="v"). Every cell is a
# string and "" marks an empty cell.
data_lattice_copy_text = [
    ["Plan Type", "County", "Plan Name", "Totals"],
    ["GMC", "Sacramento", "Anthem Blue Cross", "164,380"],
    ["GMC", "Sacramento", "Health Net", "126,547"],
    ["GMC", "Sacramento", "Kaiser Foundation", "74,620"],
    ["GMC", "Sacramento", "Molina Healthcare", "59,989"],
    ["GMC", "San Diego", "Care 1st Health Plan", "71,831"],
    ["GMC", "San Diego", "Community Health Group", "264,639"],
    ["GMC", "San Diego", "Health Net", "72,404"],
    ["GMC", "San Diego", "Kaiser", "50,415"],
    ["GMC", "San Diego", "Molina Healthcare", "206,430"],
    ["GMC", "Total GMC Enrollment", "", "1,091,255"],
    ["COHS", "Marin", "Partnership Health Plan of CA", "36,006"],
    ["COHS", "Mendocino", "Partnership Health Plan of CA", "37,243"],
    ["COHS", "Napa", "Partnership Health Plan of CA", "28,398"],
    ["COHS", "Solano", "Partnership Health Plan of CA", "113,220"],
    ["COHS", "Sonoma", "Partnership Health Plan of CA", "112,271"],
    ["COHS", "Yolo", "Partnership Health Plan of CA", "52,674"],
    ["COHS", "Del Norte", "Partnership Health Plan of CA", "11,242"],
    ["COHS", "Humboldt", "Partnership Health Plan of CA", "49,911"],
    ["COHS", "Lake", "Partnership Health Plan of CA", "29,149"],
    ["COHS", "Lassen", "Partnership Health Plan of CA", "7,360"],
    ["COHS", "Modoc", "Partnership Health Plan of CA", "2,940"],
    ["COHS", "Shasta", "Partnership Health Plan of CA", "61,763"],
    ["COHS", "Siskiyou", "Partnership Health Plan of CA", "16,715"],
    ["COHS", "Trinity", "Partnership Health Plan of CA", "4,542"],
    ["COHS", "Merced", "Central California Alliance for Health", "123,907"],
    ["COHS", "Monterey", "Central California Alliance for Health", "147,397"],
    ["COHS", "Santa Cruz", "Central California Alliance for Health", "69,458"],
    ["COHS", "Santa Barbara", "CenCal", "117,609"],
    ["COHS", "San Luis Obispo", "CenCal", "55,761"],
    ["COHS", "Orange", "CalOptima", "783,079"],
    ["COHS", "San Mateo", "Health Plan of San Mateo", "113,202"],
    ["COHS", "Ventura", "Gold Coast Health Plan", "202,217"],
    ["COHS", "Total COHS Enrollment", "", "2,176,064"],
    ["Subtotal for Two-Plan, Regional Model, GMC and COHS", "", "", "10,132,022"],
    ["PCCM", "Los Angeles", "AIDS Healthcare Foundation", "828"],
    ["PCCM", "San Francisco", "Family Mosaic", "25"],
    ["PCCM", "Total PHP Enrollment", "", "853"],
    ["All Models Total Enrollments", "", "", "10,132,875"],
    ["Source: Data Warehouse 12/14/15", "", "", ""],
]