Remove ncolumns everywhere

pull/2/head
Vinayak Mehta 2017-03-01 19:53:48 +05:30
parent edcf770d93
commit 3651fb2347
3 changed files with 2 additions and 130 deletions

View File

@ -59,122 +59,6 @@ Let's run it on this pdf.
"","","","","Working..." "","","","","Working..."
"","","","","Febuary..." "","","","","Febuary..."
However, its guess can sometimes be incorrect, as in this case.
::
>>> from camelot.pdf import Pdf
>>> from camelot.stream import Stream
>>> manager = Pdf(Stream(), 'missing_values.pdf')
>>> tables = manager.extract()
>>> print tables['page-1']['table-1']['data']
.. .. _this: insert link for missing_values.pdf
.. csv-table::
"Bhandara...","",""
"","DLHS-4...","DLHS-3..."
"Indicators","TOTAL","RURAL TOTAL RURAL"
"Reported Prevalence of Morbidity","",""
"Any Injury...","1.9","2.1"
"Acute Illness...","4.5","5.6"
"Chronic Illness...","5.1","4.1"
"Reported Prevalence of Chronic Illness during last one year (%)","",""
"Disease of respiratory system...","11.7","15.0"
"Disease of cardiovascular system...","8.9","9.3"
"Persons suffering from tuberculosis...","2.2","1.5"
"Anaemia Status by Haemoglobin Level14 (%)","",""
"Children (6-59 months) having anaemia...","68.5","71.9"
"Children (6-59 months) having severe anaemia...","6.7","9.4"
"Children (6-9 Years) having anaemia - Male...","67.1","71.4"
"Children (6-9 Years) having severe anaemia - Male...","4.4","2.4"
"Children (6-9 Years) having anaemia - Female...","52.4","48.8"
"Children (6-9 Years) having severe anaemia - Female...","1.2","0.0"
"Children (6-14 years) having anaemia - Male...","50.8","62.5"
"Children (6-14 years) having severe anaemia - Male...","3.7","3.6"
"Children (6-14 years) having anaemia - Female...","48.3","50.0"
"Children (6-14 years) having severe anaemia - Female...","4.3","6.1"
"Children (10-19 Years15) having anaemia - Male...","37.9","51.2"
"Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0"
"Children (10-19 Years15) having anaemia - Female...","46.6","52.1"
"Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5"
"Adolescents (15-19 years) having anaemia...","39.4","46.5"
"Adolescents (15-19 years) having severe anaemia...","5.4","5.1"
"Pregnant women (15-49 aged) having anaemia...","48.8","51.5"
"Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8"
"Women (15-49 aged) having anaemia...","45.2","51.7"
"Women (15-49 aged) having severe anaemia...","4.8","5.9"
"Persons (20 years and above) having anaemia...","37.8","42.1"
"Persons (20 years and above) having Severe anaemia...","4.6","4.8"
"Blood Sugar Level (age 18 years and above) (%)","",""
"Blood Sugar Level >140 mg/dl (high)...","12.9","11.1"
"Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1"
"Hypertension (age 18 years and above) (%)","",""
"Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8"
"Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1"
"Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1"
"14...","",""
"Chronic...","",""
Stream guessed that the PDF has 3 columns, because most rows don't have any data in the last 2 columns. So, let's specify the number of columns explicitly; Stream will then only consider rows that have 5 words when deciding on column boundaries.
::
>>> from camelot.pdf import Pdf
>>> from camelot.stream import Stream
>>> manager = Pdf(Stream(ncolumns=[5]), 'missing_values.pdf')
>>> tables = manager.extract()
>>> print tables['page-1']['table-1']['data']
.. csv-table::
"Bhandara...","","","",""
"","DLHS-4...","DLHS-3...","",""
"Indicators","TOTAL","RURAL","TOTAL","RURAL"
"Reported Prevalence of Morbidity","","","",""
"Any Injury...","1.9","2.1","",""
"Acute Illness...","4.5","5.6","",""
"Chronic Illness...","5.1","4.1","",""
"Reported Prevalence of Chronic Illness during last one year (%)","","","",""
"Disease of respiratory system...","11.7","15.0","",""
"Disease of cardiovascular system...","8.9","9.3","",""
"Persons suffering from tuberculosis...","2.2","1.5","",""
"Anaemia Status by Haemoglobin Level14 (%)","","","",""
"Children (6-59 months) having anaemia...","68.5","71.9","",""
"Children (6-59 months) having severe anaemia...","6.7","9.4","",""
"Children (6-9 Years) having anaemia - Male...","67.1","71.4","",""
"Children (6-9 Years) having severe anaemia - Male...","4.4","2.4","",""
"Children (6-9 Years) having anaemia - Female...","52.4","48.8","",""
"Children (6-9 Years) having severe anaemia - Female...","1.2","0.0","",""
"Children (6-14 years) having anaemia - Male...","50.8","62.5","",""
"Children (6-14 years) having severe anaemia - Male...","3.7","3.6","",""
"Children (6-14 years) having anaemia - Female...","48.3","50.0","",""
"Children (6-14 years) having severe anaemia - Female...","4.3","6.1","",""
"Children (10-19 Years15) having anaemia - Male...","37.9","51.2","",""
"Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0","",""
"Children (10-19 Years15) having anaemia - Female...","46.6","52.1","",""
"Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5","",""
"Adolescents (15-19 years) having anaemia...","39.4","46.5","",""
"Adolescents (15-19 years) having severe anaemia...","5.4","5.1","",""
"Pregnant women (15-49 aged) having anaemia...","48.8","51.5","",""
"Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8","",""
"Women (15-49 aged) having anaemia...","45.2","51.7","",""
"Women (15-49 aged) having severe anaemia...","4.8","5.9","",""
"Persons (20 years and above) having anaemia...","37.8","42.1","",""
"Persons (20 years and above) having Severe anaemia...","4.6","4.8","",""
"Blood Sugar Level (age 18 years and above) (%)","","","",""
"Blood Sugar Level >140 mg/dl (high)...","12.9","11.1","",""
"Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1","",""
"Hypertension (age 18 years and above) (%)","","","",""
"Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8","",""
"Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1","",""
"Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1","",""
"14...","","","",""
"Chronic...","","","",""
We can also specify the column x-coordinates. We need to call Stream with debug=True and use matplotlib's interface to note down the column x-coordinates we need. Let's try it on this pdf file. We can also specify the column x-coordinates. We need to call Stream with debug=True and use matplotlib's interface to note down the column x-coordinates we need. Let's try it on this pdf file.
:: ::

View File

@ -1,8 +0,0 @@
from camelot import Pdf
from camelot import Stream
extractor = Stream(Pdf("files/missing_values.pdf",
char_margin=1.0, clean=True), ncolumns=5)
tables = extractor.get_tables()
print tables

View File

@ -443,16 +443,12 @@ if __name__ == '__main__':
try: try:
tarea = args['--tarea'] if args['--tarea'] else None tarea = args['--tarea'] if args['--tarea'] else None
columns = args['--columns'] if args['--columns'] else None columns = args['--columns'] if args['--columns'] else None
if args['--ncols'] and args['--ncols'] != ['-1']:
ncolumns = [int(nc) for nc in args['--ncols']]
else:
ncolumns = None
header = args['--header'] if args['--header'] else None header = args['--header'] if args['--header'] else None
ytol = [int(y) for y in args['--ytol']] ytol = [int(y) for y in args['--ytol']]
mtol = [int(m) for m in args['--mtol']] mtol = [int(m) for m in args['--mtol']]
manager = Pdf(Stream(table_area=tarea, columns=columns, manager = Pdf(Stream(table_area=tarea, columns=columns,
ncolumns=ncolumns, headers=header, ytol=ytol, headers=header, ytol=ytol, mtol=mtol,
mtol=mtol, margins=margins, split_text=args['--split_text'], margins=margins, split_text=args['--split_text'],
flag_size=args['--flag_size'], debug=args['--debug']), flag_size=args['--flag_size'], debug=args['--debug']),
filename, filename,
pagenos=p, pagenos=p,