Remove ncolumns everywhere
parent
edcf770d93
commit
3651fb2347
116
docs/stream.rst
116
docs/stream.rst
|
|
@ -59,122 +59,6 @@ Let's run it on this pdf.
|
|||
"","","","","Working..."
|
||||
"","","","","Febuary..."
|
||||
|
||||
But sometimes its guess can be incorrect, as in this case.
|
||||
|
||||
::
|
||||
|
||||
>>> from camelot.pdf import Pdf
|
||||
>>> from camelot.stream import Stream
|
||||
|
||||
>>> manager = Pdf(Stream(), 'missing_values.pdf')
|
||||
>>> tables = manager.extract()
|
||||
>>> print tables['page-1']['table-1']['data']
|
||||
|
||||
.. .. _this: insert link for missing_values.pdf
|
||||
|
||||
.. csv-table::
|
||||
|
||||
"Bhandara...","",""
|
||||
"","DLHS-4...","DLHS-3..."
|
||||
"Indicators","TOTAL","RURAL TOTAL RURAL"
|
||||
"Reported Prevalence of Morbidity","",""
|
||||
"Any Injury...","1.9","2.1"
|
||||
"Acute Illness...","4.5","5.6"
|
||||
"Chronic Illness...","5.1","4.1"
|
||||
"Reported Prevalence of Chronic Illness during last one year (%)","",""
|
||||
"Disease of respiratory system...","11.7","15.0"
|
||||
"Disease of cardiovascular system...","8.9","9.3"
|
||||
"Persons suffering from tuberculosis...","2.2","1.5"
|
||||
"Anaemia Status by Haemoglobin Level14 (%)","",""
|
||||
"Children (6-59 months) having anaemia...","68.5","71.9"
|
||||
"Children (6-59 months) having severe anaemia...","6.7","9.4"
|
||||
"Children (6-9 Years) having anaemia - Male...","67.1","71.4"
|
||||
"Children (6-9 Years) having severe anaemia - Male...","4.4","2.4"
|
||||
"Children (6-9 Years) having anaemia - Female...","52.4","48.8"
|
||||
"Children (6-9 Years) having severe anaemia - Female...","1.2","0.0"
|
||||
"Children (6-14 years) having anaemia - Male...","50.8","62.5"
|
||||
"Children (6-14 years) having severe anaemia - Male...","3.7","3.6"
|
||||
"Children (6-14 years) having anaemia - Female...","48.3","50.0"
|
||||
"Children (6-14 years) having severe anaemia - Female...","4.3","6.1"
|
||||
"Children (10-19 Years15) having anaemia - Male...","37.9","51.2"
|
||||
"Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0"
|
||||
"Children (10-19 Years15) having anaemia - Female...","46.6","52.1"
|
||||
"Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5"
|
||||
"Adolescents (15-19 years) having anaemia...","39.4","46.5"
|
||||
"Adolescents (15-19 years) having severe anaemia...","5.4","5.1"
|
||||
"Pregnant women (15-49 aged) having anaemia...","48.8","51.5"
|
||||
"Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8"
|
||||
"Women (15-49 aged) having anaemia...","45.2","51.7"
|
||||
"Women (15-49 aged) having severe anaemia...","4.8","5.9"
|
||||
"Persons (20 years and above) having anaemia...","37.8","42.1"
|
||||
"Persons (20 years and above) having Severe anaemia...","4.6","4.8"
|
||||
"Blood Sugar Level (age 18 years and above) (%)","",""
|
||||
"Blood Sugar Level >140 mg/dl (high)...","12.9","11.1"
|
||||
"Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1"
|
||||
"Hypertension (age 18 years and above) (%)","",""
|
||||
"Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8"
|
||||
"Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1"
|
||||
"Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1"
|
||||
"14...","",""
|
||||
"Chronic...","",""
|
||||
|
||||
It guessed that the pdf has 3 columns because most rows had no data in the last 2 columns. So, let's specify the number of columns explicitly; Stream will then consider only rows that have 5 words when deciding on column boundaries.
|
||||
|
||||
::
|
||||
|
||||
>>> from camelot.pdf import Pdf
|
||||
>>> from camelot.stream import Stream
|
||||
|
||||
>>> manager = Pdf(Stream(ncolumns=[5]), 'missing_values.pdf')
|
||||
>>> tables = manager.extract()
|
||||
>>> print tables['page-1']['table-1']['data']
|
||||
|
||||
.. csv-table::
|
||||
|
||||
"Bhandara...","","","",""
|
||||
"","DLHS-4...","DLHS-3...","",""
|
||||
"Indicators","TOTAL","RURAL","TOTAL","RURAL"
|
||||
"Reported Prevalence of Morbidity","","","",""
|
||||
"Any Injury...","1.9","2.1","",""
|
||||
"Acute Illness...","4.5","5.6","",""
|
||||
"Chronic Illness...","5.1","4.1","",""
|
||||
"Reported Prevalence of Chronic Illness during last one year (%)","","","",""
|
||||
"Disease of respiratory system...","11.7","15.0","",""
|
||||
"Disease of cardiovascular system...","8.9","9.3","",""
|
||||
"Persons suffering from tuberculosis...","2.2","1.5","",""
|
||||
"Anaemia Status by Haemoglobin Level14 (%)","","","",""
|
||||
"Children (6-59 months) having anaemia...","68.5","71.9","",""
|
||||
"Children (6-59 months) having severe anaemia...","6.7","9.4","",""
|
||||
"Children (6-9 Years) having anaemia - Male...","67.1","71.4","",""
|
||||
"Children (6-9 Years) having severe anaemia - Male...","4.4","2.4","",""
|
||||
"Children (6-9 Years) having anaemia - Female...","52.4","48.8","",""
|
||||
"Children (6-9 Years) having severe anaemia - Female...","1.2","0.0","",""
|
||||
"Children (6-14 years) having anaemia - Male...","50.8","62.5","",""
|
||||
"Children (6-14 years) having severe anaemia - Male...","3.7","3.6","",""
|
||||
"Children (6-14 years) having anaemia - Female...","48.3","50.0","",""
|
||||
"Children (6-14 years) having severe anaemia - Female...","4.3","6.1","",""
|
||||
"Children (10-19 Years15) having anaemia - Male...","37.9","51.2","",""
|
||||
"Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0","",""
|
||||
"Children (10-19 Years15) having anaemia - Female...","46.6","52.1","",""
|
||||
"Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5","",""
|
||||
"Adolescents (15-19 years) having anaemia...","39.4","46.5","",""
|
||||
"Adolescents (15-19 years) having severe anaemia...","5.4","5.1","",""
|
||||
"Pregnant women (15-49 aged) having anaemia...","48.8","51.5","",""
|
||||
"Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8","",""
|
||||
"Women (15-49 aged) having anaemia...","45.2","51.7","",""
|
||||
"Women (15-49 aged) having severe anaemia...","4.8","5.9","",""
|
||||
"Persons (20 years and above) having anaemia...","37.8","42.1","",""
|
||||
"Persons (20 years and above) having Severe anaemia...","4.6","4.8","",""
|
||||
"Blood Sugar Level (age 18 years and above) (%)","","","",""
|
||||
"Blood Sugar Level >140 mg/dl (high)...","12.9","11.1","",""
|
||||
"Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1","",""
|
||||
"Hypertension (age 18 years and above) (%)","","","",""
|
||||
"Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8","",""
|
||||
"Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1","",""
|
||||
"Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1","",""
|
||||
"14...","","","",""
|
||||
"Chronic...","","","",""
|
||||
|
||||
We can also specify the column x-coordinates. To find them, we need to call Stream with debug=True and use matplotlib's interface to note down the x-coordinates we need. Let's try it on this pdf file.
|
||||
|
||||
::
|
||||
|
|
|
|||
|
|
@ -1,8 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Stream
|
||||
|
||||
|
||||
extractor = Stream(Pdf("files/missing_values.pdf",
|
||||
char_margin=1.0, clean=True), ncolumns=5)
|
||||
tables = extractor.get_tables()
|
||||
print tables
|
||||
|
|
@ -443,16 +443,12 @@ if __name__ == '__main__':
|
|||
try:
|
||||
tarea = args['--tarea'] if args['--tarea'] else None
|
||||
columns = args['--columns'] if args['--columns'] else None
|
||||
if args['--ncols'] and args['--ncols'] != ['-1']:
|
||||
ncolumns = [int(nc) for nc in args['--ncols']]
|
||||
else:
|
||||
ncolumns = None
|
||||
header = args['--header'] if args['--header'] else None
|
||||
ytol = [int(y) for y in args['--ytol']]
|
||||
mtol = [int(m) for m in args['--mtol']]
|
||||
manager = Pdf(Stream(table_area=tarea, columns=columns,
|
||||
ncolumns=ncolumns, headers=header, ytol=ytol,
|
||||
mtol=mtol, margins=margins, split_text=args['--split_text'],
|
||||
headers=header, ytol=ytol, mtol=mtol,
|
||||
margins=margins, split_text=args['--split_text'],
|
||||
flag_size=args['--flag_size'], debug=args['--debug']),
|
||||
filename,
|
||||
pagenos=p,
|
||||
|
|
|
|||
Loading…
Reference in New Issue