diff --git a/docs/stream.rst b/docs/stream.rst index ca351fa..dd1aa1e 100644 --- a/docs/stream.rst +++ b/docs/stream.rst @@ -59,122 +59,6 @@ Let's run it on this pdf. "","","","","Working..." "","","","","Febuary..." -But sometimes its guess could be incorrect, like in this case. - -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.stream import Stream - - >>> manager = Pdf(Stream(), 'missing_values.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] - -.. .. _this: insert link for missing_values.pdf - -.. csv-table:: - - "Bhandara...","","" - "","DLHS-4...","DLHS-3..." - "Indicators","TOTAL","RURAL TOTAL RURAL" - "Reported Prevalence of Morbidity","","" - "Any Injury...","1.9","2.1" - "Acute Illness...","4.5","5.6" - "Chronic Illness...","5.1","4.1" - "Reported Prevalence of Chronic Illness during last one year (%)","","" - "Disease of respiratory system...","11.7","15.0" - "Disease of cardiovascular system...","8.9","9.3" - "Persons suffering from tuberculosis...","2.2","1.5" - "Anaemia Status by Haemoglobin Level14 (%)","","" - "Children (6-59 months) having anaemia...","68.5","71.9" - "Children (6-59 months) having severe anaemia...","6.7","9.4" - "Children (6-9 Years) having anaemia - Male...","67.1","71.4" - "Children (6-9 Years) having severe anaemia - Male...","4.4","2.4" - "Children (6-9 Years) having anaemia - Female...","52.4","48.8" - "Children (6-9 Years) having severe anaemia - Female...","1.2","0.0" - "Children (6-14 years) having anaemia - Male...","50.8","62.5" - "Children (6-14 years) having severe anaemia - Male...","3.7","3.6" - "Children (6-14 years) having anaemia - Female...","48.3","50.0" - "Children (6-14 years) having severe anaemia - Female...","4.3","6.1" - "Children (10-19 Years15) having anaemia - Male...","37.9","51.2" - "Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0" - "Children (10-19 Years15) having anaemia - Female...","46.6","52.1" - "Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5" - "Adolescents (15-19 years) having anaemia...","39.4","46.5" - "Adolescents (15-19 years) having severe anaemia...","5.4","5.1" - "Pregnant women (15-49 aged) having anaemia...","48.8","51.5" - "Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8" - "Women (15-49 aged) having anaemia...","45.2","51.7" - "Women (15-49 aged) having severe anaemia...","4.8","5.9" - "Persons (20 years and above) having anaemia...","37.8","42.1" - "Persons (20 years and above) having Severe anaemia...","4.6","4.8" - "Blood Sugar Level (age 18 years and above) (%)","","" - "Blood Sugar Level >140 mg/dl (high)...","12.9","11.1" - "Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1" - "Hypertension (age 18 years and above) (%)","","" - "Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8" - "Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1" - "Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1" - "14...","","" - "Chronic...","","" - -It guessed that the pdf has 3 columns, because there wasn't any data in the last 2 columns for most rows. So, let's specify the number of columns explicitly, following which, Stream will only consider rows that have 5 words, to decide on column boundaries. - -:: - - >>> from camelot.pdf import Pdf - >>> from camelot.stream import Stream - - >>> manager = Pdf(Stream(ncolumns=[5]), 'missing_values.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] - -.. csv-table:: - - "Bhandara...","","","","" - "","DLHS-4...","DLHS-3...","","" - "Indicators","TOTAL","RURAL","TOTAL","RURAL" - "Reported Prevalence of Morbidity","","","","" - "Any Injury...","1.9","2.1","","" - "Acute Illness...","4.5","5.6","","" - "Chronic Illness...","5.1","4.1","","" - "Reported Prevalence of Chronic Illness during last one year (%)","","","","" - "Disease of respiratory system...","11.7","15.0","","" - "Disease of cardiovascular system...","8.9","9.3","","" - "Persons suffering from tuberculosis...","2.2","1.5","","" - "Anaemia Status by Haemoglobin Level14 (%)","","","","" - "Children (6-59 months) having anaemia...","68.5","71.9","","" - "Children (6-59 months) having severe anaemia...","6.7","9.4","","" - "Children (6-9 Years) having anaemia - Male...","67.1","71.4","","" - "Children (6-9 Years) having severe anaemia - Male...","4.4","2.4","","" - "Children (6-9 Years) having anaemia - Female...","52.4","48.8","","" - "Children (6-9 Years) having severe anaemia - Female...","1.2","0.0","","" - "Children (6-14 years) having anaemia - Male...","50.8","62.5","","" - "Children (6-14 years) having severe anaemia - Male...","3.7","3.6","","" - "Children (6-14 years) having anaemia - Female...","48.3","50.0","","" - "Children (6-14 years) having severe anaemia - Female...","4.3","6.1","","" - "Children (10-19 Years15) having anaemia - Male...","37.9","51.2","","" - "Children (10-19 Years15) having severe anaemia - Male...","3.5","4.0","","" - "Children (10-19 Years15) having anaemia - Female...","46.6","52.1","","" - "Children (10-19 Years15) having severe anaemia - Female...","6.4","6.5","","" - "Adolescents (15-19 years) having anaemia...","39.4","46.5","","" - "Adolescents (15-19 years) having severe anaemia...","5.4","5.1","","" - "Pregnant women (15-49 aged) having anaemia...","48.8","51.5","","" - "Pregnant women (15-49 aged) having severe anaemia...","7.1","8.8","","" - "Women (15-49 aged) having anaemia...","45.2","51.7","","" - "Women (15-49 aged) having severe anaemia...","4.8","5.9","","" - "Persons (20 years and above) having anaemia...","37.8","42.1","","" - "Persons (20 years and above) having Severe anaemia...","4.6","4.8","","" - "Blood Sugar Level (age 18 years and above) (%)","","","","" - "Blood Sugar Level >140 mg/dl (high)...","12.9","11.1","","" - "Blood Sugar Level >160 mg/dl (very high)...","7.0","5.1","","" - "Hypertension (age 18 years and above) (%)","","","","" - "Above Normal Range (Systolic >140 mm of Hg & Diastolic >90 mm of Hg )...","23.8","22.8","","" - "Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg )...","8.2","7.1","","" - "Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg )...","3.7","3.1","","" - "14...","","","","" - "Chronic...","","","","" - We can also specify the column x-coordinates. We need to call Stream with debug=True and use matplotlib's interface to note down the column x-coordinates we need. Let's try it on this pdf file. :: diff --git a/examples/demo_stream_ncolumns.py b/examples/demo_stream_ncolumns.py deleted file mode 100644 index b220bf1..0000000 --- a/examples/demo_stream_ncolumns.py +++ /dev/null @@ -1,8 +0,0 @@ -from camelot import Pdf -from camelot import Stream - - -extractor = Stream(Pdf("files/missing_values.pdf", - char_margin=1.0, clean=True), ncolumns=5) -tables = extractor.get_tables() -print tables diff --git a/tools/camelot b/tools/camelot index dc4cf4b..c6a48f7 100755 --- a/tools/camelot +++ b/tools/camelot @@ -443,16 +443,12 @@ if __name__ == '__main__': try: tarea = args['--tarea'] if args['--tarea'] else None columns = args['--columns'] if args['--columns'] else None - if args['--ncols'] and args['--ncols'] != ['-1']: - ncolumns = [int(nc) for nc in args['--ncols']] - else: - ncolumns = None header = args['--header'] if args['--header'] else None ytol = [int(y) for y in args['--ytol']] mtol = [int(m) for m in args['--mtol']] manager = Pdf(Stream(table_area=tarea, columns=columns, - ncolumns=ncolumns, headers=header, ytol=ytol, - mtol=mtol, margins=margins, split_text=args['--split_text'], + headers=header, ytol=ytol, mtol=mtol, + margins=margins, split_text=args['--split_text'], flag_size=args['--flag_size'], debug=args['--debug']), filename, pagenos=p,