diff --git a/camelot/cli.py b/camelot/cli.py index 80e04d6..98bb681 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -55,7 +55,7 @@ class Mutex(click.Option): multiple=True, cls=Mutex, help="(with --mesh) Specify direction" " in which text will be copied over in a spanning cell.") @click.option("-shift", "--shift_text", default=["l", "t"], - type=click.Choice(["l", "r", "t", "b"]), multiple=True, cls=Mutex, + type=click.Choice(["", "l", "r", "t", "b"]), multiple=True, cls=Mutex, help="(with --mesh) Specify direction in which text in a spanning" " cell should flow.") @click.option("-l", "--line_close_tol", default=2, cls=Mutex, diff --git a/docs/_static/csv/background_lines.csv b/docs/_static/csv/background_lines.csv new file mode 100755 index 0000000..274bd62 --- /dev/null +++ b/docs/_static/csv/background_lines.csv @@ -0,0 +1,8 @@ +"State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV" +"Delhi","1.12.2009","8","17","1.29","3,665","2,409","1,000" +"Rajasthan","2.12.2009 to 19.12.2009","","","","","","" +"Gujarat","20.12.2009 to 3.1.2010","6","13","6.03","3,810","2,317","1,453" +"Maharashtra","4.01.2010 to 1.2.2010","13","26","1.27","5,680","9,027","4,153" +"Karnataka","2.2.2010 to 22.2.2010","11","19","1.80","5,741","3,658","3,183" +"Kerala","23.2.2010 to 11.3.2010","9","17","1.42","3,559","2,173","855" +"Total","","47","92","11.81","22,455","19,584","10,644" diff --git a/docs/_static/csv/table_areas.csv b/docs/_static/csv/table_areas.csv new file mode 100755 index 0000000..13b1dbc --- /dev/null +++ b/docs/_static/csv/table_areas.csv @@ -0,0 +1,11 @@ +"","One Withholding" +"Payroll Period","Allowance" +"Weekly","$71.15" +"Biweekly","142.31" +"Semimonthly","154.17" +"Monthly","308.33" +"Quarterly","925.00" +"Semiannually","1,850.00" +"Annually","3,700.00" +"Daily or Miscellaneous","14.23" +"(each day of the payroll period)","" diff --git a/docs/_static/pdf/background_lines.pdf 
b/docs/_static/pdf/background_lines.pdf new file mode 100755 index 0000000..f23d6b7 Binary files /dev/null and b/docs/_static/pdf/background_lines.pdf differ diff --git a/docs/_static/pdf/column_separators.pdf b/docs/_static/pdf/column_separators.pdf new file mode 100755 index 0000000..cecd7b6 Binary files /dev/null and b/docs/_static/pdf/column_separators.pdf differ diff --git a/docs/_static/pdf/copy_text.pdf b/docs/_static/pdf/copy_text.pdf new file mode 100644 index 0000000..39bce84 Binary files /dev/null and b/docs/_static/pdf/copy_text.pdf differ diff --git a/docs/_static/pdf/group_rows.pdf b/docs/_static/pdf/group_rows.pdf new file mode 100755 index 0000000..46cd236 Binary files /dev/null and b/docs/_static/pdf/group_rows.pdf differ diff --git a/docs/_static/pdf/short_lines.pdf b/docs/_static/pdf/short_lines.pdf new file mode 100755 index 0000000..5cab903 Binary files /dev/null and b/docs/_static/pdf/short_lines.pdf differ diff --git a/docs/_static/pdf/superscript.pdf b/docs/_static/pdf/superscript.pdf new file mode 100755 index 0000000..855a3bd Binary files /dev/null and b/docs/_static/pdf/superscript.pdf differ diff --git a/docs/_static/pdf/table_areas.pdf b/docs/_static/pdf/table_areas.pdf new file mode 100755 index 0000000..45b3de3 Binary files /dev/null and b/docs/_static/pdf/table_areas.pdf differ diff --git a/docs/_static/png/background_lines.png b/docs/_static/png/background_lines.png new file mode 100755 index 0000000..5165312 Binary files /dev/null and b/docs/_static/png/background_lines.png differ diff --git a/docs/_static/png/contour.png b/docs/_static/png/geometry_contour.png similarity index 100% rename from docs/_static/png/contour.png rename to docs/_static/png/geometry_contour.png diff --git a/docs/_static/png/intersection.png b/docs/_static/png/geometry_joint.png similarity index 100% rename from docs/_static/png/intersection.png rename to docs/_static/png/geometry_joint.png diff --git a/docs/_static/png/line.png 
b/docs/_static/png/geometry_line.png similarity index 100% rename from docs/_static/png/line.png rename to docs/_static/png/geometry_line.png diff --git a/docs/_static/png/table_span.png b/docs/_static/png/geometry_table.png similarity index 100% rename from docs/_static/png/table_span.png rename to docs/_static/png/geometry_table.png diff --git a/docs/_static/png/geometry_text.png b/docs/_static/png/geometry_text.png new file mode 100755 index 0000000..47b5608 Binary files /dev/null and b/docs/_static/png/geometry_text.png differ diff --git a/docs/_static/png/scale_1.png b/docs/_static/png/scale_1.png deleted file mode 100644 index e9023e0..0000000 Binary files a/docs/_static/png/scale_1.png and /dev/null differ diff --git a/docs/_static/png/scale_2.png b/docs/_static/png/scale_2.png deleted file mode 100644 index 798fd2a..0000000 Binary files a/docs/_static/png/scale_2.png and /dev/null differ diff --git a/docs/_static/png/short_lines.png b/docs/_static/png/short_lines.png new file mode 100755 index 0000000..395e834 Binary files /dev/null and b/docs/_static/png/short_lines.png differ diff --git a/docs/_static/png/short_lines_1.png b/docs/_static/png/short_lines_1.png new file mode 100644 index 0000000..adbcf4f Binary files /dev/null and b/docs/_static/png/short_lines_1.png differ diff --git a/docs/_static/png/short_lines_2.png b/docs/_static/png/short_lines_2.png new file mode 100755 index 0000000..8eed12f Binary files /dev/null and b/docs/_static/png/short_lines_2.png differ diff --git a/docs/_static/png/superscript.png b/docs/_static/png/superscript.png new file mode 100755 index 0000000..d798aa7 Binary files /dev/null and b/docs/_static/png/superscript.png differ diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 607e8d0..6502931 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -8,96 +8,314 @@ This page covers some of the more advanced configurations for :ref:`Stream >> from camelot.pdf import Pdf - >>> from camelot.lattice 
import Lattice + >>> tables = camelot.read_pdf('background_lines.pdf', mesh=True, process_background=True) + >>> tables[1].df - >>> manager = Pdf(Lattice(invert=True), 'lines_in_background_1.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] +.. csv-table:: + :file: ../_static/csv/background_lines.csv Plot geometry ------------- -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/stream/us-007/us-007.pdf +_static/pdf/foo.pdf + +:: + + >>> camelot.plot_geometry('foo.pdf', geometry_type='text') + +.. figure:: ../_static/png/geometry_text.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +:: + + >>> camelot.plot_geometry('foo.pdf', geometry_type='line') + +.. figure:: ../_static/png/geometry_line.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +:: + + >>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='joint') + +.. figure:: ../_static/png/geometry_joint.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +:: + + >>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='contour') + +.. figure:: ../_static/png/geometry_contour.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left + +:: + + >>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='table') + +.. figure:: ../_static/png/geometry_table.png + :height: 674 + :width: 1366 + :scale: 50% + :align: left You can call Lattice with debug={'line', 'intersection', 'contour', 'table'}, and call `debug_plot()` which will generate an image like the ones on this page, with the help of which you can modify various parameters. See :doc:`API doc ` for more information. Specify table areas ------------------- -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/stream/us-007/us-007.pdf +_static/pdf/table_areas.pdf + +:: + + >>> tables = camelot.read_pdf('table_areas.pdf', table_areas=['316,499,566,337']) + >>> tables[0].df + +.. 
csv-table:: + :file: ../_static/csv/table_areas.csv Specify column separators ------------------------- -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/stream/m27/m27.pdf +_static/pdf/column_separators.pdf + +:: + + >>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683']) + >>> tables[0].df + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","..." + "LICENSE","","","","PREMISE","","","","","" + "NUMBER TYPE DBA NAME","","","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES" + "...","...","...","...","...","...","...","...","...","..." Split text along separators --------------------------- -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/stream/m27/m27.pdf +_static/pdf/column_separators.pdf + +:: + + >>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683'], split_text=True) + >>> tables[0].df + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","..." + "LICENSE","","","","PREMISE","","","","","" + "NUMBER","TYPE","DBA NAME","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES" + "...","...","...","...","...","...","...","...","...","..." Flag subscripts and superscripts -------------------------------- -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/stream/superscript/superscript.pdf +_static/pdf/superscript.pdf + +.. figure:: ../_static/png/superscript.png + :align: left + +:: + + >>> tables = camelot.read_pdf('superscript.pdf', flag_size=True) + >>> tables[0].df + +.. csv-table:: + + "...","...","...","...","...","...","...","...","...","...","..." + "Karnataka","22.44","19.59","-","-","2.86","1.22","-","0.89","-","0.69" + "Kerala","29.03","24.912","-","-","4.11","1.77","-","0.48","-","1.45" + "Madhya Pradesh","27.13","23.57","-","-","3.56","0.38","-","1.86","-","1.28" + "...","...","...","...","...","...","...","...","...","...","..." 
Control how text is grouped into rows ------------------------------------- -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/stream/mexican_towns/mexican_towns.pdf +_static/pdf/group_rows.pdf + +:: + + >>> tables = camelot.read_pdf('group_rows.pdf') + >>> tables[0].df + +.. csv-table:: + + "Clave","","Clave","","","Clave","" + "","Nombre Entidad","","","Nombre Municipio","","Nombre Localidad" + "Entidad","","Municipio","","","Localidad","" + "01","Aguascalientes","001","Aguascalientes","","0094","Granja Adelita" + "01","Aguascalientes","001","Aguascalientes","","0096","Agua Azul" + "01","Aguascalientes","001","Aguascalientes","","0100","Rancho Alegre" + +:: + + >>> tables = camelot.read_pdf('group_rows.pdf', row_close_tol=10) + >>> tables[0].df + +.. csv-table:: + + "Clave","Nombre Entidad","Clave","","Nombre Municipio","Clave","Nombre Localidad" + "Entidad","","Municipio","","","Localidad","" + "01","Aguascalientes","001","Aguascalientes","","0094","Granja Adelita" + "01","Aguascalientes","001","Aguascalientes","","0096","Agua Azul" + "01","Aguascalientes","001","Aguascalientes","","0100","Rancho Alegre" Detect short lines ------------------ -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/lattice/row_span/row_span.pdf +_static/pdf/short_lines.pdf The scale parameter is used to determine the length of the structuring element used for morphological transformations. The lengths of the vertical and horizontal structuring elements are found by dividing the image's height and width, respectively, by `scale`. A larger `scale` leads to a smaller structuring element, which means that smaller lines will be detected. The default value for scale is 15. -Let's consider this pdf file. +.. figure:: ../_static/png/short_lines.png + :align: left -.. .. _this: insert link for row_span_1.pdf +:: -.. 
image:: ../_static/png/scale_1.png - :height: 674 - :width: 1366 - :scale: 50% + >>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table') + +.. figure:: ../_static/png/short_lines_1.png :align: left Clearly, it couldn't detect those small lines in the lower left part. Therefore, we need to increase the value of scale. Let's try a value of 40. -.. image:: ../_static/png/scale_2.png - :height: 674 - :width: 1366 - :scale: 50% +:: + + >>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table', line_size_scaling=40) + +.. figure:: ../_static/png/short_lines_2.png :align: left +:: + + >>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40) + >>> tables[0].df + +.. csv-table:: + + "Investigations","No. ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" + "Anthropometry","2400","All ...","","","","" + "Clinical Examination","","","","","","" + "History of morbidity","","","","","","" + "Diet survey","1200","All ...","","","","" + "Blood Pressure #","2400","Men (≥ 18yrs)","10%","95%","20%","1728" + "","","Women (≥ 18 yrs)","","","","1728" + "Fasting blood glucose","2400","Men (≥ 18 yrs)","5%","95%","20%","1825" + "","","Women (≥ 18 yrs)","","","","1825" + "Knowledge &Practices on HTN &DM","2400","Men (≥ 18 yrs)","-","-","-","1728" + "","2400","Women (≥ 18 yrs)","-","-","-","1728" + +beware + Voila! It detected the smaller lines. +Shift text in spanning cells +---------------------------- + +in order + +_static/pdf/short_lines.pdf + +.. figure:: ../_static/png/short_lines.png + :align: left + +:: + + >>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=['']) + >>> tables[0].df + +.. csv-table:: + + "Investigations","No. 
ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" + "Anthropometry","","","","","","" + "Clinical Examination","2400","","All ...","","","" + "History of morbidity","","","","","","" + "Diet survey","1200","","All ...","","","" + "","","Men (≥ 18yrs)","","","","1728" + "Blood Pressure #","2400","Women (≥ 18 yrs)","10%","95%","20%","1728" + "","","Men (≥ 18 yrs)","","","","1825" + "Fasting blood glucose","2400","Women (≥ 18 yrs)","5%","95%","20%","1825" + "Knowledge &Practices on HTN &","2400","Men (≥ 18 yrs)","-","-","-","1728" + "DM","2400","Women (≥ 18 yrs)","-","-","-","1728" + +:: + + >>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=['r', 'b']) + >>> tables[0].df + +.. csv-table:: + + "Investigations","No. ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" + "Anthropometry","","","","","","" + "Clinical Examination","","","","","","" + "History of morbidity","2400","","","","","All ..." + "Diet survey","1200","","","","","All ..." + "","","Men (≥ 18yrs)","","","","1728" + "Blood Pressure #","2400","Women (≥ 18 yrs)","10%","95%","20%","1728" + "","","Men (≥ 18 yrs)","","","","1825" + "Fasting blood glucose","2400","Women (≥ 18 yrs)","5%","95%","20%","1825" + "","2400","Men (≥ 18 yrs)","-","-","-","1728" + "Knowledge &Practices on HTN &DM","2400","Women (≥ 18 yrs)","-","-","-","1728" Copy text in spanning cells --------------------------- -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/lattice/row_span/row_span.pdf +in order + +_static/pdf/copy_text.pdf In the file used above, you can see that some cells spanned a lot of rows, `fill` just copies the same value to all rows/columns of a spanning cell. You can apply fill horizontally, vertically or both. Let us fill the output for the file we used above, vertically. 
:: - >>> from camelot.pdf import Pdf - >>> from camelot.lattice import Lattice + >>> camelot.read_pdf('copy_text.pdf', mesh=True) - >>> manager = Pdf(Lattice(fill=['v'], scale=40), 'row_span_1.pdf') - >>> tables = manager.extract() - >>> print tables['page-1']['table-1']['data'] +.. csv-table:: -Shift text in spanning cells ----------------------------- + "Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","..." + "1","Kerala","Kollam","i. Food Poisoning","19","0","31/12/13","03/01/14","Under control","..." + "2","Maharashtra","Beed","i. Dengue & Chikungunya i","11","0","03/01/14","04/01/14","Under control","..." + "3","Odisha","Kalahandi","iii. Food Poisoning","42","0","02/01/14","03/01/14","Under control","..." + "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..." + "","","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..." + "","","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..." -https://github.com/socialcopsdev/camelot/blob/docs/docs/benchmark/stream/m27/m27.pdf \ No newline at end of file +:: + + >>> camelot.read_pdf('copy_text.pdf', mesh=True, copy_text=['v']) + +.. csv-table:: + + "Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","..." + "1","Kerala","Kollam","i. Food Poisoning","19","0","31/12/13","03/01/14","Under control","..." + "2","Maharashtra","Beed","i. Dengue & Chikungunya i","11","0","03/01/14","04/01/14","Under control","..." + "3","Odisha","Kalahandi","iii. Food Poisoning","42","0","02/01/14","03/01/14","Under control","..." + "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..." + "4","West Bengal","Birbhum","v. 
Food Poisoning","199","0","31/12/13","31/12/13","Under control","..." + "4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..." \ No newline at end of file diff --git a/docs/user/how-it-works.rst b/docs/user/how-it-works.rst index f9ced07..491a8d8 100644 --- a/docs/user/how-it-works.rst +++ b/docs/user/how-it-works.rst @@ -39,7 +39,7 @@ Let's see how Lattice processes the `second page of this PDF`_, step-by-step. 1. Line segments are detected. -.. image:: ../_static/png/line.png +.. image:: ../_static/png/geometry_line.png :height: 674 :width: 1366 :scale: 50% @@ -49,7 +49,7 @@ Let's see how Lattice processes the `second page of this PDF`_, step-by-step. .. _and: https://en.wikipedia.org/wiki/Logical_conjunction -.. image:: ../_static/png/intersection.png +.. image:: ../_static/png/geometry_joint.png :height: 674 :width: 1366 :scale: 50% @@ -59,7 +59,7 @@ Let's see how Lattice processes the `second page of this PDF`_, step-by-step. .. _or: https://en.wikipedia.org/wiki/Logical_disjunction -.. image:: ../_static/png/contour.png +.. image:: ../_static/png/geometry_contour.png :height: 674 :width: 1366 :scale: 50% @@ -75,7 +75,7 @@ Let's see how Lattice processes the `second page of this PDF`_, step-by-step. 5. Spanning cells are detected using the line segments and line intersections. -.. image:: ../_static/png/table_span.png +.. image:: ../_static/png/geometry_table.png :height: 674 :width: 1366 :scale: 50%