From d6628197551e1e0c5ba173a2baf7cb4a76bd69de Mon Sep 17 00:00:00 2001 From: Emmanuel Arias Date: Sun, 14 Oct 2018 21:00:33 -0300 Subject: [PATCH 1/4] Add usage example to cli --- camelot/cli.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/camelot/cli.py b/camelot/cli.py index e978a3c..9f4e038 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -177,3 +177,46 @@ def stream(c, *args, **kwargs): plt.show() else: tables.export(output, f=f, compress=compress) + + +@cli.command('examples') +def examples(*arg, **kwargs): + """Usage example""" + sample = """ + >>> import camelot + >>> tables = camelot.read_pdf('foo.pdf') + >>> tables + <TableList n=1> + >>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html + >>> tables[0] + <Table shape=(7, 7)> + >>> tables[0].parsing_report + { + 'accuracy': 99.02, + 'whitespace': 12.24, + 'order': 1, + 'page': 1 + } + >>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html + >>> tables[0].df # get a pandas DataFrame! + + |-------|-----------|---------------|--------------|-----------|------------|-----------| + | Cycle | KI (1/km) | Distance (mi) | Percent | | | | + | Name | | | Fuel Savings | | | | + |-------|-----------|---------------|--------------|-----------|------------|-----------| + | | | | Improved | Decreased | Eliminate | Decreased | + | | | | Speed | Accel | Stops | Idle | + |-------|-----------|---------------|--------------|-----------|------------|-----------| + | 2012_2| 3.30 | 1.3 | 5.9% | 9.5% | 29.2% | 17.4% | + |-------|-----------|---------------|--------------|-----------|------------|-----------| + | 2145_1| 0.68 | 11.2 | 2.4% | 0.1% | 9.5% | 2.7% | + |-------|-----------|---------------|--------------|-----------|------------|-----------| + | 4234_1| 0.59 | 58.7 | 8.5% | 1.3% | 8.5% | 3.3% | + |-------|-----------|---------------|--------------|-----------|------------|-----------| + | 2032_2| 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% | +
|-------|-----------|---------------|--------------|-----------|------------|-----------| + | 4171_1| 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% | + |-------|-----------|---------------|--------------|-----------|------------|-----------| + + """ + print(sample) From 2dc48f43d6e1a86a2716f7d2cbd6703e9a6a539f Mon Sep 17 00:00:00 2001 From: Emmanuel Arias Date: Tue, 16 Oct 2018 22:46:12 -0300 Subject: [PATCH 2/4] Add CLI documentation, clean cli example command --- camelot/cli.py | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index 9f4e038..e978a3c 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -177,46 +177,3 @@ def stream(c, *args, **kwargs): plt.show() else: tables.export(output, f=f, compress=compress) - - -@cli.command('examples') -def examples(*arg, **kwargs): - """Usage example""" - sample = """ - >>> import camelot - >>> tables = camelot.read_pdf('foo.pdf') - >>> tables - <TableList n=1> - >>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html - >>> tables[0] - <Table shape=(7, 7)>
- >>> tables[0].parsing_report - { - 'accuracy': 99.02, - 'whitespace': 12.24, - 'order': 1, - 'page': 1 - } - >>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html - >>> tables[0].df # get a pandas DataFrame! - - |-------|-----------|---------------|--------------|-----------|------------|-----------| - | Cycle | KI (1/km) | Distance (mi) | Percent | | | | - | Name | | | Fuel Savings | | | | - |-------|-----------|---------------|--------------|-----------|------------|-----------| - | | | | Improved | Decreased | Eliminate | Decreased | - | | | | Speed | Accel | Stops | Idle | - |-------|-----------|---------------|--------------|-----------|------------|-----------| - | 2012_2| 3.30 | 1.3 | 5.9% | 9.5% | 29.2% | 17.4% | - |-------|-----------|---------------|--------------|-----------|------------|-----------| - | 2145_1| 0.68 | 11.2 | 2.4% | 0.1% | 9.5% | 2.7% | - |-------|-----------|---------------|--------------|-----------|------------|-----------| - | 4234_1| 0.59 | 58.7 | 8.5% | 1.3% | 8.5% | 3.3% | - |-------|-----------|---------------|--------------|-----------|------------|-----------| - | 2032_2| 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% | - |-------|-----------|---------------|--------------|-----------|------------|-----------| - | 4171_1| 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% | - |-------|-----------|---------------|--------------|-----------|------------|-----------| - - """ - print(sample) From 3ef50f6f8d828a4ff3980523ad110bcd56c361a1 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 14 Dec 2018 12:57:32 +0530 Subject: [PATCH 3/4] Fix cli.rst --- docs/user/cli.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/cli.rst b/docs/user/cli.rst index 81dd0bc..a7113f5 100644 --- a/docs/user/cli.rst +++ b/docs/user/cli.rst @@ -15,7 +15,7 @@ You can print the help for the interface by typing ``camelot --help`` in your fa Options: --version Show the version and exit. - -v, --verbose Verbose. 
+ -q, --quiet TEXT Suppress logs and warnings. -p, --pages TEXT Comma-separated page numbers. Example: 1,3,4 or 1,4-end. -pw, --password TEXT Password for decryption. From eb7be9c8e60ffb303b5befaa0fea69107ac41c61 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Fri, 14 Dec 2018 13:39:05 +0530 Subject: [PATCH 4/4] Add equivalent CLI examples --- docs/user/advanced.rst | 90 ++++++++++++++++++++++++++++++++++++++++ docs/user/quickstart.rst | 18 ++++++++ 2 files changed, 108 insertions(+) diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index d2c8b35..37e8d01 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -24,6 +24,12 @@ To process background lines, you can pass ``process_background=True``. >>> tables = camelot.read_pdf('background_lines.pdf', process_background=True) >>> tables[1].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -back background_lines.pdf + .. csv-table:: :file: ../_static/csv/background_lines.csv @@ -63,6 +69,12 @@ Let's plot all the text present on the table's PDF page. >>> camelot.plot(tables[0], kind='text') >>> plt.show() +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -plot text foo.pdf + .. figure:: ../_static/png/plot_text.png :height: 674 :width: 1366 @@ -84,6 +96,12 @@ Let's plot the table (to see if it was detected correctly or not). This plot typ >>> camelot.plot(tables[0], kind='grid') >>> plt.show() +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -plot grid foo.pdf + .. figure:: ../_static/png/plot_table.png :height: 674 :width: 1366 @@ -103,6 +121,12 @@ Now, let's plot all table boundaries present on the table's PDF page. >>> camelot.plot(tables[0], kind='contour') >>> plt.show() +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -plot contour foo.pdf + ..
figure:: ../_static/png/plot_contour.png :height: 674 :width: 1366 @@ -120,6 +144,12 @@ Cool, let's plot all line segments present on the table's PDF page. >>> camelot.plot(tables[0], kind='line') >>> plt.show() +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -plot line foo.pdf + .. figure:: ../_static/png/plot_line.png :height: 674 :width: 1366 @@ -137,6 +167,12 @@ Finally, let's plot all line intersections present on the table's PDF page. >>> camelot.plot(tables[0], kind='joint') >>> plt.show() +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -plot joint foo.pdf + .. figure:: ../_static/png/plot_joint.png :height: 674 :width: 1366 @@ -154,6 +190,12 @@ You can also visualize the textedges found on a page by specifying ``kind='texte >>> camelot.plot(tables[0], kind='textedge') >>> plt.show() +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot stream -plot textedge foo.pdf + .. figure:: ../_static/png/plot_textedge.png :height: 674 :width: 1366 @@ -175,6 +217,12 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337']) >>> tables[0].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot stream -T 316,499,566,337 table_areas.pdf + .. csv-table:: :file: ../_static/csv/table_areas.csv @@ -196,6 +244,12 @@ Let's get back to the *x* coordinates we got from plotting the text that exists >>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683']) >>> tables[0].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot stream -C 72,95,209,327,442,529,566,606,683 column_separators.pdf + ..
csv-table:: "...","...","...","...","...","...","...","...","...","..." @@ -215,6 +269,12 @@ To deal with cases like the output from the previous section, you can pass ``spl >>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683'], split_text=True) >>> tables[0].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot -split stream -C 72,95,209,327,442,529,566,606,683 column_separators.pdf + .. csv-table:: "...","...","...","...","...","...","...","...","...","..." @@ -242,6 +302,12 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc >>> tables = camelot.read_pdf('superscript.pdf', flavor='stream', flag_size=True) >>> tables[0].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot -flag stream superscript.pdf + .. csv-table:: "...","...","...","...","...","...","...","...","...","...","..." @@ -274,6 +340,12 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_close_tol=10) >>> tables[0].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot stream -r 10 group_rows.pdf + .. csv-table:: "Clave","Nombre Entidad","Clave","","Nombre Municipio","Clave","Nombre Localidad" @@ -317,6 +389,12 @@ Clearly, the smaller lines separating the headers, couldn't be detected. Let's t >>> camelot.plot(tables[0], kind='grid') >>> plt.show() +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -scale 40 -plot grid short_lines.pdf + ..
figure:: ../_static/png/short_lines_2.png :alt: An improved plot of the PDF table with short lines :align: left @@ -380,6 +458,12 @@ No surprises there — it did remain in place (observe the strings "2400" and "A >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b']) >>> tables[0].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -scale 40 -shift r -shift b short_lines.pdf + .. csv-table:: "Investigations","No. ofHHs","Age/Sex/Physiological Group","Preva-lence","C.I*","RelativePrecision","Sample sizeper State" @@ -425,6 +509,12 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in >>> tables = camelot.read_pdf('copy_text.pdf', copy_text=['v']) >>> tables[0].df +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot lattice -copy v copy_text.pdf + .. csv-table:: "Sl. No.","Name of State/UT","Name of District","Disease/ Illness","No. of Cases","No. of Deaths","Date of start of outbreak","Date of reporting","Current Status","..." diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst index 5fb5bc0..d9d704a 100644 --- a/docs/user/quickstart.rst +++ b/docs/user/quickstart.rst @@ -70,6 +70,12 @@ You can also export all tables at once, using the :class:`tables <camelot.core.T >>> tables.export('foo.csv', f='csv') +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot --format csv --output foo.csv lattice foo.pdf + This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``. .. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created.
To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files. @@ -85,6 +91,12 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp >>> camelot.read_pdf('your.pdf', pages='1,2,3') +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot --pages 1,2,3 lattice your.pdf + The ``pages`` keyword argument accepts pages as comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``. Reading encrypted PDFs @@ -98,6 +110,12 @@ To extract tables from encrypted PDF files you must provide a password when call >>> tables <TableList n=1> +.. tip:: + Here's how you can do the same with the :ref:`command-line interface <cli>`. + :: + + $ camelot --password userpass lattice foo.pdf + Currently Camelot only supports PDFs encrypted with ASCII passwords and algorithm `code 1 or 2`_. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm. Further encryption support may be added in future, however in the meantime if your PDF files are using unsupported encryption algorithms you are advised to remove encryption before calling :meth:`read_pdf() <camelot.read_pdf>`. This can been successfully achieved with third-party tools such as `QPDF`_.