Update docs

pull/2/head
Vinayak Mehta 2018-09-23 14:04:21 +05:30
parent 959a252aa3
commit a70befe528
13 changed files with 93 additions and 121 deletions

View File

@ -12,7 +12,7 @@
<pre> <pre>
>>> import camelot >>> import camelot
>>> tables = camelot.read_pdf('foo.pdf', mesh=True) >>> tables = camelot.read_pdf('foo.pdf')
>>> tables >>> tables
&lt;TableList tables=1&gt; &lt;TableList tables=1&gt;
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html >>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html

View File

@ -82,7 +82,7 @@ def cli(ctx, *args, **kwargs):
@click.argument("filepath", type=click.Path(exists=True)) @click.argument("filepath", type=click.Path(exists=True))
@pass_config @pass_config
def lattice(c, *args, **kwargs): def lattice(c, *args, **kwargs):
"""Use lines between text to generate table.""" """Use lines between text to parse table."""
conf = c.config conf = c.config
pages = conf.pop("pages") pages = conf.pop("pages")
output = conf.pop("output") output = conf.pop("output")
@ -127,7 +127,7 @@ def lattice(c, *args, **kwargs):
@click.argument("filepath", type=click.Path(exists=True)) @click.argument("filepath", type=click.Path(exists=True))
@pass_config @pass_config
def stream(c, *args, **kwargs): def stream(c, *args, **kwargs):
"""Use spaces between text to generate table.""" """Use spaces between text to parse table."""
conf = c.config conf = c.config
pages = conf.pop("pages") pages = conf.pop("pages")
output = conf.pop("output") output = conf.pop("output")

View File

@ -85,6 +85,10 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
tables : camelot.core.TableList tables : camelot.core.TableList
""" """
if flavor not in ['lattice', 'stream']:
raise NotImplementedError("Unknown flavor specified."
" Use either 'lattice' or 'stream'")
validate_input(kwargs, flavor=flavor) validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
kwargs = remove_extra(kwargs, flavor=flavor) kwargs = remove_extra(kwargs, flavor=flavor)

View File

@ -21,7 +21,7 @@ logger = setup_logging(__name__)
class Lattice(BaseParser): class Lattice(BaseParser):
"""Lattice method of parsing looks for lines between text """Lattice method of parsing looks for lines between text
to generate table. to parse table.
Parameters Parameters
---------- ----------

View File

@ -16,7 +16,7 @@ logger = setup_logging(__name__)
class Stream(BaseParser): class Stream(BaseParser):
"""Stream method of parsing looks for spaces between text """Stream method of parsing looks for spaces between text
to generate table. to parse table.
If you want to specify columns when specifying multiple table If you want to specify columns when specifying multiple table
areas, make sure that the length of both lists are equal. areas, make sure that the length of both lists are equal.

View File

@ -8,7 +8,6 @@ API Reference
Main Interface Main Interface
-------------- --------------
.. autofunction:: camelot.read_pdf .. autofunction:: camelot.read_pdf
.. autofunction:: camelot.plot_geometry
Lower-Level Classes Lower-Level Classes
------------------- -------------------

View File

@ -33,7 +33,7 @@ Release v\ |version|. (:ref:`Installation <install>`)
:: ::
>>> import camelot >>> import camelot
>>> tables = camelot.read_pdf('foo.pdf', mesh=True) >>> tables = camelot.read_pdf('foo.pdf')
>>> tables >>> tables
<TableList tables=1> <TableList tables=1>
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html >>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html

View File

@ -3,7 +3,7 @@
Advanced Usage Advanced Usage
============== ==============
This page covers some of the more advanced configurations for :ref:`Stream <stream>` and :ref:`Lattice <lattice>`. This page covers some of the more advanced configurations for :ref:`Lattice <lattice>` and :ref:`Stream <stream>`.
Process background lines Process background lines
------------------------ ------------------------
@ -21,7 +21,7 @@ To process background lines, you can pass ``process_background=True``.
:: ::
>>> tables = camelot.read_pdf('background_lines.pdf', mesh=True, process_background=True) >>> tables = camelot.read_pdf('background_lines.pdf', process_background=True)
>>> tables[1].df >>> tables[1].df
.. csv-table:: .. csv-table::
@ -30,9 +30,9 @@ To process background lines, you can pass ``process_background=True``.
Plot geometry Plot geometry
------------- -------------
You can use the :meth:`plot_geometry() <camelot.plot_geometry>` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. You can use a :class:`table <camelot.core.Table>` object's :meth:`plot() <camelot.core.Table.plot>` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
The following geometries are available for plotting. You can pass them to the :meth:`plot_geometry() <camelot.plot_geometry>` method with the ``geometry_type`` keyword argument, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot. The following geometries are available for plotting. You can pass them to the :meth:`plot() <camelot.core.Table.plot>` method, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot for the passed geometry.
- 'text' - 'text'
- 'table' - 'table'
@ -40,22 +40,26 @@ The following geometries are available for plotting. You can pass them to the :m
- 'line' - 'line'
- 'joint' - 'joint'
.. note:: The last three geometries can only be used with :ref:`Lattice <lattice>`, i.e. when ``mesh=True``. .. note:: The last three geometries can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``.
Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example. Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
.. warning:: By default, :meth:`plot_geometry() <camelot.plot_geometry>` will use the first page of the PDF. Since this method is useful only for debugging, it makes sense to use it for one page at a time. If you pass a page range to this method, multiple plots will be generated one by one, a new one popping up as you close the previous one. To abort, you can use ``Ctrl + C``. ::
>>> tables = camelot.read_pdf('foo.pdf')
>>> tables
<TableList n=1>
.. _geometry_text: .. _geometry_text:
text text
^^^^ ^^^^
Passing ``geometry_type=text`` creates a plot for all the text present on a PDF page. Let's plot all the text present on the table's PDF page.
:: ::
>>> camelot.plot_geometry('foo.pdf', geometry_type='text') >>> tables[0].plot('text')
.. figure:: ../_static/png/geometry_text.png .. figure:: ../_static/png/geometry_text.png
:height: 674 :height: 674
@ -64,20 +68,20 @@ Passing ``geometry_type=text`` creates a plot for all the text present on a PDF
:alt: A plot of all text on a PDF page :alt: A plot of all text on a PDF page
:align: left :align: left
This, as we shall later see, is very helpful with :ref:`Stream <stream>`, for noting table areas and column separators, in case Stream cannot guess them correctly. This, as we shall later see, is very helpful with :ref:`Stream <stream>`, for noting table areas and column separators, in case Stream does not guess them correctly.
.. note:: As you can see in the image above, the *x-y* coordinates change as you move your mouse cursor, which can help you note coordinates. .. note:: The *x-y* coordinates shown above change as you move your mouse cursor on the image, which can help you note coordinates.
.. _geometry_table: .. _geometry_table:
table table
^^^^^ ^^^^^
Passing ``geometry_type=table`` creates a plot for tables detected on a PDF page. This geometry, along with contour, line and joint is useful for debugging and improving the parsing output, as we shall see later. Let's plot the table (to see if it was detected correctly or not). This geometry type, along with contour, line and joint is useful for debugging and improving the parsing output, in case the table wasn't detected correctly. More on that later.
:: ::
>>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='table') >>> tables[0].plot('table')
.. figure:: ../_static/png/geometry_table.png .. figure:: ../_static/png/geometry_table.png
:height: 674 :height: 674
@ -86,16 +90,18 @@ Passing ``geometry_type=table`` creates a plot for tables detected on a PDF page
:alt: A plot of all tables on a PDF page :alt: A plot of all tables on a PDF page
:align: left :align: left
The table is perfect!
.. _geometry_contour: .. _geometry_contour:
contour contour
^^^^^^^ ^^^^^^^
Passing ``geometry_type=contour`` creates a plot for table boundaries detected on a PDF page. Now, let's plot all table boundaries present on the table's PDF page.
:: ::
>>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='contour') >>> tables[0].plot('contour')
.. figure:: ../_static/png/geometry_contour.png .. figure:: ../_static/png/geometry_contour.png
:height: 674 :height: 674
@ -109,11 +115,11 @@ Passing ``geometry_type=contour`` creates a plot for table boundaries detected o
line line
^^^^ ^^^^
Passing ``geometry_type=line`` creates a plot for lines detected on a PDF page. Cool, let's plot all line segments present on the table's PDF page.
:: ::
>>> camelot.plot_geometry('foo.pdf', geometry_type='line') >>> tables[0].plot('line')
.. figure:: ../_static/png/geometry_line.png .. figure:: ../_static/png/geometry_line.png
:height: 674 :height: 674
@ -127,11 +133,11 @@ Passing ``geometry_type=line`` creates a plot for lines detected on a PDF page.
joint joint
^^^^^ ^^^^^
Passing ``geometry_type=joint`` creates a plot for line intersections detected on a PDF page. Finally, let's plot all line intersections present on the table's PDF page.
:: ::
>>> camelot.plot_geometry('foo.pdf', mesh=True, geometry_type='joint') >>> tables[0].plot('joint')
.. figure:: ../_static/png/geometry_joint.png .. figure:: ../_static/png/geometry_joint.png
:height: 674 :height: 674
@ -143,7 +149,7 @@ Passing ``geometry_type=joint`` creates a plot for line intersections detected o
Specify table areas Specify table areas
------------------- -------------------
Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as this `PDF <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the left-top and right-bottom coordinates of the table. Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the left-top and right-bottom coordinates of the table.
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument. Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
@ -151,7 +157,7 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
:: ::
>>> tables = camelot.read_pdf('table_areas.pdf', table_areas=['316,499,566,337']) >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -160,19 +166,19 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
Specify column separators Specify column separators
------------------------- -------------------------
In cases like this `PDF <../_static/pdf/column_separators.pdf>`__, where the text is very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by :ref:`plotting the text <geometry_text>` on the page. In cases like `these <../_static/pdf/column_separators.pdf>`__, where the text is very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by :ref:`plotting the text <geometry_text>` on the page.
You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``columns`` keyword argument. You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``columns`` keyword argument.
In case you passed a single column separators string list, and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and there is a need to specify column separators as well, **the length of both lists should be equal**. Each table area will be mapped to each column separators' string using their indices. In case you passed a single column separators string list, and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and there is a need to specify column separators as well, **the length of both lists should be equal**. Each table area will be mapped to each column separators' string using their indices.
If you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table (since you can see by looking at the table that Camelot will be able to get it perfectly!), you can pass an empty string for the second table in the column separators' list, like this, ``columns=['10,120,200,400', '']``. For example, if you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list, like this, ``columns=['10,120,200,400', '']``.
Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out! Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!
:: ::
>>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683']) >>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683'])
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -182,7 +188,7 @@ Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_
"NUMBER TYPE DBA NAME","","","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES" "NUMBER TYPE DBA NAME","","","LICENSEE NAME","ADDRESS","CITY","ST","ZIP","PHONE NUMBER","EXPIRES"
"...","...","...","...","...","...","...","...","...","..." "...","...","...","...","...","...","...","...","...","..."
Ah! Since `PDFMiner <https://euske.github.io/pdfminer/>`_ merged the strings, "NUMBER", "TYPE" and "DBA NAME", all of them were assigned to the same cell. Let's see how we can fix this in the next section. Ah! Since `PDFMiner <https://euske.github.io/pdfminer/>`_ merged the strings, "NUMBER", "TYPE" and "DBA NAME", all of them were assigned to the same cell. Let's see how we can fix this in the next section.
Split text along separators Split text along separators
--------------------------- ---------------------------
@ -191,7 +197,7 @@ To deal with cases like the output from the previous section, you can pass ``spl
:: ::
>>> tables = camelot.read_pdf('column_separators.pdf', columns=['72,95,209,327,442,529,566,606,683'], split_text=True) >>> tables = camelot.read_pdf('column_separators.pdf', flavor='stream', columns=['72,95,209,327,442,529,566,606,683'], split_text=True)
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -204,13 +210,13 @@ To deal with cases like the output from the previous section, you can pass ``spl
Flag superscripts and subscripts Flag superscripts and subscripts
-------------------------------- --------------------------------
There might be cases where you want to differentiate between the text and superscripts and subscripts, like this `PDF <../_static/pdf/superscript.pdf>`_. There might be cases where you want to differentiate between the text, and superscripts or subscripts, like this `PDF <../_static/pdf/superscript.pdf>`_.
.. figure:: ../_static/png/superscript.png .. figure:: ../_static/png/superscript.png
:alt: A PDF with superscripts :alt: A PDF with superscripts
:align: left :align: left
In this case, the text that `other tools`_ return, will be ``24.912``. This is harmless as long as there is that decimal point involved. When it isn't there, you'll be left wondering why the results of your data analysis were 10x bigger! In this case, the text that `other tools`_ return, will be ``24.912``. This is harmless as long as there is that decimal point involved. But when it isn't there, you'll be left wondering why the results of your data analysis were 10x bigger!
You can solve this by passing ``flag_size=True``, which will enclose the superscripts and subscripts with ``<s></s>``, based on font size, as shown below. You can solve this by passing ``flag_size=True``, which will enclose the superscripts and subscripts with ``<s></s>``, based on font size, as shown below.
@ -218,7 +224,7 @@ You can solve this by passing ``flag_size=True``, which will enclose the supersc
:: ::
>>> tables = camelot.read_pdf('superscript.pdf', flag_size=True) >>> tables = camelot.read_pdf('superscript.pdf', flavor='stream', flag_size=True)
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -236,7 +242,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show
:: ::
>>> tables = camelot.read_pdf('group_rows.pdf') >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream')
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -250,7 +256,7 @@ You can pass ``row_close_tol=<+int>`` to group the rows closer together, as show
:: ::
>>> tables = camelot.read_pdf('group_rows.pdf', row_close_tol=10) >>> tables = camelot.read_pdf('group_rows.pdf', flavor='stream', row_close_tol=10)
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -266,11 +272,11 @@ Detect short lines
There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15. There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
As you can already guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected. As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines. .. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
Here's one `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15. Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.
.. figure:: ../_static/png/short_lines.png .. figure:: ../_static/png/short_lines.png
:alt: A PDF table with short lines :alt: A PDF table with short lines
@ -280,7 +286,8 @@ Let's :ref:`plot the table <geometry_table>` for this PDF.
:: ::
>>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table') >>> tables = camelot.read_pdf('short_lines.pdf')
>>> tables[0].plot('table')
.. figure:: ../_static/png/short_lines_1.png .. figure:: ../_static/png/short_lines_1.png
:alt: A plot of the PDF table with short lines :alt: A plot of the PDF table with short lines
@ -290,17 +297,17 @@ Clearly, the smaller lines separating the headers, couldn't be detected. Let's t
:: ::
>>> camelot.plot_geometry('short_lines.pdf', mesh=True, geometry_type='table', line_size_scaling=40) >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
>>> tables[0].plot('table')
.. figure:: ../_static/png/short_lines_2.png .. figure:: ../_static/png/short_lines_2.png
:alt: An improved plot of the PDF table with short lines :alt: An improved plot of the PDF table with short lines
:align: left :align: left
Voila! Camelot can now see those lines. Let's use this value in :meth:`read_pdf() <camelot.read_pdf>` and get our table. Voila! Camelot can now see those lines. Let's get our table.
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40)
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -332,7 +339,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=['']) >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -353,7 +360,7 @@ No surprises there, it did remain in place (observe the strings "2400" and "All
:: ::
>>> tables = camelot.read_pdf('short_lines.pdf', mesh=True, line_size_scaling=40, shift_text=['r', 'b']) >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -381,7 +388,7 @@ Let's try it out on this `PDF <../_static/pdf/copy_text.pdf>`__. First, let's ch
:: ::
>>> tables = camelot.read_pdf('copy_text.pdf', mesh=True) >>> tables = camelot.read_pdf('copy_text.pdf')
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::
@ -398,7 +405,7 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in
:: ::
>>> tables = camelot.read_pdf('copy_text.pdf', mesh=True, copy_text=['v']) >>> tables = camelot.read_pdf('copy_text.pdf', copy_text=['v'])
>>> tables[0].df >>> tables[0].df
.. csv-table:: .. csv-table::

View File

@ -5,25 +5,21 @@ Command-line interface
Camelot comes with a command-line interface. Camelot comes with a command-line interface.
You can print the help for the interface, by typing ``camelot --help`` in your favorite terminal program, as shown below. You can print the help for the interface, by typing ``camelot --help`` in your favorite terminal program, as shown below. Furthermore, you can print the help for each command, by typing ``camelot <command> --help``, try it out!
:: ::
$ camelot --help $ camelot --help
Usage: camelot [OPTIONS] FILEPATH Usage: camelot [OPTIONS] COMMAND [ARGS]...
Options: Options:
--version Show the version and exit.
-p, --pages TEXT Comma-separated page numbers to parse. -p, --pages TEXT Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end Example: 1,3,4 or 1,4-end
-o, --output TEXT Output filepath. -o, --output TEXT Output filepath.
-f, --format [csv|json|excel|html] -f, --format [csv|json|excel|html]
Output file format. Output file format.
-z, --zip Whether or not to create a ZIP archive. -z, --zip Whether or not to create a ZIP archive.
-m, --mesh Whether or not to use Lattice method of
parsing. Stream is used by default.
-T, --table_area TEXT Table areas (x1,y1,x2,y2) to process.
x1, y1
-> left-top and x2, y2 -> right-bottom
-split, --split_text Whether or not to split text if it spans -split, --split_text Whether or not to split text if it spans
across multiple cells. across multiple cells.
-flag, --flag_size (inactive) Whether or not to flag text which -flag, --flag_size (inactive) Whether or not to flag text which
@ -32,47 +28,8 @@ You can print the help for the interface, by typing ``camelot --help`` in your f
-M, --margins <FLOAT FLOAT FLOAT>... -M, --margins <FLOAT FLOAT FLOAT>...
char_margin, line_margin, word_margin for char_margin, line_margin, word_margin for
PDFMiner. PDFMiner.
-C, --columns TEXT x-coordinates of column separators.
-r, --row_close_tol INTEGER Rows will be formed by combining text
vertically within this tolerance.
-c, --col_close_tol INTEGER Columns will be formed by combining text
horizontally within this tolerance.
-back, --process_background (with --mesh) Whether or not to process
lines that are in background.
-scale, --line_size_scaling INTEGER
(with --mesh) Factor by which the page
dimensions will be divided to get smallest
length of detected lines.
-copy, --copy_text [h|v] (with --mesh) Specify direction in which
text will be copied over in a spanning cell.
-shift, --shift_text [|l|r|t|b] (with --mesh) Specify direction in which
text in a spanning cell should flow.
-l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to
merge close vertical lines and close
horizontal lines.
-j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to
decide whether the detected lines and points
lie close to each other.
-block, --threshold_blocksize INTEGER
(with --mesh) For adaptive thresholding,
size of a pixel neighborhood that is used to
calculate a threshold value for the pixel:
3, 5, 7, and so on.
-const, --threshold_constant INTEGER
(with --mesh) For adaptive thresholding,
constant subtracted from the mean or
weighted mean.
Normally, it is positive but
may be zero or negative as well.
-I, --iterations INTEGER (with --mesh) Number of times for
erosion/dilation is applied.
-G, --geometry_type [text|table|contour|joint|line]
Plot geometry found on pdf page for
debugging.
text: Plot text objects. (Useful to get
table_area and columns coordinates)
table: Plot parsed table.
contour (with --mesh): Plot detected rectangles.
joint (with --mesh): Plot detected line intersections.
line (with --mesh): Plot detected lines.
--help Show this message and exit. --help Show this message and exit.
Commands:
lattice Use lines between text to parse table.
stream Use spaces between text to parse table.

View File

@ -20,7 +20,7 @@ It is built on top of PDFMiner's functionality of grouping characters on a page
.. _margins: https://euske.github.io/pdfminer/#tools .. _margins: https://euske.github.io/pdfminer/#tools
.. note:: By default, Stream treats the whole PDF page as a table. Automatic table detection for Stream is `in the works`_. .. note:: By default, Stream treats the whole PDF page as a table, which isn't ideal when there are more than two tables on a page with different number of columns. Automatic table detection for Stream is `in the works`_.
.. _in the works: https://github.com/socialcopsdev/camelot/issues/102 .. _in the works: https://github.com/socialcopsdev/camelot/issues/102
@ -29,13 +29,13 @@ It is built on top of PDFMiner's functionality of grouping characters on a page
Lattice Lattice
------- -------
Lattice is more deterministic in nature, and does not rely on guesses. It can be used to parse tables that have demarcated lines between cells. Lattice is more deterministic in nature, and does not rely on guesses. It can be used to parse tables that have demarcated lines between cells, and can automatically parse multiple tables present on a page.
It starts by converting the PDF page to an image using ghostscript and then processing it to get horizontal and vertical line segments by applying a set of morphological transformations (erosion and dilation) using OpenCV. It starts by converting the PDF page to an image using ghostscript and then processing it to get horizontal and vertical line segments by applying a set of morphological transformations (erosion and dilation) using OpenCV.
Let's see how Lattice processes the `second page of this PDF`_, step-by-step. Let's see how Lattice processes the second page of `this PDF`_, step-by-step.
.. _second page of this PDF: ../_static/pdf/us-030.pdf .. _this PDF: ../_static/pdf/us-030.pdf
1. Line segments are detected. 1. Line segments are detected.

View File

@ -8,16 +8,20 @@ This part of the documentation covers the installation of Camelot. First, you'll
.. _tk: https://packages.ubuntu.com/trusty/python-tk .. _tk: https://packages.ubuntu.com/trusty/python-tk
.. _ghostscript: https://www.ghostscript.com/ .. _ghostscript: https://www.ghostscript.com/
These can be installed using your system's package manager. If you use Ubuntu, run the following: These can be installed using your system's package manager. You can run the following based on your OS.
::
$ sudo apt install python-tk ghostscript For Ubuntu::
$ apt install python-tk ghostscript
For macOS::
$ brew install tcl-tk ghostscript
$ pip install camelot-py $ pip install camelot-py
------------------------ ------------------------
After installing the dependencies, you can simply use pip to install Camelot: After installing the dependencies, you can simply use pip to install Camelot::
::
$ pip install camelot-py $ pip install camelot-py

View File

@ -6,9 +6,9 @@ Introduction
The Camelot Project The Camelot Project
------------------- -------------------
The Portable Document Format (PDF) was born out of `The Camelot Project`_ when a need was felt for "a universal way to communicate documents across a wide variety of machine configurations, operating systems and communication networks". The goal was to make these documents viewable on any display and printable on any modern printers. The invention of the `PostScript`_ page description language, which enabled the creation of fixed-layout flat documents (with text, fonts, graphics, images encapsulated), solved the problem. The Portable Document Format (PDF) was born out of `The Camelot Project`_ when a need was felt for "a universal way to communicate documents across a wide variety of machine configurations, operating systems and communication networks". The goal was to make these documents viewable on any display and printable on any modern printers. The invention of the `PostScript`_ page description language, which enabled the creation of *fixed-layout* flat documents (with text, fonts, graphics, images encapsulated), solved the problem.
At a very high level, PostScript defines instructions, such as, "place this character at this x,y coordinate on a plane". Spaces can be *simulated* by placing characters relatively far apart. Similarly, tables can be *simulated* by placing characters (and words) in two-dimensional grids. A PDF viewer just takes these instructions and draws everything for the user to view. Since it's just characters on a plane, there is no table data structure which can be directly extracted and used for analysis! At a very high level, PostScript defines instructions, such as, "place this character at this x,y coordinate on a plane". Spaces can be *simulated* by placing characters relatively far apart. Extending from that, tables can be *simulated* by placing characters (which constitute words) in two-dimensional grids. A PDF viewer just takes these instructions and draws everything for the user to view. Since it's just characters on a plane, there is no table data structure which can be extracted and used for analysis!
Sadly, a lot of open data is given out as tables which are trapped inside PDF files. Sadly, a lot of open data is given out as tables which are trapped inside PDF files.
@ -17,13 +17,14 @@ Sadly, a lot of open data is given out as tables which are trapped inside PDF fi
Why another PDF Table Parsing library? Why another PDF Table Parsing library?
-------------------------------------- --------------------------------------
There are both open (`Tabula`_) and closed-source (`PDFTables`_, `smallpdf`_) tools that are used widely to extract tables from PDF files. They either give a nice output, or fail miserably. There is no in-between. This does not help most users, since everything in the real world, including PDF table extraction, is fuzzy. This leads to the creation of ad hoc table extraction scripts for each different type of PDF that the user wants to parse. There are both open (`Tabula`_, `pdf-table-extract`_) and closed-source (`smallpdf`_, `PDFTables`_) tools that are widely used to extract tables from PDF files. They either give a nice output, or fail miserably. There is no in-between. This is not helpful, since everything in the real world, including PDF table extraction, is fuzzy, leading to the creation of ad hoc table extraction scripts for each different type of PDF that the user wants to parse.
Camelot was created with the goal of offering its users complete control over table extraction. If the users are not able to get the desired output with the default configuration, they should be able to tweak the parameters and get the tables out! Camelot was created with the goal of offering its users complete control over table extraction. If the users are not able to get the desired output with the default configuration, they should be able to tweak it and get the job done!
Here is a `comparison`_ of Camelot's output with outputs from other PDF parsing libraries and tools. Here is a `comparison`_ of Camelot's output with outputs from other open-source PDF parsing libraries and tools.
.. _Tabula: http://tabula.technology/ .. _Tabula: http://tabula.technology/
.. _pdf-table-extract: https://github.com/ashima/pdf-table-extract
.. _PDFTables: https://pdftables.com/ .. _PDFTables: https://pdftables.com/
.. _Smallpdf: https://smallpdf.com .. _Smallpdf: https://smallpdf.com
.. _comparison: https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools .. _comparison: https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools
@ -31,7 +32,7 @@ Here is a `comparison`_ of Camelot's output with outputs from other PDF parsing
What's in a name? What's in a name?
----------------- -----------------
As you can already guess, this library is named after `The Camelot Project`_. The image on the left is taken from `Monty Python and the Holy Grail`_. In the movie, it is the castle "Camelot" where Arthur leads his men, the Knights of the Round Table, and then sets off elsewhere after deciding that it is "a silly place". Interestingly, the language in which this library is written was named after Monty Python. As you can already guess, this library is named after `The Camelot Project`_. Fun fact, "Camelot" is the name of the castle in `Monty Python and the Holy Grail`_, where Arthur leads his men, the Knights of the Round Table, and then sets off elsewhere after deciding that it is "a silly place". Interestingly, the language in which this library is written (Python) was named after Monty Python.
.. _The Camelot Project: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf .. _The Camelot Project: http://www.planetpdf.com/planetpdf/pdfs/warnock_camelot.pdf
.. _Monty Python and the Holy Grail: https://en.wikipedia.org/wiki/Monty_Python_and_the_Holy_Grail .. _Monty Python and the Holy Grail: https://en.wikipedia.org/wiki/Monty_Python_and_the_Holy_Grail

View File

@ -16,13 +16,13 @@ Begin by importing the Camelot module::
Now, let's try to read a PDF. You can check out the PDF used in this example, `here`_. Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here. To do that we will set the ``mesh`` keyword argument to ``True``. Now, let's try to read a PDF. You can check out the PDF used in this example, `here`_. Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here. To do that we will set the ``mesh`` keyword argument to ``True``.
.. note:: :ref:`Stream <stream>` is used by default. .. note:: :ref:`Lattice <lattice>` is used by default. You can use :ref:`Stream <stream>` with ``flavor='stream'``.
.. _here: ../_static/pdf/foo.pdf .. _here: ../_static/pdf/foo.pdf
:: ::
>>> tables = camelot.read_pdf('foo.pdf', mesh=True) >>> tables = camelot.read_pdf('foo.pdf')
>>> tables >>> tables
<TableList n=1> <TableList n=1>
@ -47,7 +47,7 @@ Let's print the parsing report.
'page': 1 'page': 1
} }
Whoa! The accuracy is top-notch and there is little whitespace, which means the table was (most probably) parsed correctly. You can access the table as a pandas DataFrame by using the :class:`table <camelot.core.Table> object's` ``df`` property. Whoa! The accuracy is top-notch and there is little whitespace, which means the table was (most probably) parsed correctly. You can access the table as a pandas DataFrame by using the :class:`table <camelot.core.Table>` object's ``df`` property.
:: ::
@ -64,7 +64,7 @@ Looks good! You can be export the table as a CSV file using its :meth:`to_csv()
This will export the table as a CSV file at the path specified. In this case, it is ``foo.csv`` in the current directory. This will export the table as a CSV file at the path specified. In this case, it is ``foo.csv`` in the current directory.
You can also export all tables at once, using the ``tables`` object's :meth:`export() <camelot.core.TableList.export>` method. You can also export all tables at once, using the :class:`tables <camelot.core.TableList>` object's :meth:`export() <camelot.core.TableList.export>` method.
:: ::
@ -72,11 +72,11 @@ You can also export all tables at once, using the ``tables`` object's :meth:`exp
This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``. This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``.
.. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP archive at your path with all the exported files. .. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files.
.. note:: Camelot handles rotated PDF pages automatically. As an exercise, try to extract the table out of `this PDF file`_. .. note:: Camelot handles rotated PDF pages automatically. As an exercise, try to extract the table out of `this PDF`_.
.. _this PDF file: ../_static/pdf/rotated.pdf .. _this PDF: ../_static/pdf/rotated.pdf
Specify page numbers Specify page numbers
-------------------- --------------------