[MRG] Make matplotlib optional (#190)
* Rename png files
* Convert plot to PlotMethods class and update docs
* Update test
* Update setup.py and docs
* Refactor PlotMethods
* Make matplotlib optional
* Raise ImportError in cli

pull/2/head
|
|
@ -72,7 +72,7 @@ $ conda install -c conda-forge camelot-py
|
||||||
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
|
After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
$ pip install camelot-py[all]
|
$ pip install camelot-py[cv]
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
### From the source code
|
### From the source code
|
||||||
|
|
@ -87,7 +87,7 @@ and install Camelot using pip:
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
$ cd camelot
|
$ cd camelot
|
||||||
$ pip install ".[all]"
|
$ pip install ".[cv]"
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
## Documentation
|
## Documentation
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@ from click import HelpFormatter
|
||||||
|
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
from .plotting import plot
|
from .plotting import PlotMethods
|
||||||
|
|
||||||
|
|
||||||
def _write_usage(self, prog, args='', prefix='Usage: '):
|
def _write_usage(self, prog, args='', prefix='Usage: '):
|
||||||
|
|
@ -26,3 +26,6 @@ handler = logging.StreamHandler()
|
||||||
handler.setFormatter(formatter)
|
handler.setFormatter(formatter)
|
||||||
|
|
||||||
logger.addHandler(handler)
|
logger.addHandler(handler)
|
||||||
|
|
||||||
|
# instantiate plot method
|
||||||
|
plot = PlotMethods()
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,14 @@
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import matplotlib.pyplot as plt
|
try:
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
except ImportError:
|
||||||
|
_HAS_MPL = False
|
||||||
|
else:
|
||||||
|
_HAS_MPL = True
|
||||||
|
|
||||||
from . import __version__
|
from . import __version__, read_pdf, plot
|
||||||
from .io import read_pdf
|
|
||||||
from .plotting import plot
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('camelot')
|
logger = logging.getLogger('camelot')
|
||||||
|
|
@ -82,7 +85,7 @@ def cli(ctx, *args, **kwargs):
|
||||||
@click.option('-I', '--iterations', default=0,
|
@click.option('-I', '--iterations', default=0,
|
||||||
help='Number of times for erosion/dilation will be applied.')
|
help='Number of times for erosion/dilation will be applied.')
|
||||||
@click.option('-plot', '--plot_type',
|
@click.option('-plot', '--plot_type',
|
||||||
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
|
type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
|
||||||
help='Plot elements found on PDF page for visual debugging.')
|
help='Plot elements found on PDF page for visual debugging.')
|
||||||
@click.argument('filepath', type=click.Path(exists=True))
|
@click.argument('filepath', type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
|
|
@ -104,18 +107,23 @@ def lattice(c, *args, **kwargs):
|
||||||
kwargs['copy_text'] = None if not copy_text else copy_text
|
kwargs['copy_text'] = None if not copy_text else copy_text
|
||||||
kwargs['shift_text'] = list(kwargs['shift_text'])
|
kwargs['shift_text'] = list(kwargs['shift_text'])
|
||||||
|
|
||||||
tables = read_pdf(filepath, pages=pages, flavor='lattice',
|
|
||||||
suppress_warnings=suppress_warnings, **kwargs)
|
|
||||||
click.echo('Found {} tables'.format(tables.n))
|
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
for table in tables:
|
if not _HAS_MPL:
|
||||||
plot(table, plot_type=plot_type)
|
raise ImportError('matplotlib is required for plotting.')
|
||||||
plt.show()
|
|
||||||
else:
|
else:
|
||||||
if output is None:
|
if output is None:
|
||||||
raise click.UsageError('Please specify output file path using --output')
|
raise click.UsageError('Please specify output file path using --output')
|
||||||
if f is None:
|
if f is None:
|
||||||
raise click.UsageError('Please specify output file format using --format')
|
raise click.UsageError('Please specify output file format using --format')
|
||||||
|
|
||||||
|
tables = read_pdf(filepath, pages=pages, flavor='lattice',
|
||||||
|
suppress_warnings=suppress_warnings, **kwargs)
|
||||||
|
click.echo('Found {} tables'.format(tables.n))
|
||||||
|
if plot_type is not None:
|
||||||
|
for table in tables:
|
||||||
|
plot(table, kind=plot_type)
|
||||||
|
plt.show()
|
||||||
|
else:
|
||||||
tables.export(output, f=f, compress=compress)
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -130,7 +138,7 @@ def lattice(c, *args, **kwargs):
|
||||||
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
||||||
' used to combine text horizontally, to generate columns.')
|
' used to combine text horizontally, to generate columns.')
|
||||||
@click.option('-plot', '--plot_type',
|
@click.option('-plot', '--plot_type',
|
||||||
type=click.Choice(['text', 'table']),
|
type=click.Choice(['text', 'grid']),
|
||||||
help='Plot elements found on PDF page for visual debugging.')
|
help='Plot elements found on PDF page for visual debugging.')
|
||||||
@click.argument('filepath', type=click.Path(exists=True))
|
@click.argument('filepath', type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
|
|
@ -151,16 +159,21 @@ def stream(c, *args, **kwargs):
|
||||||
columns = list(kwargs['columns'])
|
columns = list(kwargs['columns'])
|
||||||
kwargs['columns'] = None if not columns else columns
|
kwargs['columns'] = None if not columns else columns
|
||||||
|
|
||||||
tables = read_pdf(filepath, pages=pages, flavor='stream',
|
|
||||||
suppress_warnings=suppress_warnings, **kwargs)
|
|
||||||
click.echo('Found {} tables'.format(tables.n))
|
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
for table in tables:
|
if not _HAS_MPL:
|
||||||
plot(table, plot_type=plot_type)
|
raise ImportError('matplotlib is required for plotting.')
|
||||||
plt.show()
|
|
||||||
else:
|
else:
|
||||||
if output is None:
|
if output is None:
|
||||||
raise click.UsageError('Please specify output file path using --output')
|
raise click.UsageError('Please specify output file path using --output')
|
||||||
if f is None:
|
if f is None:
|
||||||
raise click.UsageError('Please specify output file format using --format')
|
raise click.UsageError('Please specify output file format using --format')
|
||||||
|
|
||||||
|
tables = read_pdf(filepath, pages=pages, flavor='stream',
|
||||||
|
suppress_warnings=suppress_warnings, **kwargs)
|
||||||
|
click.echo('Found {} tables'.format(tables.n))
|
||||||
|
if plot_type is not None:
|
||||||
|
for table in tables:
|
||||||
|
plot(table, kind=plot_type)
|
||||||
|
plt.show()
|
||||||
|
else:
|
||||||
tables.export(output, f=f, compress=compress)
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
|
||||||
|
|
@ -1,185 +1,179 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
try:
|
||||||
import matplotlib.patches as patches
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.patches as patches
|
||||||
|
except ImportError:
|
||||||
|
_HAS_MPL = False
|
||||||
|
else:
|
||||||
|
_HAS_MPL = True
|
||||||
|
|
||||||
|
|
||||||
def plot(table, plot_type='text', filepath=None):
|
class PlotMethods(object):
|
||||||
"""Plot elements found on PDF page based on plot_type
|
def __call__(self, table, kind='text', filename=None):
|
||||||
specified, useful for debugging and playing with different
|
"""Plot elements found on PDF page based on kind
|
||||||
parameters to get the best output.
|
specified, useful for debugging and playing with different
|
||||||
|
parameters to get the best output.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table: Table
|
table: camelot.core.Table
|
||||||
A Camelot Table.
|
A Camelot Table.
|
||||||
plot_type : str, optional (default: 'text')
|
kind : str, optional (default: 'text')
|
||||||
{'text', 'table', 'contour', 'joint', 'line'}
|
{'text', 'grid', 'contour', 'joint', 'line'}
|
||||||
The element type for which a plot should be generated.
|
The element type for which a plot should be generated.
|
||||||
filepath: str, optional (default: None)
|
filename: str, optional (default: None)
|
||||||
Absolute path for saving the generated plot.
|
Absolute path for saving the generated plot.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.figure.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if table.flavor == 'stream' and plot_type in ['contour', 'joint', 'line']:
|
if not _HAS_MPL:
|
||||||
raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
|
raise ImportError('matplotlib is required for plotting.')
|
||||||
plot_type))
|
|
||||||
if plot_type == 'text':
|
|
||||||
fig = plot_text(table._text)
|
|
||||||
elif plot_type == 'table':
|
|
||||||
fig = plot_table(table)
|
|
||||||
elif plot_type == 'contour':
|
|
||||||
fig = plot_contour(table._image)
|
|
||||||
elif plot_type == 'joint':
|
|
||||||
fig = plot_joint(table._image)
|
|
||||||
elif plot_type == 'line':
|
|
||||||
fig = plot_line(table._segments)
|
|
||||||
if filepath:
|
|
||||||
plt.savefig(filepath)
|
|
||||||
return fig
|
|
||||||
|
|
||||||
|
if table.flavor == 'stream' and kind in ['contour', 'joint', 'line']:
|
||||||
|
raise NotImplementedError("Stream flavor does not support kind='{}'".format(
|
||||||
|
kind))
|
||||||
|
|
||||||
def plot_text(text):
|
plot_method = getattr(self, kind)
|
||||||
"""Generates a plot for all text elements present
|
return plot_method(table)
|
||||||
on the PDF page.
|
|
||||||
|
|
||||||
Parameters
|
def text(self, table):
|
||||||
----------
|
"""Generates a plot for all text elements present
|
||||||
text : list
|
on the PDF page.
|
||||||
|
|
||||||
Returns
|
Parameters
|
||||||
-------
|
----------
|
||||||
fig : matplotlib.fig.Figure
|
table : camelot.core.Table
|
||||||
|
|
||||||
"""
|
Returns
|
||||||
fig = plt.figure()
|
-------
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
fig : matplotlib.figure.Figure
|
||||||
xs, ys = [], []
|
|
||||||
for t in text:
|
"""
|
||||||
xs.extend([t[0], t[2]])
|
fig = plt.figure()
|
||||||
ys.extend([t[1], t[3]])
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
ax.add_patch(
|
xs, ys = [], []
|
||||||
patches.Rectangle(
|
for t in table._text:
|
||||||
(t[0], t[1]),
|
xs.extend([t[0], t[2]])
|
||||||
t[2] - t[0],
|
ys.extend([t[1], t[3]])
|
||||||
t[3] - t[1]
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
|
t[2] - t[0],
|
||||||
|
t[3] - t[1]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
return fig
|
||||||
return fig
|
|
||||||
|
|
||||||
|
def grid(self, table):
|
||||||
|
"""Generates a plot for the detected table grids
|
||||||
|
on the PDF page.
|
||||||
|
|
||||||
def plot_table(table):
|
Parameters
|
||||||
"""Generates a plot for the detected tables
|
----------
|
||||||
on the PDF page.
|
table : camelot.core.Table
|
||||||
|
|
||||||
Parameters
|
Returns
|
||||||
----------
|
-------
|
||||||
table : camelot.core.Table
|
fig : matplotlib.figure.Figure
|
||||||
|
|
||||||
Returns
|
"""
|
||||||
-------
|
fig = plt.figure()
|
||||||
fig : matplotlib.fig.Figure
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
for row in table.cells:
|
||||||
|
for cell in row:
|
||||||
|
if cell.left:
|
||||||
|
ax.plot([cell.lb[0], cell.lt[0]],
|
||||||
|
[cell.lb[1], cell.lt[1]])
|
||||||
|
if cell.right:
|
||||||
|
ax.plot([cell.rb[0], cell.rt[0]],
|
||||||
|
[cell.rb[1], cell.rt[1]])
|
||||||
|
if cell.top:
|
||||||
|
ax.plot([cell.lt[0], cell.rt[0]],
|
||||||
|
[cell.lt[1], cell.rt[1]])
|
||||||
|
if cell.bottom:
|
||||||
|
ax.plot([cell.lb[0], cell.rb[0]],
|
||||||
|
[cell.lb[1], cell.rb[1]])
|
||||||
|
return fig
|
||||||
|
|
||||||
"""
|
def contour(self, table):
|
||||||
fig = plt.figure()
|
"""Generates a plot for all table boundaries present
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
on the PDF page.
|
||||||
for row in table.cells:
|
|
||||||
for cell in row:
|
|
||||||
if cell.left:
|
|
||||||
ax.plot([cell.lb[0], cell.lt[0]],
|
|
||||||
[cell.lb[1], cell.lt[1]])
|
|
||||||
if cell.right:
|
|
||||||
ax.plot([cell.rb[0], cell.rt[0]],
|
|
||||||
[cell.rb[1], cell.rt[1]])
|
|
||||||
if cell.top:
|
|
||||||
ax.plot([cell.lt[0], cell.rt[0]],
|
|
||||||
[cell.lt[1], cell.rt[1]])
|
|
||||||
if cell.bottom:
|
|
||||||
ax.plot([cell.lb[0], cell.rb[0]],
|
|
||||||
[cell.lb[1], cell.rb[1]])
|
|
||||||
return fig
|
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table : camelot.core.Table
|
||||||
|
|
||||||
def plot_contour(image):
|
Returns
|
||||||
"""Generates a plot for all table boundaries present
|
-------
|
||||||
on the PDF page.
|
fig : matplotlib.figure.Figure
|
||||||
|
|
||||||
Parameters
|
"""
|
||||||
----------
|
img, table_bbox = table._image
|
||||||
image : tuple
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
Returns
|
for t in table_bbox.keys():
|
||||||
-------
|
ax.add_patch(
|
||||||
fig : matplotlib.fig.Figure
|
patches.Rectangle(
|
||||||
|
(t[0], t[1]),
|
||||||
"""
|
t[2] - t[0],
|
||||||
img, table_bbox = image
|
t[3] - t[1],
|
||||||
fig = plt.figure()
|
fill=None,
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
edgecolor='red'
|
||||||
for t in table_bbox.keys():
|
)
|
||||||
ax.add_patch(
|
|
||||||
patches.Rectangle(
|
|
||||||
(t[0], t[1]),
|
|
||||||
t[2] - t[0],
|
|
||||||
t[3] - t[1],
|
|
||||||
fill=None,
|
|
||||||
edgecolor='red'
|
|
||||||
)
|
)
|
||||||
)
|
ax.imshow(img)
|
||||||
ax.imshow(img)
|
return fig
|
||||||
return fig
|
|
||||||
|
|
||||||
|
def joint(self, table):
|
||||||
|
"""Generates a plot for all line intersections present
|
||||||
|
on the PDF page.
|
||||||
|
|
||||||
def plot_joint(image):
|
Parameters
|
||||||
"""Generates a plot for all line intersections present
|
----------
|
||||||
on the PDF page.
|
table : camelot.core.Table
|
||||||
|
|
||||||
Parameters
|
Returns
|
||||||
----------
|
-------
|
||||||
image : tuple
|
fig : matplotlib.figure.Figure
|
||||||
|
|
||||||
Returns
|
"""
|
||||||
-------
|
img, table_bbox = table._image
|
||||||
fig : matplotlib.fig.Figure
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
x_coord = []
|
||||||
|
y_coord = []
|
||||||
|
for k in table_bbox.keys():
|
||||||
|
for coord in table_bbox[k]:
|
||||||
|
x_coord.append(coord[0])
|
||||||
|
y_coord.append(coord[1])
|
||||||
|
ax.plot(x_coord, y_coord, 'ro')
|
||||||
|
ax.imshow(img)
|
||||||
|
return fig
|
||||||
|
|
||||||
"""
|
def line(self, table):
|
||||||
img, table_bbox = image
|
"""Generates a plot for all line segments present
|
||||||
fig = plt.figure()
|
on the PDF page.
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
|
||||||
x_coord = []
|
|
||||||
y_coord = []
|
|
||||||
for k in table_bbox.keys():
|
|
||||||
for coord in table_bbox[k]:
|
|
||||||
x_coord.append(coord[0])
|
|
||||||
y_coord.append(coord[1])
|
|
||||||
ax.plot(x_coord, y_coord, 'ro')
|
|
||||||
ax.imshow(img)
|
|
||||||
return fig
|
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table : camelot.core.Table
|
||||||
|
|
||||||
def plot_line(segments):
|
Returns
|
||||||
"""Generates a plot for all line segments present
|
-------
|
||||||
on the PDF page.
|
fig : matplotlib.figure.Figure
|
||||||
|
|
||||||
Parameters
|
"""
|
||||||
----------
|
fig = plt.figure()
|
||||||
segments : tuple
|
ax = fig.add_subplot(111, aspect='equal')
|
||||||
|
vertical, horizontal = table._segments
|
||||||
Returns
|
for v in vertical:
|
||||||
-------
|
ax.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
fig : matplotlib.fig.Figure
|
for h in horizontal:
|
||||||
|
ax.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
"""
|
return fig
|
||||||
fig = plt.figure()
|
|
||||||
ax = fig.add_subplot(111, aspect='equal')
|
|
||||||
vertical, horizontal = segments
|
|
||||||
for v in vertical:
|
|
||||||
ax.plot([v[0], v[2]], [v[1], v[3]])
|
|
||||||
for h in horizontal:
|
|
||||||
ax.plot([h[0], h[2]], [h[1], h[3]])
|
|
||||||
return fig
|
|
||||||
|
|
|
||||||
|
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 20 KiB |
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 24 KiB |
|
Before Width: | Height: | Size: 8.1 KiB After Width: | Height: | Size: 8.1 KiB |
|
Before Width: | Height: | Size: 8.8 KiB After Width: | Height: | Size: 8.8 KiB |
|
Before Width: | Height: | Size: 64 KiB After Width: | Height: | Size: 64 KiB |
|
|
@ -30,12 +30,14 @@ To process background lines, you can pass ``process_background=True``.
|
||||||
Visual debugging
|
Visual debugging
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
You can use the :meth:`plot() <camelot.plotting.plot>` method to generate a `matplotlib <https://matplotlib.org/>`_ plot of various elements that were detected on the PDF page while processing it. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
|
.. note:: Visual debugging using ``plot()`` requires `matplotlib <https://matplotlib.org/>`_ which is an optional dependency. You can install it using ``$ pip install camelot-py[plot]``.
|
||||||
|
|
||||||
You can specify the type of element you want to plot using the ``plot_type`` keyword argument. The generated plot can be saved to a file by passing a ``filename`` keyword argument. The following plot types are supported:
|
You can use the :class:`plot() <camelot.plotting.PlotMethods>` method to generate a `matplotlib <https://matplotlib.org/>`_ plot of various elements that were detected on the PDF page while processing it. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
|
||||||
|
|
||||||
|
You can specify the type of element you want to plot using the ``kind`` keyword argument. The generated plot can be saved to a file by passing a ``filename`` keyword argument. The following plot types are supported:
|
||||||
|
|
||||||
- 'text'
|
- 'text'
|
||||||
- 'table'
|
- 'grid'
|
||||||
- 'contour'
|
- 'contour'
|
||||||
- 'line'
|
- 'line'
|
||||||
- 'joint'
|
- 'joint'
|
||||||
|
|
@ -50,8 +52,6 @@ Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__
|
||||||
>>> tables
|
>>> tables
|
||||||
<TableList n=1>
|
<TableList n=1>
|
||||||
|
|
||||||
.. _geometry_text:
|
|
||||||
|
|
||||||
text
|
text
|
||||||
^^^^
|
^^^^
|
||||||
|
|
||||||
|
|
@ -59,10 +59,10 @@ Let's plot all the text present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot(tables[0], plot_type='text')
|
>>> camelot.plot(tables[0], kind='text')
|
||||||
>>> plt.show()
|
>>> plt.show()
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_text.png
|
.. figure:: ../_static/png/plot_text.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
|
|
@ -73,8 +73,6 @@ This, as we shall later see, is very helpful with :ref:`Stream <stream>` for not
|
||||||
|
|
||||||
.. note:: The *x-y* coordinates shown above change as you move your mouse cursor on the image, which can help you note coordinates.
|
.. note:: The *x-y* coordinates shown above change as you move your mouse cursor on the image, which can help you note coordinates.
|
||||||
|
|
||||||
.. _geometry_table:
|
|
||||||
|
|
||||||
table
|
grid
|
||||||
^^^^^
|
^^^^^
|
||||||
|
|
||||||
|
|
@ -82,10 +80,10 @@ Let's plot the table (to see if it was detected correctly or not). This plot typ
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot(tables[0], plot_type='table')
|
>>> camelot.plot(tables[0], kind='grid')
|
||||||
>>> plt.show()
|
>>> plt.show()
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_table.png
|
.. figure:: ../_static/png/plot_table.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
|
|
@ -94,8 +92,6 @@ Let's plot the table (to see if it was detected correctly or not). This plot typ
|
||||||
|
|
||||||
The table is perfect!
|
The table is perfect!
|
||||||
|
|
||||||
.. _geometry_contour:
|
|
||||||
|
|
||||||
contour
|
contour
|
||||||
^^^^^^^
|
^^^^^^^
|
||||||
|
|
||||||
|
|
@ -103,18 +99,16 @@ Now, let's plot all table boundaries present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot(tables[0], plot_type='contour')
|
>>> camelot.plot(tables[0], kind='contour')
|
||||||
>>> plt.show()
|
>>> plt.show()
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_contour.png
|
.. figure:: ../_static/png/plot_contour.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
:alt: A plot of all contours on a PDF page
|
:alt: A plot of all contours on a PDF page
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
.. _geometry_line:
|
|
||||||
|
|
||||||
line
|
line
|
||||||
^^^^
|
^^^^
|
||||||
|
|
||||||
|
|
@ -122,18 +116,16 @@ Cool, let's plot all line segments present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot(tables[0], plot_type='line')
|
>>> camelot.plot(tables[0], kind='line')
|
||||||
>>> plt.show()
|
>>> plt.show()
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_line.png
|
.. figure:: ../_static/png/plot_line.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
:alt: A plot of all lines on a PDF page
|
:alt: A plot of all lines on a PDF page
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
.. _geometry_joint:
|
|
||||||
|
|
||||||
joint
|
joint
|
||||||
^^^^^
|
^^^^^
|
||||||
|
|
||||||
|
|
@ -141,10 +133,10 @@ Finally, let's plot all line intersections present on the table's PDF page.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> camelot.plot(tables[0], plot_type='joint')
|
>>> camelot.plot(tables[0], kind='joint')
|
||||||
>>> plt.show()
|
>>> plt.show()
|
||||||
|
|
||||||
.. figure:: ../_static/png/geometry_joint.png
|
.. figure:: ../_static/png/plot_joint.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
|
|
@ -154,7 +146,7 @@ Finally, let's plot all line intersections present on the table's PDF page.
|
||||||
Specify table areas
|
Specify table areas
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the top left and bottom right coordinates of the table.
|
Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can plot the text on this page and note the top left and bottom right coordinates of the table.
|
||||||
|
|
||||||
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
|
||||||
|
|
||||||
|
|
@ -171,7 +163,7 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
|
||||||
Specify column separators
|
Specify column separators
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
In cases like `these <../_static/pdf/column_separators.pdf>`__, where the text is very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by :ref:`plotting the text <geometry_text>` on the page.
|
In cases like `these <../_static/pdf/column_separators.pdf>`__, where the text is very close to each other, it is possible that Camelot may guess the column separators' coordinates incorrectly. To correct this, you can explicitly specify the *x* coordinate for each column separator by plotting the text on the page.
|
||||||
|
|
||||||
You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``columns`` keyword argument.
|
You can pass the column separators as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``columns`` keyword argument.
|
||||||
|
|
||||||
|
|
@ -179,7 +171,7 @@ In case you passed a single column separators string list, and no table area is
|
||||||
|
|
||||||
For example, if you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``.
|
For example, if you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``.
|
||||||
|
|
||||||
Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!
|
Let's get back to the *x* coordinates we got from plotting the text that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
|
|
@ -287,23 +279,25 @@ Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating t
|
||||||
:alt: A PDF table with short lines
|
:alt: A PDF table with short lines
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
Let's :ref:`plot the table <geometry_table>` for this PDF.
|
Let's plot the table for this PDF.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf')
|
>>> tables = camelot.read_pdf('short_lines.pdf')
|
||||||
>>> tables[0].plot('table')
|
>>> camelot.plot(tables[0], kind='grid')
|
||||||
|
>>> plt.show()
|
||||||
|
|
||||||
.. figure:: ../_static/png/short_lines_1.png
|
.. figure:: ../_static/png/short_lines_1.png
|
||||||
:alt: A plot of the PDF table with short lines
|
:alt: A plot of the PDF table with short lines
|
||||||
:align: left
|
:align: left
|
||||||
|
|
||||||
Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and `plot the table <geometry_table>`_ again.
|
Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
|
>>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
|
||||||
>>> tables[0].plot('table')
|
>>> camelot.plot(tables[0], kind='grid')
|
||||||
|
>>> plt.show()
|
||||||
|
|
||||||
.. figure:: ../_static/png/short_lines_2.png
|
.. figure:: ../_static/png/short_lines_2.png
|
||||||
:alt: An improved plot of the PDF table with short lines
|
:alt: An improved plot of the PDF table with short lines
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step.
|
||||||
|
|
||||||
1. Line segments are detected.
|
1. Line segments are detected.
|
||||||
|
|
||||||
.. image:: ../_static/png/geometry_line.png
|
.. image:: ../_static/png/plot_line.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
|
|
@ -49,7 +49,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step.
|
||||||
|
|
||||||
.. _and: https://en.wikipedia.org/wiki/Logical_conjunction
|
.. _and: https://en.wikipedia.org/wiki/Logical_conjunction
|
||||||
|
|
||||||
.. image:: ../_static/png/geometry_joint.png
|
.. image:: ../_static/png/plot_joint.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
|
|
@ -59,7 +59,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step.
|
||||||
|
|
||||||
.. _or: https://en.wikipedia.org/wiki/Logical_disjunction
|
.. _or: https://en.wikipedia.org/wiki/Logical_disjunction
|
||||||
|
|
||||||
.. image:: ../_static/png/geometry_contour.png
|
.. image:: ../_static/png/plot_contour.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
|
|
@ -75,7 +75,7 @@ Let's see how Lattice processes the second page of `this PDF`_, step-by-step.
|
||||||
|
|
||||||
5. Spanning cells are detected using the line segments and line intersections.
|
5. Spanning cells are detected using the line segments and line intersections.
|
||||||
|
|
||||||
.. image:: ../_static/png/geometry_table.png
|
.. image:: ../_static/png/plot_table.png
|
||||||
:height: 674
|
:height: 674
|
||||||
:width: 1366
|
:width: 1366
|
||||||
:scale: 50%
|
:scale: 50%
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,7 @@ If you have ghostscript, you should see the ghostscript version and copyright in
|
||||||
|
|
||||||
Finally, you can use pip to install Camelot::
|
Finally, you can use pip to install Camelot::
|
||||||
|
|
||||||
$ pip install camelot-py[all]
|
$ pip install camelot-py[cv]
|
||||||
|
|
||||||
From the source code
|
From the source code
|
||||||
--------------------
|
--------------------
|
||||||
|
|
@ -111,6 +111,6 @@ After `installing the dependencies`_, you can install from the source by:
|
||||||
::
|
::
|
||||||
|
|
||||||
$ cd camelot
|
$ cd camelot
|
||||||
$ pip install ".[all]"
|
$ pip install ".[cv]"
|
||||||
|
|
||||||
.. _installing the dependencies: https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip
|
.. _installing the dependencies: https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip
|
||||||
17
setup.py
|
|
@ -15,7 +15,6 @@ with open('README.md', 'r') as f:
|
||||||
|
|
||||||
requires = [
|
requires = [
|
||||||
'click>=6.7',
|
'click>=6.7',
|
||||||
'matplotlib>=2.2.3',
|
|
||||||
'numpy>=1.13.3',
|
'numpy>=1.13.3',
|
||||||
'openpyxl>=2.5.8',
|
'openpyxl>=2.5.8',
|
||||||
'pandas>=0.23.4',
|
'pandas>=0.23.4',
|
||||||
|
|
@ -23,18 +22,24 @@ requires = [
|
||||||
'PyPDF2>=1.26.0'
|
'PyPDF2>=1.26.0'
|
||||||
]
|
]
|
||||||
|
|
||||||
all_requires = [
|
cv_requires = [
|
||||||
'opencv-python>=3.4.2.17'
|
'opencv-python>=3.4.2.17'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
plot_requires = [
|
||||||
|
'matplotlib>=2.2.3',
|
||||||
|
]
|
||||||
|
|
||||||
dev_requires = [
|
dev_requires = [
|
||||||
'codecov>=2.0.15',
|
'codecov>=2.0.15',
|
||||||
'pytest>=3.8.0',
|
'pytest>=3.8.0',
|
||||||
'pytest-cov>=2.6.0',
|
'pytest-cov>=2.6.0',
|
||||||
|
'pytest-mpl>=0.10',
|
||||||
'pytest-runner>=4.2',
|
'pytest-runner>=4.2',
|
||||||
'Sphinx>=1.7.9',
|
'Sphinx>=1.7.9'
|
||||||
'pytest-mpl>=0.10'
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
all_requires = cv_requires + plot_requires
|
||||||
dev_requires = dev_requires + all_requires
|
dev_requires = dev_requires + all_requires
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -52,7 +57,9 @@ def setup_package():
|
||||||
install_requires=requires,
|
install_requires=requires,
|
||||||
extras_require={
|
extras_require={
|
||||||
'all': all_requires,
|
'all': all_requires,
|
||||||
'dev': dev_requires
|
'cv': cv_requires,
|
||||||
|
'dev': dev_requires,
|
||||||
|
'plot': plot_requires
|
||||||
},
|
},
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
|
|
|
||||||
|
Before Width: | Height: | Size: 8.2 KiB After Width: | Height: | Size: 8.2 KiB |
|
|
@ -16,15 +16,15 @@ testdir = os.path.join(testdir, "files")
|
||||||
def test_text_plot():
|
def test_text_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], plot_type='text')
|
return camelot.plot(tables[0], kind='text')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True)
|
baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_table_plot():
|
def test_grid_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], plot_type='table')
|
return camelot.plot(tables[0], kind='grid')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
|
|
@ -32,7 +32,7 @@ def test_table_plot():
|
||||||
def test_contour_plot():
|
def test_contour_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], plot_type='contour')
|
return camelot.plot(tables[0], kind='contour')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
|
|
@ -40,7 +40,7 @@ def test_contour_plot():
|
||||||
def test_line_plot():
|
def test_line_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], plot_type='line')
|
return camelot.plot(tables[0], kind='line')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
|
|
@ -48,4 +48,4 @@ def test_line_plot():
|
||||||
def test_joint_plot():
|
def test_joint_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
return camelot.plot(tables[0], plot_type='joint')
|
return camelot.plot(tables[0], kind='joint')
|
||||||
|
|
|
||||||