* [MRG] Create a new figure and test each plot type #127 - move `plot()` to `plotting.py` as `plot_pdf()` - modify plotting functions to return matplotlib figures - add `test_plotting.py` and baseline images - import `plot_pdf()` in `__init__` - update `cli.py` to use `plot_pdf()` - update advanced usage docs to reflect changes * Change matplotlib backend for image comparison tests * Update plotting and tests - use matplotlib rectangle instead of `cv2.rectangle` in `plot_contour()` - set matplotlib backend in `tests/__init__` - update contour plot baseline image - update `test_plotting` with more checks * Update plot tests and config - remove unnecessary asserts - update setup.cfg and makefile with `--mpl` * Add to * Add tolerance * remove text from baseline plots update plot tests with `remove_text` * Change method name, update docs and add pep8 * Update docspull/2/head
parent
79db6e3d1b
commit
c0e9235164
4
Makefile
4
Makefile
|
|
@ -15,7 +15,7 @@ install:
|
|||
pip install ".[dev]"
|
||||
|
||||
test:
|
||||
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot tests
|
||||
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
|
||||
|
||||
docs:
|
||||
cd docs && make html
|
||||
|
|
@ -25,4 +25,4 @@ publish:
|
|||
pip install twine
|
||||
python setup.py sdist
|
||||
twine upload dist/*
|
||||
rm -fr build dist .egg camelot_py.egg-info
|
||||
rm -fr build dist .egg camelot_py.egg-info
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from click import HelpFormatter
|
|||
|
||||
from .__version__ import __version__
|
||||
from .io import read_pdf
|
||||
from .plotting import plot
|
||||
|
||||
|
||||
def _write_usage(self, prog, args='', prefix='Usage: '):
|
||||
|
|
|
|||
|
|
@ -3,9 +3,11 @@
|
|||
import logging
|
||||
|
||||
import click
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
from . import __version__
|
||||
from .io import read_pdf
|
||||
from .plotting import plot
|
||||
|
||||
|
||||
logger = logging.getLogger('camelot')
|
||||
|
|
@ -81,7 +83,7 @@ def cli(ctx, *args, **kwargs):
|
|||
help='Number of times for erosion/dilation will be applied.')
|
||||
@click.option('-plot', '--plot_type',
|
||||
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
|
||||
help='Plot geometry found on PDF page, for debugging.')
|
||||
help='Plot elements found on PDF page for visual debugging.')
|
||||
@click.argument('filepath', type=click.Path(exists=True))
|
||||
@pass_config
|
||||
def lattice(c, *args, **kwargs):
|
||||
|
|
@ -107,7 +109,8 @@ def lattice(c, *args, **kwargs):
|
|||
click.echo('Found {} tables'.format(tables.n))
|
||||
if plot_type is not None:
|
||||
for table in tables:
|
||||
table.plot(plot_type)
|
||||
plot(table, plot_type=plot_type)
|
||||
plt.show()
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError('Please specify output file path using --output')
|
||||
|
|
@ -128,7 +131,7 @@ def lattice(c, *args, **kwargs):
|
|||
' used to combine text horizontally, to generate columns.')
|
||||
@click.option('-plot', '--plot_type',
|
||||
type=click.Choice(['text', 'table']),
|
||||
help='Plot geometry found on PDF page for debugging.')
|
||||
help='Plot elements found on PDF page for visual debugging.')
|
||||
@click.argument('filepath', type=click.Path(exists=True))
|
||||
@pass_config
|
||||
def stream(c, *args, **kwargs):
|
||||
|
|
@ -153,7 +156,8 @@ def stream(c, *args, **kwargs):
|
|||
click.echo('Found {} tables'.format(tables.n))
|
||||
if plot_type is not None:
|
||||
for table in tables:
|
||||
table.plot(plot_type)
|
||||
plot(table, plot_type=plot_type)
|
||||
plt.show()
|
||||
else:
|
||||
if output is None:
|
||||
raise click.UsageError('Please specify output file path using --output')
|
||||
|
|
|
|||
|
|
@ -7,8 +7,6 @@ import tempfile
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from .plotting import *
|
||||
|
||||
|
||||
class Cell(object):
|
||||
"""Defines a cell in a table with coordinates relative to a
|
||||
|
|
@ -321,33 +319,6 @@ class Table(object):
|
|||
cell.hspan = True
|
||||
return self
|
||||
|
||||
def plot(self, geometry_type):
|
||||
"""Plot geometry found on PDF page based on geometry_type
|
||||
specified, useful for debugging and playing with different
|
||||
parameters to get the best output.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
geometry_type : str
|
||||
The geometry type for which a plot should be generated.
|
||||
Can be 'text', 'table', 'contour', 'joint', 'line'
|
||||
|
||||
"""
|
||||
if self.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']:
|
||||
raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
|
||||
geometry_type))
|
||||
|
||||
if geometry_type == 'text':
|
||||
plot_text(self._text)
|
||||
elif geometry_type == 'table':
|
||||
plot_table(self)
|
||||
elif geometry_type == 'contour':
|
||||
plot_contour(self._image)
|
||||
elif geometry_type == 'joint':
|
||||
plot_joint(self._image)
|
||||
elif geometry_type == 'line':
|
||||
plot_line(self._segments)
|
||||
|
||||
def to_csv(self, path, **kwargs):
|
||||
"""Writes Table to a comma-separated values (csv) file.
|
||||
|
||||
|
|
|
|||
|
|
@ -141,9 +141,6 @@ class PDFHandler(object):
|
|||
-------
|
||||
tables : camelot.core.TableList
|
||||
List of tables found in PDF.
|
||||
geometry : camelot.core.GeometryList
|
||||
List of geometry objects (contours, lines, joints) found
|
||||
in PDF.
|
||||
|
||||
"""
|
||||
tables = []
|
||||
|
|
|
|||
|
|
@ -1,15 +1,59 @@
|
|||
import cv2
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
|
||||
|
||||
def plot(table, plot_type='text', filepath=None):
    """Plot elements found on PDF page based on plot_type
    specified, useful for debugging and playing with different
    parameters to get the best output.

    Parameters
    ----------
    table: Table
        A Camelot Table.
    plot_type : str, optional (default: 'text')
        {'text', 'table', 'contour', 'joint', 'line'}
        The element type for which a plot should be generated.
    filepath: str, optional (default: None)
        Absolute path for saving the generated plot.

    Returns
    -------
    fig : matplotlib.figure.Figure

    Raises
    ------
    NotImplementedError
        If ``table.flavor`` is 'stream' and plot_type is one of
        'contour', 'joint' or 'line'.
    ValueError
        If plot_type is not one of the supported element types.

    """
    if table.flavor == 'stream' and plot_type in ['contour', 'joint', 'line']:
        raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
            plot_type))
    if plot_type == 'text':
        fig = plot_text(table._text)
    elif plot_type == 'table':
        fig = plot_table(table)
    elif plot_type == 'contour':
        fig = plot_contour(table._image)
    elif plot_type == 'joint':
        fig = plot_joint(table._image)
    elif plot_type == 'line':
        fig = plot_line(table._segments)
    else:
        # Without this branch an unknown plot_type would reach `return fig`
        # with `fig` unbound and fail with an opaque UnboundLocalError.
        raise ValueError(
            "plot_type must be one of 'text', 'table', 'contour', 'joint', "
            "'line'; got {!r}".format(plot_type))
    if filepath:
        # Save through the figure object itself rather than pyplot's global
        # "current figure", so the file always contains the figure returned.
        fig.savefig(filepath)
    return fig
|
||||
|
||||
|
||||
def plot_text(text):
|
||||
"""Generates a plot for all text present on the PDF page.
|
||||
"""Generates a plot for all text elements present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text : list
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.figure.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
|
|
@ -26,83 +70,116 @@ def plot_text(text):
|
|||
)
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
plt.show()
|
||||
return fig
|
||||
|
||||
|
||||
def plot_table(table):
|
||||
"""Generates a plot for the table.
|
||||
"""Generates a plot for the detected tables
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.figure.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
for row in table.cells:
|
||||
for cell in row:
|
||||
if cell.left:
|
||||
plt.plot([cell.lb[0], cell.lt[0]],
|
||||
ax.plot([cell.lb[0], cell.lt[0]],
|
||||
[cell.lb[1], cell.lt[1]])
|
||||
if cell.right:
|
||||
plt.plot([cell.rb[0], cell.rt[0]],
|
||||
ax.plot([cell.rb[0], cell.rt[0]],
|
||||
[cell.rb[1], cell.rt[1]])
|
||||
if cell.top:
|
||||
plt.plot([cell.lt[0], cell.rt[0]],
|
||||
ax.plot([cell.lt[0], cell.rt[0]],
|
||||
[cell.lt[1], cell.rt[1]])
|
||||
if cell.bottom:
|
||||
plt.plot([cell.lb[0], cell.rb[0]],
|
||||
ax.plot([cell.lb[0], cell.rb[0]],
|
||||
[cell.lb[1], cell.rb[1]])
|
||||
plt.show()
|
||||
return fig
|
||||
|
||||
|
||||
def plot_contour(image):
|
||||
"""Generates a plot for all table boundaries present on the
|
||||
PDF page.
|
||||
"""Generates a plot for all table boundaries present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
image : tuple
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.figure.Figure
|
||||
|
||||
"""
|
||||
img, table_bbox = image
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
for t in table_bbox.keys():
|
||||
cv2.rectangle(img, (t[0], t[1]),
|
||||
(t[2], t[3]), (255, 0, 0), 20)
|
||||
plt.imshow(img)
|
||||
plt.show()
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
fill=None,
|
||||
edgecolor='red'
|
||||
)
|
||||
)
|
||||
ax.imshow(img)
|
||||
return fig
|
||||
|
||||
|
||||
def plot_joint(image):
|
||||
"""Generates a plot for all line intersections present on the
|
||||
PDF page.
|
||||
"""Generates a plot for all line intersections present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
image : tuple
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.figure.Figure
|
||||
|
||||
"""
|
||||
img, table_bbox = image
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
x_coord = []
|
||||
y_coord = []
|
||||
for k in table_bbox.keys():
|
||||
for coord in table_bbox[k]:
|
||||
x_coord.append(coord[0])
|
||||
y_coord.append(coord[1])
|
||||
plt.plot(x_coord, y_coord, 'ro')
|
||||
plt.imshow(img)
|
||||
plt.show()
|
||||
ax.plot(x_coord, y_coord, 'ro')
|
||||
ax.imshow(img)
|
||||
return fig
|
||||
|
||||
|
||||
def plot_line(segments):
|
||||
"""Generates a plot for all line segments present on the PDF page.
|
||||
"""Generates a plot for all line segments present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
segments : tuple
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.figure.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect='equal')
|
||||
vertical, horizontal = segments
|
||||
for v in vertical:
|
||||
plt.plot([v[0], v[2]], [v[1], v[3]])
|
||||
ax.plot([v[0], v[2]], [v[1], v[3]])
|
||||
for h in horizontal:
|
||||
plt.plot([h[0], h[2]], [h[1], h[3]])
|
||||
plt.show()
|
||||
ax.plot([h[0], h[2]], [h[1], h[3]])
|
||||
return fig
|
||||
|
|
|
|||
|
|
@ -27,12 +27,12 @@ To process background lines, you can pass ``process_background=True``.
|
|||
.. csv-table::
|
||||
:file: ../_static/csv/background_lines.csv
|
||||
|
||||
Plot geometry
|
||||
-------------
|
||||
Visual debugging
|
||||
----------------
|
||||
|
||||
You can use a :class:`table <camelot.core.Table>` object's :meth:`plot() <camelot.core.TableList.plot>` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
|
||||
You can use the :func:`plot() <camelot.plotting.plot>` function to generate a `matplotlib <https://matplotlib.org/>`_ plot of various elements that were detected on the PDF page while processing it. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
|
||||
|
||||
The following geometries are available for plotting. You can pass them to the :meth:`plot() <camelot.core.TableList.plot>` method, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot for the passed geometry.
|
||||
You can specify the type of element you want to plot using the ``plot_type`` keyword argument. The generated plot can be saved to a file by passing a ``filepath`` keyword argument. The following plot types are supported:
|
||||
|
||||
- 'text'
|
||||
- 'table'
|
||||
|
|
@ -40,9 +40,9 @@ The following geometries are available for plotting. You can pass them to the :m
|
|||
- 'line'
|
||||
- 'joint'
|
||||
|
||||
.. note:: The last three geometries can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``.
|
||||
.. note:: The last three plot types can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``.
|
||||
|
||||
Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
|
||||
Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
|
||||
|
||||
::
|
||||
|
||||
|
|
@ -59,7 +59,8 @@ Let's plot all the text present on the table's PDF page.
|
|||
|
||||
::
|
||||
|
||||
>>> tables[0].plot('text')
|
||||
>>> camelot.plot(tables[0], plot_type='text')
|
||||
>>> plt.show()
|
||||
|
||||
.. figure:: ../_static/png/geometry_text.png
|
||||
:height: 674
|
||||
|
|
@ -77,11 +78,12 @@ This, as we shall later see, is very helpful with :ref:`Stream <stream>` for not
|
|||
table
|
||||
^^^^^
|
||||
|
||||
Let's plot the table (to see if it was detected correctly or not). This geometry type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.)
|
||||
Let's plot the table (to see if it was detected correctly or not). This plot type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.)
|
||||
|
||||
::
|
||||
|
||||
>>> tables[0].plot('table')
|
||||
>>> camelot.plot(tables[0], plot_type='table')
|
||||
>>> plt.show()
|
||||
|
||||
.. figure:: ../_static/png/geometry_table.png
|
||||
:height: 674
|
||||
|
|
@ -101,7 +103,8 @@ Now, let's plot all table boundaries present on the table's PDF page.
|
|||
|
||||
::
|
||||
|
||||
>>> tables[0].plot('contour')
|
||||
>>> camelot.plot(tables[0], plot_type='contour')
|
||||
>>> plt.show()
|
||||
|
||||
.. figure:: ../_static/png/geometry_contour.png
|
||||
:height: 674
|
||||
|
|
@ -119,7 +122,8 @@ Cool, let's plot all line segments present on the table's PDF page.
|
|||
|
||||
::
|
||||
|
||||
>>> tables[0].plot('line')
|
||||
>>> camelot.plot(tables[0], plot_type='line')
|
||||
>>> plt.show()
|
||||
|
||||
.. figure:: ../_static/png/geometry_line.png
|
||||
:height: 674
|
||||
|
|
@ -137,7 +141,8 @@ Finally, let's plot all line intersections present on the table's PDF page.
|
|||
|
||||
::
|
||||
|
||||
>>> tables[0].plot('joint')
|
||||
>>> camelot.plot(tables[0], plot_type='joint')
|
||||
>>> plt.show()
|
||||
|
||||
.. figure:: ../_static/png/geometry_joint.png
|
||||
:height: 674
|
||||
|
|
|
|||
|
|
@ -2,5 +2,5 @@
|
|||
test=pytest
|
||||
|
||||
[tool:pytest]
|
||||
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot tests
|
||||
python_files = tests/test_*.py
|
||||
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
|
||||
python_files = tests/test_*.py
|
||||
|
|
|
|||
3
setup.py
3
setup.py
|
|
@ -32,7 +32,8 @@ dev_requires = [
|
|||
'pytest>=3.8.0',
|
||||
'pytest-cov>=2.6.0',
|
||||
'pytest-runner>=4.2',
|
||||
'Sphinx>=1.7.9'
|
||||
'Sphinx>=1.7.9',
|
||||
'pytest-mpl>=0.10'
|
||||
]
|
||||
dev_requires = dev_requires + all_requires
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
import matplotlib
|
||||
matplotlib.use('agg')
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 33 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 35 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 6.6 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 8.2 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 8.8 KiB |
|
|
@ -0,0 +1,51 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
import camelot
|
||||
|
||||
|
||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||
testdir = os.path.join(testdir, "files")
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
    remove_text=True, baseline_dir="files/baseline_plots")
def test_text_plot():
    """Return the 'text' plot of the first table read from foo.pdf.

    The figure is returned so that the ``mpl_image_compare`` marker can
    check it against the baseline image.
    """
    parsed = camelot.read_pdf(os.path.join(testdir, "foo.pdf"))
    first_table = parsed[0]
    return camelot.plot(first_table, plot_type='text')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
    remove_text=True, baseline_dir="files/baseline_plots")
def test_table_plot():
    """Return the 'table' plot of the first table read from foo.pdf.

    The figure is returned so that the ``mpl_image_compare`` marker can
    check it against the baseline image.
    """
    parsed = camelot.read_pdf(os.path.join(testdir, "foo.pdf"))
    first_table = parsed[0]
    return camelot.plot(first_table, plot_type='table')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
    remove_text=True, baseline_dir="files/baseline_plots")
def test_contour_plot():
    """Return the 'contour' plot of the first table read from foo.pdf.

    The figure is returned so that the ``mpl_image_compare`` marker can
    check it against the baseline image.
    """
    parsed = camelot.read_pdf(os.path.join(testdir, "foo.pdf"))
    first_table = parsed[0]
    return camelot.plot(first_table, plot_type='contour')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
    remove_text=True, baseline_dir="files/baseline_plots")
def test_line_plot():
    """Return the 'line' plot of the first table read from foo.pdf.

    The figure is returned so that the ``mpl_image_compare`` marker can
    check it against the baseline image.
    """
    parsed = camelot.read_pdf(os.path.join(testdir, "foo.pdf"))
    first_table = parsed[0]
    return camelot.plot(first_table, plot_type='line')
|
||||
|
||||
|
||||
@pytest.mark.mpl_image_compare(
    remove_text=True, baseline_dir="files/baseline_plots")
def test_joint_plot():
    """Return the 'joint' plot of the first table read from foo.pdf.

    The figure is returned so that the ``mpl_image_compare`` marker can
    check it against the baseline image.
    """
    parsed = camelot.read_pdf(os.path.join(testdir, "foo.pdf"))
    first_table = parsed[0]
    return camelot.plot(first_table, plot_type='joint')
|
||||
Loading…
Reference in New Issue