[MRG + 1] Create a new figure and test each plot type #127 (#179)

* [MRG] Create a new figure and test each plot type #127

 - move `plot()` to `plotting.py` as `plot_pdf()`
 - modify plotting functions to return matplotlib figures
 - add `test_plotting.py` and baseline images
 - import `plot_pdf()` in `__init__`
 - update `cli.py` to use `plot_pdf()`
 - update advanced usage docs to reflect changes

* Change matplotlib backend for image comparison tests

* Update plotting and tests
 - use matplotlib rectangle instead of `cv2.rectangle` in
`plot_contour()`
 - set matplotlib backend in `tests/__init__`
 - update contour plot baseline image
 - update `test_plotting` with more checks

* Update plot tests and config
 - remove unnecessary asserts
 - update setup.cfg and makefile with `--mpl`

* Add `pytest-mpl` to `dev_requires`

* Add tolerance

* remove text from baseline plots
update plot tests with `remove_text`

* Change method name, update docs and add pep8

* Update docs
pull/2/head
Suyash Behera 2018-11-02 20:57:02 +05:30 committed by Vinayak Mehta
parent 79db6e3d1b
commit c0e9235164
16 changed files with 186 additions and 77 deletions

View File

@ -15,7 +15,7 @@ install:
pip install ".[dev]" pip install ".[dev]"
test: test:
pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot tests pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
docs: docs:
cd docs && make html cd docs && make html
@ -25,4 +25,4 @@ publish:
pip install twine pip install twine
python setup.py sdist python setup.py sdist
twine upload dist/* twine upload dist/*
rm -fr build dist .egg camelot_py.egg-info rm -fr build dist .egg camelot_py.egg-info

View File

@ -6,6 +6,7 @@ from click import HelpFormatter
from .__version__ import __version__ from .__version__ import __version__
from .io import read_pdf from .io import read_pdf
from .plotting import plot
def _write_usage(self, prog, args='', prefix='Usage: '): def _write_usage(self, prog, args='', prefix='Usage: '):

View File

@ -3,9 +3,11 @@
import logging import logging
import click import click
import matplotlib.pyplot as plt
from . import __version__ from . import __version__
from .io import read_pdf from .io import read_pdf
from .plotting import plot
logger = logging.getLogger('camelot') logger = logging.getLogger('camelot')
@ -81,7 +83,7 @@ def cli(ctx, *args, **kwargs):
help='Number of times for erosion/dilation will be applied.') help='Number of times for erosion/dilation will be applied.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'table', 'contour', 'joint', 'line']), type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
help='Plot geometry found on PDF page, for debugging.') help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True)) @click.argument('filepath', type=click.Path(exists=True))
@pass_config @pass_config
def lattice(c, *args, **kwargs): def lattice(c, *args, **kwargs):
@ -107,7 +109,8 @@ def lattice(c, *args, **kwargs):
click.echo('Found {} tables'.format(tables.n)) click.echo('Found {} tables'.format(tables.n))
if plot_type is not None: if plot_type is not None:
for table in tables: for table in tables:
table.plot(plot_type) plot(table, plot_type=plot_type)
plt.show()
else: else:
if output is None: if output is None:
raise click.UsageError('Please specify output file path using --output') raise click.UsageError('Please specify output file path using --output')
@ -128,7 +131,7 @@ def lattice(c, *args, **kwargs):
' used to combine text horizontally, to generate columns.') ' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type', @click.option('-plot', '--plot_type',
type=click.Choice(['text', 'table']), type=click.Choice(['text', 'table']),
help='Plot geometry found on PDF page for debugging.') help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True)) @click.argument('filepath', type=click.Path(exists=True))
@pass_config @pass_config
def stream(c, *args, **kwargs): def stream(c, *args, **kwargs):
@ -153,7 +156,8 @@ def stream(c, *args, **kwargs):
click.echo('Found {} tables'.format(tables.n)) click.echo('Found {} tables'.format(tables.n))
if plot_type is not None: if plot_type is not None:
for table in tables: for table in tables:
table.plot(plot_type) plot(table, plot_type=plot_type)
plt.show()
else: else:
if output is None: if output is None:
raise click.UsageError('Please specify output file path using --output') raise click.UsageError('Please specify output file path using --output')

View File

@ -7,8 +7,6 @@ import tempfile
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from .plotting import *
class Cell(object): class Cell(object):
"""Defines a cell in a table with coordinates relative to a """Defines a cell in a table with coordinates relative to a
@ -321,33 +319,6 @@ class Table(object):
cell.hspan = True cell.hspan = True
return self return self
def plot(self, geometry_type):
"""Plot geometry found on PDF page based on geometry_type
specified, useful for debugging and playing with different
parameters to get the best output.
Parameters
----------
geometry_type : str
The geometry type for which a plot should be generated.
Can be 'text', 'table', 'contour', 'joint', 'line'
"""
if self.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']:
raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
geometry_type))
if geometry_type == 'text':
plot_text(self._text)
elif geometry_type == 'table':
plot_table(self)
elif geometry_type == 'contour':
plot_contour(self._image)
elif geometry_type == 'joint':
plot_joint(self._image)
elif geometry_type == 'line':
plot_line(self._segments)
def to_csv(self, path, **kwargs): def to_csv(self, path, **kwargs):
"""Writes Table to a comma-separated values (csv) file. """Writes Table to a comma-separated values (csv) file.

View File

@ -141,9 +141,6 @@ class PDFHandler(object):
------- -------
tables : camelot.core.TableList tables : camelot.core.TableList
List of tables found in PDF. List of tables found in PDF.
geometry : camelot.core.GeometryList
List of geometry objects (contours, lines, joints) found
in PDF.
""" """
tables = [] tables = []

View File

@ -1,15 +1,59 @@
import cv2 # -*- coding: utf-8 -*-
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.patches as patches import matplotlib.patches as patches
def plot(table, plot_type='text', filepath=None):
"""Plot elements found on PDF page based on plot_type
specified, useful for debugging and playing with different
parameters to get the best output.
Parameters
----------
table: Table
A Camelot Table.
plot_type : str, optional (default: 'text')
{'text', 'table', 'contour', 'joint', 'line'}
The element type for which a plot should be generated.
filepath: str, optional (default: None)
Absolute path for saving the generated plot.
Returns
-------
fig : matplotlib.figure.Figure
"""
if table.flavor == 'stream' and plot_type in ['contour', 'joint', 'line']:
raise NotImplementedError("{} cannot be plotted with flavor='stream'".format(
plot_type))
if plot_type == 'text':
fig = plot_text(table._text)
elif plot_type == 'table':
fig = plot_table(table)
elif plot_type == 'contour':
fig = plot_contour(table._image)
elif plot_type == 'joint':
fig = plot_joint(table._image)
elif plot_type == 'line':
fig = plot_line(table._segments)
if filepath:
plt.savefig(filepath)
return fig
def plot_text(text): def plot_text(text):
"""Generates a plot for all text present on the PDF page. """Generates a plot for all text elements present
on the PDF page.
Parameters Parameters
---------- ----------
text : list text : list
Returns
-------
fig : matplotlib.figure.Figure
""" """
fig = plt.figure() fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal') ax = fig.add_subplot(111, aspect='equal')
@ -26,83 +70,116 @@ def plot_text(text):
) )
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show() return fig
def plot_table(table): def plot_table(table):
"""Generates a plot for the table. """Generates a plot for the detected tables
on the PDF page.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
Returns
-------
fig : matplotlib.figure.Figure
""" """
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
for row in table.cells: for row in table.cells:
for cell in row: for cell in row:
if cell.left: if cell.left:
plt.plot([cell.lb[0], cell.lt[0]], ax.plot([cell.lb[0], cell.lt[0]],
[cell.lb[1], cell.lt[1]]) [cell.lb[1], cell.lt[1]])
if cell.right: if cell.right:
plt.plot([cell.rb[0], cell.rt[0]], ax.plot([cell.rb[0], cell.rt[0]],
[cell.rb[1], cell.rt[1]]) [cell.rb[1], cell.rt[1]])
if cell.top: if cell.top:
plt.plot([cell.lt[0], cell.rt[0]], ax.plot([cell.lt[0], cell.rt[0]],
[cell.lt[1], cell.rt[1]]) [cell.lt[1], cell.rt[1]])
if cell.bottom: if cell.bottom:
plt.plot([cell.lb[0], cell.rb[0]], ax.plot([cell.lb[0], cell.rb[0]],
[cell.lb[1], cell.rb[1]]) [cell.lb[1], cell.rb[1]])
plt.show() return fig
def plot_contour(image): def plot_contour(image):
"""Generates a plot for all table boundaries present on the """Generates a plot for all table boundaries present
PDF page. on the PDF page.
Parameters Parameters
---------- ----------
image : tuple image : tuple
Returns
-------
fig : matplotlib.figure.Figure
""" """
img, table_bbox = image img, table_bbox = image
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
for t in table_bbox.keys(): for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]), ax.add_patch(
(t[2], t[3]), (255, 0, 0), 20) patches.Rectangle(
plt.imshow(img) (t[0], t[1]),
plt.show() t[2] - t[0],
t[3] - t[1],
fill=None,
edgecolor='red'
)
)
ax.imshow(img)
return fig
def plot_joint(image): def plot_joint(image):
"""Generates a plot for all line intersections present on the """Generates a plot for all line intersections present
PDF page. on the PDF page.
Parameters Parameters
---------- ----------
image : tuple image : tuple
Returns
-------
fig : matplotlib.figure.Figure
""" """
img, table_bbox = image img, table_bbox = image
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
x_coord = [] x_coord = []
y_coord = [] y_coord = []
for k in table_bbox.keys(): for k in table_bbox.keys():
for coord in table_bbox[k]: for coord in table_bbox[k]:
x_coord.append(coord[0]) x_coord.append(coord[0])
y_coord.append(coord[1]) y_coord.append(coord[1])
plt.plot(x_coord, y_coord, 'ro') ax.plot(x_coord, y_coord, 'ro')
plt.imshow(img) ax.imshow(img)
plt.show() return fig
def plot_line(segments): def plot_line(segments):
"""Generates a plot for all line segments present on the PDF page. """Generates a plot for all line segments present
on the PDF page.
Parameters Parameters
---------- ----------
segments : tuple segments : tuple
Returns
-------
fig : matplotlib.figure.Figure
""" """
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
vertical, horizontal = segments vertical, horizontal = segments
for v in vertical: for v in vertical:
plt.plot([v[0], v[2]], [v[1], v[3]]) ax.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal: for h in horizontal:
plt.plot([h[0], h[2]], [h[1], h[3]]) ax.plot([h[0], h[2]], [h[1], h[3]])
plt.show() return fig

View File

@ -27,12 +27,12 @@ To process background lines, you can pass ``process_background=True``.
.. csv-table:: .. csv-table::
:file: ../_static/csv/background_lines.csv :file: ../_static/csv/background_lines.csv
Plot geometry Visual debugging
------------- ----------------
You can use a :class:`table <camelot.core.Table>` object's :meth:`plot() <camelot.core.TableList.plot>` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. You can use the :meth:`plot() <camelot.plotting.plot>` method to generate a `matplotlib <https://matplotlib.org/>`_ plot of various elements that were detected on the PDF page while processing it. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters.
The following geometries are available for plotting. You can pass them to the :meth:`plot() <camelot.core.TableList.plot>` method, which will then generate a `matplotlib <https://matplotlib.org/>`_ plot for the passed geometry. You can specify the type of element you want to plot using the ``plot_type`` keyword argument. The generated plot can be saved to a file by passing a ``filepath`` keyword argument. The following plot types are supported:
- 'text' - 'text'
- 'table' - 'table'
@ -40,9 +40,9 @@ The following geometries are available for plotting. You can pass them to the :m
- 'line' - 'line'
- 'joint' - 'joint'
.. note:: The last three geometries can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``. .. note:: The last three plot types can only be used with :ref:`Lattice <lattice>`, i.e. when ``flavor='lattice'``.
Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out.
:: ::
@ -59,7 +59,8 @@ Let's plot all the text present on the table's PDF page.
:: ::
>>> tables[0].plot('text') >>> camelot.plot(tables[0], plot_type='text')
>>> plt.show()
.. figure:: ../_static/png/geometry_text.png .. figure:: ../_static/png/geometry_text.png
:height: 674 :height: 674
@ -77,11 +78,12 @@ This, as we shall later see, is very helpful with :ref:`Stream <stream>` for not
table table
^^^^^ ^^^^^
Let's plot the table (to see if it was detected correctly or not). This geometry type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.) Let's plot the table (to see if it was detected correctly or not). This plot type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.)
:: ::
>>> tables[0].plot('table') >>> camelot.plot(tables[0], plot_type='table')
>>> plt.show()
.. figure:: ../_static/png/geometry_table.png .. figure:: ../_static/png/geometry_table.png
:height: 674 :height: 674
@ -101,7 +103,8 @@ Now, let's plot all table boundaries present on the table's PDF page.
:: ::
>>> tables[0].plot('contour') >>> camelot.plot(tables[0], plot_type='contour')
>>> plt.show()
.. figure:: ../_static/png/geometry_contour.png .. figure:: ../_static/png/geometry_contour.png
:height: 674 :height: 674
@ -119,7 +122,8 @@ Cool, let's plot all line segments present on the table's PDF page.
:: ::
>>> tables[0].plot('line') >>> camelot.plot(tables[0], plot_type='line')
>>> plt.show()
.. figure:: ../_static/png/geometry_line.png .. figure:: ../_static/png/geometry_line.png
:height: 674 :height: 674
@ -137,7 +141,8 @@ Finally, let's plot all line intersections present on the table's PDF page.
:: ::
>>> tables[0].plot('joint') >>> camelot.plot(tables[0], plot_type='joint')
>>> plt.show()
.. figure:: ../_static/png/geometry_joint.png .. figure:: ../_static/png/geometry_joint.png
:height: 674 :height: 674

View File

@ -2,5 +2,5 @@
test=pytest test=pytest
[tool:pytest] [tool:pytest]
addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot tests addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests
python_files = tests/test_*.py python_files = tests/test_*.py

View File

@ -32,7 +32,8 @@ dev_requires = [
'pytest>=3.8.0', 'pytest>=3.8.0',
'pytest-cov>=2.6.0', 'pytest-cov>=2.6.0',
'pytest-runner>=4.2', 'pytest-runner>=4.2',
'Sphinx>=1.7.9' 'Sphinx>=1.7.9',
'pytest-mpl>=0.10'
] ]
dev_requires = dev_requires + all_requires dev_requires = dev_requires + all_requires

View File

@ -0,0 +1,2 @@
import matplotlib
matplotlib.use('agg')

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

View File

@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
import os
import pytest
import camelot
testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_text_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], plot_type='text')
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_table_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], plot_type='table')
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_contour_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], plot_type='contour')
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_line_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], plot_type='line')
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_joint_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
return camelot.plot(tables[0], plot_type='joint')