diff --git a/Makefile b/Makefile index a4bea7d..d0b54b0 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ install: pip install ".[dev]" test: - pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot tests + pytest --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests docs: cd docs && make html @@ -25,4 +25,4 @@ publish: pip install twine python setup.py sdist twine upload dist/* - rm -fr build dist .egg camelot_py.egg-info \ No newline at end of file + rm -fr build dist .egg camelot_py.egg-info diff --git a/camelot/__init__.py b/camelot/__init__.py index 364cd72..d8a41b9 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -6,6 +6,7 @@ from click import HelpFormatter from .__version__ import __version__ from .io import read_pdf +from .plotting import plot def _write_usage(self, prog, args='', prefix='Usage: '): diff --git a/camelot/cli.py b/camelot/cli.py index e30b204..8d67df7 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -3,9 +3,11 @@ import logging import click +import matplotlib.pyplot as plt from . 
import __version__ from .io import read_pdf +from .plotting import plot logger = logging.getLogger('camelot') @@ -81,7 +83,7 @@ def cli(ctx, *args, **kwargs): help='Number of times for erosion/dilation will be applied.') @click.option('-plot', '--plot_type', type=click.Choice(['text', 'table', 'contour', 'joint', 'line']), - help='Plot geometry found on PDF page, for debugging.') + help='Plot elements found on PDF page for visual debugging.') @click.argument('filepath', type=click.Path(exists=True)) @pass_config def lattice(c, *args, **kwargs): @@ -107,7 +109,8 @@ def lattice(c, *args, **kwargs): click.echo('Found {} tables'.format(tables.n)) if plot_type is not None: for table in tables: - table.plot(plot_type) + plot(table, plot_type=plot_type) + plt.show() else: if output is None: raise click.UsageError('Please specify output file path using --output') @@ -128,7 +131,7 @@ def lattice(c, *args, **kwargs): ' used to combine text horizontally, to generate columns.') @click.option('-plot', '--plot_type', type=click.Choice(['text', 'table']), - help='Plot geometry found on PDF page for debugging.') + help='Plot elements found on PDF page for visual debugging.') @click.argument('filepath', type=click.Path(exists=True)) @pass_config def stream(c, *args, **kwargs): @@ -153,7 +156,8 @@ def stream(c, *args, **kwargs): click.echo('Found {} tables'.format(tables.n)) if plot_type is not None: for table in tables: - table.plot(plot_type) + plot(table, plot_type=plot_type) + plt.show() else: if output is None: raise click.UsageError('Please specify output file path using --output') diff --git a/camelot/core.py b/camelot/core.py index d6eb3d7..45b316b 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -7,8 +7,6 @@ import tempfile import numpy as np import pandas as pd -from .plotting import * - class Cell(object): """Defines a cell in a table with coordinates relative to a @@ -321,33 +319,6 @@ class Table(object): cell.hspan = True return self - def plot(self, 
geometry_type): - """Plot geometry found on PDF page based on geometry_type - specified, useful for debugging and playing with different - parameters to get the best output. - - Parameters - ---------- - geometry_type : str - The geometry type for which a plot should be generated. - Can be 'text', 'table', 'contour', 'joint', 'line' - - """ - if self.flavor == 'stream' and geometry_type in ['contour', 'joint', 'line']: - raise NotImplementedError("{} cannot be plotted with flavor='stream'".format( - geometry_type)) - - if geometry_type == 'text': - plot_text(self._text) - elif geometry_type == 'table': - plot_table(self) - elif geometry_type == 'contour': - plot_contour(self._image) - elif geometry_type == 'joint': - plot_joint(self._image) - elif geometry_type == 'line': - plot_line(self._segments) - def to_csv(self, path, **kwargs): """Writes Table to a comma-separated values (csv) file. diff --git a/camelot/handlers.py b/camelot/handlers.py index b6dc65c..47070a1 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -141,9 +141,6 @@ class PDFHandler(object): ------- tables : camelot.core.TableList List of tables found in PDF. - geometry : camelot.core.GeometryList - List of geometry objects (contours, lines, joints) found - in PDF. """ tables = [] diff --git a/camelot/plotting.py b/camelot/plotting.py index bef06f2..73d5b37 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -1,15 +1,59 @@ -import cv2 +# -*- coding: utf-8 -*- + import matplotlib.pyplot as plt import matplotlib.patches as patches +def plot(table, plot_type='text', filepath=None): + """Plot elements found on PDF page based on plot_type + specified, useful for debugging and playing with different + parameters to get the best output. + + Parameters + ---------- + table: Table + A Camelot Table. + plot_type : str, optional (default: 'text') + {'text', 'table', 'contour', 'joint', 'line'} + The element type for which a plot should be generated. 
+ filepath: str, optional (default: None) + Absolute path for saving the generated plot. + + Returns + ------- + fig : matplotlib.fig.Figure + + """ + if table.flavor == 'stream' and plot_type in ['contour', 'joint', 'line']: + raise NotImplementedError("{} cannot be plotted with flavor='stream'".format( + plot_type)) + if plot_type == 'text': + fig = plot_text(table._text) + elif plot_type == 'table': + fig = plot_table(table) + elif plot_type == 'contour': + fig = plot_contour(table._image) + elif plot_type == 'joint': + fig = plot_joint(table._image) + elif plot_type == 'line': + fig = plot_line(table._segments) + if filepath: + plt.savefig(filepath) + return fig + + def plot_text(text): - """Generates a plot for all text present on the PDF page. + """Generates a plot for all text elements present + on the PDF page. Parameters ---------- text : list + Returns + ------- + fig : matplotlib.fig.Figure + """ fig = plt.figure() ax = fig.add_subplot(111, aspect='equal') @@ -26,83 +70,116 @@ def plot_text(text): ) ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10) - plt.show() + return fig def plot_table(table): - """Generates a plot for the table. + """Generates a plot for the detected tables + on the PDF page. 
Parameters ---------- table : camelot.core.Table + Returns + ------- + fig : matplotlib.fig.Figure + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') for row in table.cells: for cell in row: if cell.left: - plt.plot([cell.lb[0], cell.lt[0]], + ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]]) if cell.right: - plt.plot([cell.rb[0], cell.rt[0]], + ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]]) if cell.top: - plt.plot([cell.lt[0], cell.rt[0]], + ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) if cell.bottom: - plt.plot([cell.lb[0], cell.rb[0]], + ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) - plt.show() + return fig def plot_contour(image): - """Generates a plot for all table boundaries present on the - PDF page. + """Generates a plot for all table boundaries present + on the PDF page. Parameters ---------- image : tuple + Returns + ------- + fig : matplotlib.fig.Figure + """ img, table_bbox = image + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') for t in table_bbox.keys(): - cv2.rectangle(img, (t[0], t[1]), - (t[2], t[3]), (255, 0, 0), 20) - plt.imshow(img) - plt.show() + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1], + fill=None, + edgecolor='red' + ) + ) + ax.imshow(img) + return fig def plot_joint(image): - """Generates a plot for all line intersections present on the - PDF page. + """Generates a plot for all line intersections present + on the PDF page. 
Parameters ---------- image : tuple + Returns + ------- + fig : matplotlib.fig.Figure + """ img, table_bbox = image + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') x_coord = [] y_coord = [] for k in table_bbox.keys(): for coord in table_bbox[k]: x_coord.append(coord[0]) y_coord.append(coord[1]) - plt.plot(x_coord, y_coord, 'ro') - plt.imshow(img) - plt.show() + ax.plot(x_coord, y_coord, 'ro') + ax.imshow(img) + return fig def plot_line(segments): - """Generates a plot for all line segments present on the PDF page. + """Generates a plot for all line segments present + on the PDF page. Parameters ---------- segments : tuple + Returns + ------- + fig : matplotlib.fig.Figure + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') vertical, horizontal = segments for v in vertical: - plt.plot([v[0], v[2]], [v[1], v[3]]) + ax.plot([v[0], v[2]], [v[1], v[3]]) for h in horizontal: - plt.plot([h[0], h[2]], [h[1], h[3]]) - plt.show() + ax.plot([h[0], h[2]], [h[1], h[3]]) + return fig diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index e697949..7d6b349 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -27,12 +27,12 @@ To process background lines, you can pass ``process_background=True``. .. csv-table:: :file: ../_static/csv/background_lines.csv -Plot geometry -------------- +Visual debugging +---------------- -You can use a :class:`table ` object's :meth:`plot() ` method to plot various geometries that were detected by Camelot while processing the PDF page. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. +You can use the :meth:`plot() ` method to generate a `matplotlib `_ plot of various elements that were detected on the PDF page while processing it. This can help you select table areas, column separators and debug bad table outputs, by tweaking different configuration parameters. 
-The following geometries are available for plotting. You can pass them to the :meth:`plot() ` method, which will then generate a `matplotlib `_ plot for the passed geometry. +You can specify the type of element you want to plot using the ``plot_type`` keyword argument. The generated plot can be saved to a file by passing a ``filepath`` keyword argument. The following plot types are supported: - 'text' - 'table' @@ -40,9 +40,9 @@ The following geometries are available for plotting. You can pass them to the :m - 'line' - 'joint' -.. note:: The last three geometries can only be used with :ref:`Lattice `, i.e. when ``flavor='lattice'``. +.. note:: The last three plot types can only be used with :ref:`Lattice `, i.e. when ``flavor='lattice'``. -Let's generate a plot for each geometry using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. +Let's generate a plot for each type using this `PDF <../_static/pdf/foo.pdf>`__ as an example. First, let's get all the tables out. :: @@ -59,7 +59,8 @@ Let's plot all the text present on the table's PDF page. :: - >>> tables[0].plot('text') + >>> camelot.plot(tables[0], plot_type='text') + >>> plt.show() .. figure:: ../_static/png/geometry_text.png :height: 674 @@ -77,11 +78,12 @@ This, as we shall later see, is very helpful with :ref:`Stream ` for not table ^^^^^ -Let's plot the table (to see if it was detected correctly or not). This geometry type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.) +Let's plot the table (to see if it was detected correctly or not). This plot type, along with contour, line and joint is useful for debugging and improving the extraction output, in case the table wasn't detected correctly. (More on that later.) :: - >>> tables[0].plot('table') + >>> camelot.plot(tables[0], plot_type='table') + >>> plt.show() ..
figure:: ../_static/png/geometry_table.png :height: 674 @@ -101,7 +103,8 @@ Now, let's plot all table boundaries present on the table's PDF page. :: - >>> tables[0].plot('contour') + >>> camelot.plot(tables[0], plot_type='contour') + >>> plt.show() .. figure:: ../_static/png/geometry_contour.png :height: 674 @@ -119,7 +122,8 @@ Cool, let's plot all line segments present on the table's PDF page. :: - >>> tables[0].plot('line') + >>> camelot.plot(tables[0], plot_type='line') + >>> plt.show() .. figure:: ../_static/png/geometry_line.png :height: 674 @@ -137,7 +141,8 @@ Finally, let's plot all line intersections present on the table's PDF page. :: - >>> tables[0].plot('joint') + >>> camelot.plot(tables[0], plot_type='joint') + >>> plt.show() .. figure:: ../_static/png/geometry_joint.png :height: 674 diff --git a/setup.cfg b/setup.cfg index 1b48058..1a59858 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ test=pytest [tool:pytest] -addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot tests -python_files = tests/test_*.py \ No newline at end of file +addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl tests +python_files = tests/test_*.py diff --git a/setup.py b/setup.py index e727706..8a1bcf6 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,8 @@ dev_requires = [ 'pytest>=3.8.0', 'pytest-cov>=2.6.0', 'pytest-runner>=4.2', - 'Sphinx>=1.7.9' + 'Sphinx>=1.7.9', + 'pytest-mpl>=0.10' ] dev_requires = dev_requires + all_requires diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..a946ff7 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +import matplotlib +matplotlib.use('agg') diff --git a/tests/files/baseline_plots/test_contour_plot.png b/tests/files/baseline_plots/test_contour_plot.png new file mode 100644 index 0000000..57b3962 Binary files /dev/null and b/tests/files/baseline_plots/test_contour_plot.png differ diff --git 
a/tests/files/baseline_plots/test_joint_plot.png b/tests/files/baseline_plots/test_joint_plot.png new file mode 100644 index 0000000..934aa74 Binary files /dev/null and b/tests/files/baseline_plots/test_joint_plot.png differ diff --git a/tests/files/baseline_plots/test_line_plot.png b/tests/files/baseline_plots/test_line_plot.png new file mode 100644 index 0000000..a7ac276 Binary files /dev/null and b/tests/files/baseline_plots/test_line_plot.png differ diff --git a/tests/files/baseline_plots/test_table_plot.png b/tests/files/baseline_plots/test_table_plot.png new file mode 100644 index 0000000..d60f69f Binary files /dev/null and b/tests/files/baseline_plots/test_table_plot.png differ diff --git a/tests/files/baseline_plots/test_text_plot.png b/tests/files/baseline_plots/test_text_plot.png new file mode 100644 index 0000000..8cc3825 Binary files /dev/null and b/tests/files/baseline_plots/test_text_plot.png differ diff --git a/tests/test_plotting.py b/tests/test_plotting.py new file mode 100644 index 0000000..e01cac6 --- /dev/null +++ b/tests/test_plotting.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +import os + +import pytest + +import camelot + + +testdir = os.path.dirname(os.path.abspath(__file__)) +testdir = os.path.join(testdir, "files") + + +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_text_plot(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename) + return camelot.plot(tables[0], plot_type='text') + + +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_table_plot(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename) + return camelot.plot(tables[0], plot_type='table') + + +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_contour_plot(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename) + return 
camelot.plot(tables[0], plot_type='contour') + + +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_line_plot(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename) + return camelot.plot(tables[0], plot_type='line') + + +@pytest.mark.mpl_image_compare( + baseline_dir="files/baseline_plots", remove_text=True) +def test_joint_plot(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename) + return camelot.plot(tables[0], plot_type='joint')