diff --git a/README.md b/README.md index 132cd78..93b7215 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ $ conda install -c conda-forge camelot-py After [installing the dependencies](https://camelot-py.readthedocs.io/en/master/user/install.html#using-pip) ([tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/)), you can simply use pip to install Camelot:
-$ pip install camelot-py[all] +$ pip install camelot-py[cv]### From the source code @@ -87,7 +87,7 @@ and install Camelot using pip:
$ cd camelot -$ pip install ".[all]" +$ pip install ".[cv]"## Documentation diff --git a/camelot/__init__.py b/camelot/__init__.py index d8a41b9..68815f2 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -6,7 +6,7 @@ from click import HelpFormatter from .__version__ import __version__ from .io import read_pdf -from .plotting import plot +from .plotting import PlotMethods def _write_usage(self, prog, args='', prefix='Usage: '): @@ -26,3 +26,6 @@ handler = logging.StreamHandler() handler.setFormatter(formatter) logger.addHandler(handler) + +# instantiate plot method +plot = PlotMethods() diff --git a/camelot/cli.py b/camelot/cli.py index 8d67df7..eaae955 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -3,11 +3,14 @@ import logging import click -import matplotlib.pyplot as plt +try: + import matplotlib.pyplot as plt +except ImportError: + _HAS_MPL = False +else: + _HAS_MPL = True -from . import __version__ -from .io import read_pdf -from .plotting import plot +from . import __version__, read_pdf, plot logger = logging.getLogger('camelot') @@ -82,7 +85,7 @@ def cli(ctx, *args, **kwargs): @click.option('-I', '--iterations', default=0, help='Number of times for erosion/dilation will be applied.') @click.option('-plot', '--plot_type', - type=click.Choice(['text', 'table', 'contour', 'joint', 'line']), + type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']), help='Plot elements found on PDF page for visual debugging.') @click.argument('filepath', type=click.Path(exists=True)) @pass_config @@ -104,18 +107,23 @@ def lattice(c, *args, **kwargs): kwargs['copy_text'] = None if not copy_text else copy_text kwargs['shift_text'] = list(kwargs['shift_text']) - tables = read_pdf(filepath, pages=pages, flavor='lattice', - suppress_warnings=suppress_warnings, **kwargs) - click.echo('Found {} tables'.format(tables.n)) if plot_type is not None: - for table in tables: - plot(table, plot_type=plot_type) - plt.show() + if not _HAS_MPL: + raise 
ImportError('matplotlib is required for plotting.') else: if output is None: raise click.UsageError('Please specify output file path using --output') if f is None: raise click.UsageError('Please specify output file format using --format') + + tables = read_pdf(filepath, pages=pages, flavor='lattice', + suppress_warnings=suppress_warnings, **kwargs) + click.echo('Found {} tables'.format(tables.n)) + if plot_type is not None: + for table in tables: + plot(table, kind=plot_type) + plt.show() + else: tables.export(output, f=f, compress=compress) @@ -130,7 +138,7 @@ def lattice(c, *args, **kwargs): @click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' ' used to combine text horizontally, to generate columns.') @click.option('-plot', '--plot_type', - type=click.Choice(['text', 'table']), + type=click.Choice(['text', 'grid']), help='Plot elements found on PDF page for visual debugging.') @click.argument('filepath', type=click.Path(exists=True)) @pass_config @@ -151,16 +159,21 @@ def stream(c, *args, **kwargs): columns = list(kwargs['columns']) kwargs['columns'] = None if not columns else columns - tables = read_pdf(filepath, pages=pages, flavor='stream', - suppress_warnings=suppress_warnings, **kwargs) - click.echo('Found {} tables'.format(tables.n)) if plot_type is not None: - for table in tables: - plot(table, plot_type=plot_type) - plt.show() + if not _HAS_MPL: + raise ImportError('matplotlib is required for plotting.') else: if output is None: raise click.UsageError('Please specify output file path using --output') if f is None: raise click.UsageError('Please specify output file format using --format') + + tables = read_pdf(filepath, pages=pages, flavor='stream', + suppress_warnings=suppress_warnings, **kwargs) + click.echo('Found {} tables'.format(tables.n)) + if plot_type is not None: + for table in tables: + plot(table, kind=plot_type) + plt.show() + else: tables.export(output, f=f, compress=compress) diff --git a/camelot/plotting.py 
b/camelot/plotting.py index 73d5b37..3b91cee 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -1,185 +1,179 @@ # -*- coding: utf-8 -*- -import matplotlib.pyplot as plt -import matplotlib.patches as patches +try: + import matplotlib.pyplot as plt + import matplotlib.patches as patches +except ImportError: + _HAS_MPL = False +else: + _HAS_MPL = True -def plot(table, plot_type='text', filepath=None): - """Plot elements found on PDF page based on plot_type - specified, useful for debugging and playing with different - parameters to get the best output. +class PlotMethods(object): + def __call__(self, table, kind='text', filename=None): + """Plot elements found on PDF page based on kind + specified, useful for debugging and playing with different + parameters to get the best output. - Parameters - ---------- - table: Table - A Camelot Table. - plot_type : str, optional (default: 'text') - {'text', 'table', 'contour', 'joint', 'line'} - The element type for which a plot should be generated. - filepath: str, optional (default: None) - Absolute path for saving the generated plot. + Parameters + ---------- + table: camelot.core.Table + A Camelot Table. + kind : str, optional (default: 'text') + {'text', 'grid', 'contour', 'joint', 'line'} + The element type for which a plot should be generated. + filename: str, optional (default: None) + Absolute path for saving the generated plot. 
- Returns - ------- - fig : matplotlib.fig.Figure + Returns + ------- + fig : matplotlib.figure.Figure - """ - if table.flavor == 'stream' and plot_type in ['contour', 'joint', 'line']: - raise NotImplementedError("{} cannot be plotted with flavor='stream'".format( - plot_type)) - if plot_type == 'text': - fig = plot_text(table._text) - elif plot_type == 'table': - fig = plot_table(table) - elif plot_type == 'contour': - fig = plot_contour(table._image) - elif plot_type == 'joint': - fig = plot_joint(table._image) - elif plot_type == 'line': - fig = plot_line(table._segments) - if filepath: - plt.savefig(filepath) - return fig + """ + if not _HAS_MPL: + raise ImportError('matplotlib is required for plotting.') + if table.flavor == 'stream' and kind in ['contour', 'joint', 'line']: + raise NotImplementedError("Stream flavor does not support kind='{}'".format( + kind)) -def plot_text(text): - """Generates a plot for all text elements present - on the PDF page. + plot_method = getattr(self, kind) + return plot_method(table) - Parameters - ---------- - text : list + def text(self, table): + """Generates a plot for all text elements present + on the PDF page. 
- Returns - ------- - fig : matplotlib.fig.Figure + Parameters + ---------- + table : camelot.core.Table - """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - xs, ys = [], [] - for t in text: - xs.extend([t[0], t[2]]) - ys.extend([t[1], t[3]]) - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1] + Returns + ------- + fig : matplotlib.figure.Figure + + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + xs, ys = [], [] + for t in table._text: + xs.extend([t[0], t[2]]) + ys.extend([t[1], t[3]]) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1] + ) ) - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - return fig + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + return fig + def grid(self, table): + """Generates a plot for the detected table grids + on the PDF page. -def plot_table(table): - """Generates a plot for the detected tables - on the PDF page. 
+ Parameters + ---------- + table : camelot.core.Table - Parameters - ---------- - table : camelot.core.Table + Returns + ------- + fig : matplotlib.figure.Figure - Returns - ------- - fig : matplotlib.fig.Figure + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + for row in table.cells: + for cell in row: + if cell.left: + ax.plot([cell.lb[0], cell.lt[0]], + [cell.lb[1], cell.lt[1]]) + if cell.right: + ax.plot([cell.rb[0], cell.rt[0]], + [cell.rb[1], cell.rt[1]]) + if cell.top: + ax.plot([cell.lt[0], cell.rt[0]], + [cell.lt[1], cell.rt[1]]) + if cell.bottom: + ax.plot([cell.lb[0], cell.rb[0]], + [cell.lb[1], cell.rb[1]]) + return fig - """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - for row in table.cells: - for cell in row: - if cell.left: - ax.plot([cell.lb[0], cell.lt[0]], - [cell.lb[1], cell.lt[1]]) - if cell.right: - ax.plot([cell.rb[0], cell.rt[0]], - [cell.rb[1], cell.rt[1]]) - if cell.top: - ax.plot([cell.lt[0], cell.rt[0]], - [cell.lt[1], cell.rt[1]]) - if cell.bottom: - ax.plot([cell.lb[0], cell.rb[0]], - [cell.lb[1], cell.rb[1]]) - return fig + def contour(self, table): + """Generates a plot for all table boundaries present + on the PDF page. + Parameters + ---------- + table : camelot.core.Table -def plot_contour(image): - """Generates a plot for all table boundaries present - on the PDF page. 
+ Returns + ------- + fig : matplotlib.figure.Figure - Parameters - ---------- - image : tuple - - Returns - ------- - fig : matplotlib.fig.Figure - - """ - img, table_bbox = image - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - for t in table_bbox.keys(): - ax.add_patch( - patches.Rectangle( - (t[0], t[1]), - t[2] - t[0], - t[3] - t[1], - fill=None, - edgecolor='red' + """ + img, table_bbox = table._image + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + for t in table_bbox.keys(): + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1], + fill=None, + edgecolor='red' + ) ) - ) - ax.imshow(img) - return fig + ax.imshow(img) + return fig + def joint(self, table): + """Generates a plot for all line intersections present + on the PDF page. -def plot_joint(image): - """Generates a plot for all line intersections present - on the PDF page. + Parameters + ---------- + table : camelot.core.Table - Parameters - ---------- - image : tuple + Returns + ------- + fig : matplotlib.figure.Figure - Returns - ------- - fig : matplotlib.fig.Figure + """ + img, table_bbox = table._image + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + x_coord = [] + y_coord = [] + for k in table_bbox.keys(): + for coord in table_bbox[k]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) + ax.plot(x_coord, y_coord, 'ro') + ax.imshow(img) + return fig - """ - img, table_bbox = image - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - x_coord = [] - y_coord = [] - for k in table_bbox.keys(): - for coord in table_bbox[k]: - x_coord.append(coord[0]) - y_coord.append(coord[1]) - ax.plot(x_coord, y_coord, 'ro') - ax.imshow(img) - return fig + def line(self, table): + """Generates a plot for all line segments present + on the PDF page. + Parameters + ---------- + table : camelot.core.Table -def plot_line(segments): - """Generates a plot for all line segments present - on the PDF page. 
+ Returns + ------- + fig : matplotlib.figure.Figure - Parameters - ---------- - segments : tuple - - Returns - ------- - fig : matplotlib.fig.Figure - - """ - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - vertical, horizontal = segments - for v in vertical: - ax.plot([v[0], v[2]], [v[1], v[3]]) - for h in horizontal: - ax.plot([h[0], h[2]], [h[1], h[3]]) - return fig + """ + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + vertical, horizontal = table._segments + for v in vertical: + ax.plot([v[0], v[2]], [v[1], v[3]]) + for h in horizontal: + ax.plot([h[0], h[2]], [h[1], h[3]]) + return fig diff --git a/docs/_static/png/geometry_contour.png b/docs/_static/png/plot_contour.png similarity index 100% rename from docs/_static/png/geometry_contour.png rename to docs/_static/png/plot_contour.png diff --git a/docs/_static/png/geometry_joint.png b/docs/_static/png/plot_joint.png similarity index 100% rename from docs/_static/png/geometry_joint.png rename to docs/_static/png/plot_joint.png diff --git a/docs/_static/png/geometry_line.png b/docs/_static/png/plot_line.png similarity index 100% rename from docs/_static/png/geometry_line.png rename to docs/_static/png/plot_line.png diff --git a/docs/_static/png/geometry_table.png b/docs/_static/png/plot_table.png similarity index 100% rename from docs/_static/png/geometry_table.png rename to docs/_static/png/plot_table.png diff --git a/docs/_static/png/geometry_text.png b/docs/_static/png/plot_text.png similarity index 100% rename from docs/_static/png/geometry_text.png rename to docs/_static/png/plot_text.png diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 7d6b349..4d5c4de 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -30,12 +30,14 @@ To process background lines, you can pass ``process_background=True``. Visual debugging ---------------- -You can use the :meth:`plot()