# -*- coding: utf-8 -*-
from pprint import pprint

import click

from .io import read_pdf
from .plotting import plot_geometry
from .utils import validate_input, remove_extra


class Mutex(click.Option):
    """Option that rejects parser-specific flags used with the wrong parser.

    Before normal click processing, the options supplied on the command
    line are passed to ``validate_input`` together with the values of
    ``--mesh`` and ``--geometry_type``.
    """

    def handle_parse_result(self, ctx, opts, args):
        mesh = opts.get('mesh', False)
        geometry_type = opts.get('geometry_type', False)
        try:
            validate_input(opts, mesh=mesh, geometry_type=geometry_type)
        except ValueError as e:
            # Surface the conflict as a regular CLI usage error instead of
            # letting a raw traceback reach the user.
            raise click.UsageError(str(e), ctx=ctx)
        return super(Mutex, self).handle_parse_result(ctx, opts, args)


@click.command()
@click.option("-p", "--pages", default="1", help="")
@click.option("-o", "--output", help="")
@click.option("-f", "--format",
              type=click.Choice(["csv", "json", "excel", "html"]), help="")
@click.option("-z", "--zip", is_flag=True, help="")
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to "
              "use Lattice method of parsing. Stream is used by default.")
@click.option("-G", "--geometry_type",
              type=click.Choice(["text", "table", "contour", "joint", "line"]),
              help="Plot geometry found on pdf page for debugging.")
@click.option("-T", "--table_area", default=[], multiple=True,
              help="")
@click.option("-split", "--split_text", is_flag=True, help="")
@click.option("-flag", "--flag_size", is_flag=True, help="")
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
              help="")
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
              help="")
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="")
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="")
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
              help="Use with --mesh")
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
              help="Use with --mesh")
@click.option("-copy", "--copy_text", default=[], cls=Mutex,
              help="Use with --mesh")
@click.option("-shift", "--shift_text", default=["l", "t"], cls=Mutex,
              help="Use with --mesh")
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
              help="Use with --mesh")
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
              help="Use with --mesh")
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
              help="Use with --mesh")
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
              help="Use with --mesh")
@click.option("-I", "--iterations", default=0, cls=Mutex,
              help="Use with --mesh")
@click.argument("filepath", type=click.Path(exists=True))
def cli(*args, **kwargs):
    """Extract tables from a PDF, or plot its geometry for debugging."""
    # Options consumed by the command itself; everything left in kwargs is
    # forwarded to read_pdf / plot_geometry.
    pages = kwargs.pop("pages")
    output = kwargs.pop("output")
    f = kwargs.pop("format")
    compress = kwargs.pop("zip")
    mesh = kwargs.pop("mesh")
    geometry_type = kwargs.pop("geometry_type")
    filepath = kwargs.pop("filepath")

    # Unused ``multiple`` options arrive empty; forward None in that case.
    table_area = list(kwargs['table_area'])
    kwargs['table_area'] = table_area if table_area else None
    columns = list(kwargs['columns'])
    kwargs['columns'] = columns if columns else None

    # Drop the options that belong to the parser which is not in use.
    kwargs = remove_extra(kwargs, mesh=mesh)

    if geometry_type is not None:
        plot_geometry(filepath, pages=pages, mesh=mesh,
                      geometry_type=geometry_type, **kwargs)
        return

    # Check the export arguments before parsing so a missing flag fails
    # immediately instead of after the whole document has been processed.
    if output is None:
        raise click.UsageError("Please specify an output filepath using --output")
    if f is None:
        raise click.UsageError("Please specify an output format using --format")

    tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
    click.echo(tables)
    tables.export(output, f=f, compress=compress)
+ validate_input(kwargs, mesh=mesh) p = PDFHandler(filepath, pages) + kwargs = remove_extra(kwargs, mesh=mesh) tables, __ = p.parse(mesh=mesh, **kwargs) return tables \ No newline at end of file diff --git a/camelot/plotting.py b/camelot/plotting.py index 2d0bb3c..6012217 100644 --- a/camelot/plotting.py +++ b/camelot/plotting.py @@ -3,9 +3,10 @@ import matplotlib.pyplot as plt import matplotlib.patches as patches from .handlers import PDFHandler +from .utils import validate_input, remove_extra -def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs): +def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs): """Plot geometry found on pdf page based on type specified, useful for debugging and playing with different parameters to get the best output. @@ -23,7 +24,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg mesh : bool (default: False) Whether or not to use Lattice method of parsing. Stream is used by default. - geometry_type : str, optional (default: 'text') + geometry_type : str, optional (default: None) 'text' : Plot text objects found on page, useful to get table_area and columns coordinates. 'table' : Plot parsed table. @@ -91,15 +92,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg PDFMiner margins. (char_margin, line_margin, word_margin) For for information, refer `PDFMiner docs `_. - debug : bool, optional (default: False) - Whether or not to return all text objects on the page - which can be used to generate a matplotlib plot, to get - values for table_area(s) and debugging. """ - # validate kwargs? 
# Keyword arguments accepted only when mesh is False.
stream_kwargs = [
    'columns',
    'row_close_tol',
    'col_close_tol'
]
# Keyword arguments accepted only when mesh is True.
lattice_kwargs = [
    'process_background',
    'line_size_scaling',
    'copy_text',
    'shift_text',
    'line_close_tol',
    'joint_close_tol',
    'threshold_blocksize',
    'threshold_constant',
    'iterations'
]


def validate_input(kwargs, mesh=False, geometry_type=False):
    """Reject keyword arguments that do not match the selected parser.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments supplied by the caller; only the keys are
        inspected.
    mesh : bool (default: False)
        When truthy, keys listed in ``stream_kwargs`` are rejected;
        otherwise keys listed in ``lattice_kwargs`` are rejected.
    geometry_type : str or bool, optional (default: False)
        When truthy, 'contour', 'joint' and 'line' additionally require
        ``mesh`` to be truthy.

    Raises
    ------
    ValueError
        If a conflicting key is present, or if ``geometry_type`` needs
        ``mesh`` and it is not set.

    """
    def check_intersection(parser_kwargs, input_kwargs, message_bool):
        isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
        if isec:
            # Sort the names so the message is deterministic.
            raise ValueError("{} can not be used with mesh set to {}".format(
                ",".join(sorted(isec)), message_bool))

    if mesh:
        check_intersection(stream_kwargs, kwargs, True)
    else:
        check_intersection(lattice_kwargs, kwargs, False)
    if geometry_type:
        if not mesh and geometry_type in ['contour', 'joint', 'line']:
            raise ValueError("Use geometry_type={} with mesh set to True".format(
                geometry_type))


def remove_extra(kwargs, mesh=False):
    """Remove the keyword arguments of the parser that is not in use.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to filter. The dict is modified in place.
    mesh : bool (default: False)
        When truthy, keys listed in ``stream_kwargs`` are removed;
        otherwise keys listed in ``lattice_kwargs`` are removed.

    Returns
    -------
    kwargs : dict
        The same dict object that was passed in.

    """
    unwanted = stream_kwargs if mesh else lattice_kwargs
    # Pop by known key rather than while iterating the dict itself:
    # deleting entries during iteration over a dict raises RuntimeError
    # on Python 3.
    for key in unwanted:
        kwargs.pop(key, None)
    return kwargs