diff --git a/camelot/cli.py b/camelot/cli.py
index 302830e..61cf3d8 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -1 +1,84 @@
-import click
\ No newline at end of file
+# -*- coding: utf-8 -*-
+from pprint import pprint
+
+import click
+
+from .io import read_pdf
+from .plotting import plot_geometry
+from .utils import validate_input, remove_extra
+
+
+class Mutex(click.Option):
+    """click option that checks parser/flag compatibility while parsing.
+
+    Before deferring to ``click.Option``, the parsed options are handed to
+    ``validate_input`` together with the ``mesh`` and ``geometry_type``
+    values found among them (``False`` when either one is absent).
+    """
+
+    def handle_parse_result(self, ctx, opts, args):
+        """Validate ``opts``, then delegate to the parent implementation."""
+        validate_input(opts,
+                       mesh=opts.get('mesh', False),
+                       geometry_type=opts.get('geometry_type', False))
+        return super(Mutex, self).handle_parse_result(ctx, opts, args)
+
+
+# NOTE: ``cli`` is documented with comments on purpose -- click uses a command
+# function's docstring as the text printed by ``--help``, so adding a docstring
+# here would change the program's output.
+#
+# Flow:
+#   1. Pop the options that drive the CLI itself (pages, output, format, zip,
+#      mesh, geometry_type, filepath); everything left in ``kwargs`` is
+#      forwarded to ``read_pdf`` / ``plot_geometry``.
+#   2. Replace empty ``table_area`` / ``columns`` selections with None.
+#   3. Drop the kwargs of the parser that is not in use (``remove_extra``).
+#   4. With --geometry_type: plot the requested geometry.
+#      Without it: check --output / --format, parse, echo and export.
+@click.command()
+@click.option("-p", "--pages", default="1", help="")
+@click.option("-o", "--output", help="")
+@click.option("-f", "--format",
+              type=click.Choice(["csv", "json", "excel", "html"]), help="")
+@click.option("-z", "--zip", is_flag=True, help="")
+@click.option("-m", "--mesh", is_flag=True, help="Whether or not to "
+              "use Lattice method of parsing. Stream is used by default.")
+@click.option("-G", "--geometry_type",
+              type=click.Choice(["text", "table", "contour", "joint", "line"]),
+              help="Plot geometry found on pdf page for debugging.")
+@click.option("-T", "--table_area", default=[], multiple=True,
+              help="")
+@click.option("-split", "--split_text", is_flag=True, help="")
+@click.option("-flag", "--flag_size", is_flag=True, help="")
+@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
+              help="")
+@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
+              help="")
+@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="")
+@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="")
+@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
+              help="Use with --mesh")
+@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
+              help="Use with --mesh")
+@click.option("-copy", "--copy_text", default=[], cls=Mutex,
+              help="Use with --mesh")
+@click.option("-shift", "--shift_text", default=["l", "t"], cls=Mutex,
+              help="Use with --mesh")
+@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
+              help="Use with --mesh")
+@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
+              help="Use with --mesh")
+@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
+              help="Use with --mesh")
+@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
+              help="Use with --mesh")
+@click.option("-I", "--iterations", default=0, cls=Mutex,
+              help="Use with --mesh")
+@click.argument("filepath", type=click.Path(exists=True))
+def cli(*args, **kwargs):
+    pages = kwargs.pop("pages")
+    output = kwargs.pop("output")
+    f = kwargs.pop("format")
+    compress = kwargs.pop("zip")
+    mesh = kwargs.pop("mesh")
+    geometry_type = kwargs.pop("geometry_type")
+    filepath = kwargs.pop("filepath")
+
+    # Empty selections are passed downstream as None rather than [].
+    kwargs['table_area'] = list(kwargs['table_area']) or None
+    kwargs['columns'] = list(kwargs['columns']) or None
+
+    kwargs = remove_extra(kwargs, mesh=mesh)
+
+    if geometry_type is not None:
+        plot_geometry(filepath, pages=pages, mesh=mesh,
+                      geometry_type=geometry_type, **kwargs)
+        return
+
+    # Fail fast: check the export options before parsing the PDF so a
+    # missing --output / --format is reported without doing the work first.
+    if output is None:
+        raise click.UsageError("Please specify an output filepath using --output")
+    if f is None:
+        raise click.UsageError("Please specify an output format using --format")
+
+    tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
+    click.echo(tables)
+    tables.export(output, f=f, compress=compress)
\ No newline at end of file
diff --git a/camelot/io.py b/camelot/io.py
index 33007d4..a213cee 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -1,4 +1,5 @@
from .handlers import PDFHandler
+from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
@@ -78,17 +79,14 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs `_.
- debug : bool, optional (default: False)
- Whether or not to return all text objects on the page
- which can be used to generate a matplotlib plot, to get
- values for table_area(s) and debugging.
Returns
-------
tables : camelot.core.TableList
"""
- # validate kwargs?
+ validate_input(kwargs, mesh=mesh)
p = PDFHandler(filepath, pages)
+ kwargs = remove_extra(kwargs, mesh=mesh)
tables, __ = p.parse(mesh=mesh, **kwargs)
return tables
\ No newline at end of file
diff --git a/camelot/plotting.py b/camelot/plotting.py
index 2d0bb3c..6012217 100644
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@@ -3,9 +3,10 @@ import matplotlib.pyplot as plt
import matplotlib.patches as patches
from .handlers import PDFHandler
+from .utils import validate_input, remove_extra
-def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
+def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
"""Plot geometry found on pdf page based on type specified,
useful for debugging and playing with different parameters to get
the best output.
@@ -23,7 +24,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
mesh : bool (default: False)
Whether or not to use Lattice method of parsing. Stream
is used by default.
- geometry_type : str, optional (default: 'text')
+ geometry_type : str, optional (default: None)
'text' : Plot text objects found on page, useful to get
table_area and columns coordinates.
'table' : Plot parsed table.
@@ -91,15 +92,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
PDFMiner margins. (char_margin, line_margin, word_margin)
For for information, refer `PDFMiner docs `_.
- debug : bool, optional (default: False)
- Whether or not to return all text objects on the page
- which can be used to generate a matplotlib plot, to get
- values for table_area(s) and debugging.
"""
- # validate kwargs?
+ validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
p = PDFHandler(filepath, pages)
- debug = True if geometry_type else False
+ kwargs = remove_extra(kwargs, mesh=mesh)
+ debug = True if geometry_type is not None else False
kwargs.update({'debug': debug})
__, geometry = p.parse(mesh=mesh, **kwargs)
@@ -140,8 +138,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
[cell.lb[1], cell.rb[1]])
plt.show()
elif geometry_type == 'contour':
- if not mesh:
- raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images:
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
@@ -149,8 +145,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img)
plt.show()
elif geometry_type == 'joint':
- if not mesh:
- raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images:
x_coord = []
y_coord = []
@@ -164,8 +158,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img)
plt.show()
elif geometry_type == 'line':
- if not mesh:
- raise ValueError("Use mesh=True")
for v_s, h_s in geometry.segments:
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
diff --git a/camelot/utils.py b/camelot/utils.py
index 6c29410..815f87d 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -20,6 +20,53 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical)
+# Keyword arguments that only apply when ``mesh`` is falsy: validate_input
+# rejects them, and remove_extra drops them, when ``mesh`` is truthy.
+stream_kwargs = [
+ 'columns',
+ 'row_close_tol',
+ 'col_close_tol'
+]
+# Keyword arguments that only apply when ``mesh`` is truthy: validate_input
+# rejects them, and remove_extra drops them, when ``mesh`` is falsy.
+lattice_kwargs = [
+ 'process_background',
+ 'line_size_scaling',
+ 'copy_text',
+ 'shift_text',
+ 'line_close_tol',
+ 'joint_close_tol',
+ 'threshold_blocksize',
+ 'threshold_constant',
+ 'iterations'
+]
+
+
+def validate_input(kwargs, mesh=False, geometry_type=False):
+    """Reject keyword arguments that do not fit the selected parser.
+
+    Parameters
+    ----------
+    kwargs : dict
+        Keyword arguments to inspect; only the keys are examined.
+    mesh : bool (default: False)
+        When truthy, keys listed in ``stream_kwargs`` are rejected;
+        otherwise keys listed in ``lattice_kwargs`` are rejected.
+    geometry_type : str or bool (default: False)
+        When truthy and equal to 'contour', 'joint' or 'line', ``mesh``
+        must be truthy as well.
+
+    Raises
+    ------
+    ValueError
+        If ``kwargs`` holds a key of the other parser, or if
+        ``geometry_type`` needs ``mesh`` and ``mesh`` is falsy.
+
+    """
+    if mesh:
+        forbidden, mesh_label = stream_kwargs, True
+    else:
+        forbidden, mesh_label = lattice_kwargs, False
+
+    clashes = set(forbidden) & set(kwargs.keys())
+    if clashes:
+        offending = ",".join(sorted(clashes))
+        raise ValueError("{} can not be used with mesh set to {}".format(
+            offending, mesh_label))
+
+    mesh_only = ['contour', 'joint', 'line']
+    if geometry_type and not mesh and geometry_type in mesh_only:
+        raise ValueError("Use geometry_type={} with mesh set to True".format(
+            geometry_type))
+
+
+def remove_extra(kwargs, mesh=False):
+    """Drop the keyword arguments of the parser that is not selected.
+
+    Parameters
+    ----------
+    kwargs : dict
+        Keyword arguments to clean up. The dict is modified in place.
+    mesh : bool (default: False)
+        When truthy, keys listed in ``stream_kwargs`` are removed;
+        otherwise keys listed in ``lattice_kwargs`` are removed.
+
+    Returns
+    -------
+    kwargs : dict
+        The same dict object that was passed in.
+
+    """
+    # Walk the fixed key list instead of ``kwargs.keys()``: removing entries
+    # from a dict while iterating over it raises RuntimeError on Python 3.
+    unwanted = stream_kwargs if mesh else lattice_kwargs
+    for key in unwanted:
+        kwargs.pop(key, None)
+    return kwargs
+
+
# https://stackoverflow.com/a/22726782
class TemporaryDirectory(object):
def __enter__(self):
diff --git a/setup.py b/setup.py
index 14c0516..20f794f 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,12 @@ def setup_package():
author_email=AUTHOR_EMAIL,
license=LICENSE,
packages=['camelot'],
- install_requires=reqs)
+ install_requires=reqs,
+ entry_points={
+ 'console_scripts': [
+ 'camelot = camelot.cli:cli',
+ ],
+ })
try:
from setuptools import setup