Merge pull request #99 from socialcopsdev/cli

Add CLI
2018-09-10 16:06:14 +05:30 · 2018-09-10 16:06:14 +05:30 · 118aac47bc
parent 1b013178a8 544e0c9c3f
commit 118aac47bc
6 changed files with 254 additions and 23 deletions
--- a/README.md
+++ b/README.md
@ -4,6 +4,8 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat

 ## Usage

+### API
+
 <pre>
 >>> import camelot
 >>> tables = camelot.read_pdf("foo.pdf")
@ -23,6 +25,82 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
 >>> df = tables[0].df
 </pre>

+### Command-line interface
+
+<pre>
+Usage: camelot [OPTIONS] FILEPATH
+
+Options:
+  -p, --pages TEXT                Comma-separated page numbers to parse.
+                                  Example: 1,3,4 or 1,4-end
+  -o, --output TEXT               Output filepath.
+  -f, --format [csv|json|excel|html]
+                                  Output file format.
+  -z, --zip                       Whether or not to create a ZIP archive.
+  -m, --mesh                      Whether or not to use Lattice method of
+                                  parsing. Stream is used by default.
+  -T, --table_area TEXT           Table areas (x1,y1,x2,y2) to process.
+                                  x1, y1
+                                  -> left-top and x2, y2 -> right-bottom
+  -split, --split_text            Whether or not to split text if it spans
+                                  across multiple cells.
+  -flag, --flag_size              (inactive) Whether or not to flag text which
+                                  has uncommon size. (Useful to detect
+                                  super/subscripts)
+  -M, --margins <FLOAT FLOAT FLOAT>...
+                                  char_margin, line_margin, word_margin for
+                                  PDFMiner.
+  -C, --columns TEXT              x-coordinates of column separators.
+  -r, --row_close_tol INTEGER     Rows will be formed by combining text
+                                  vertically within this tolerance.
+  -c, --col_close_tol INTEGER     Columns will be formed by combining text
+                                  horizontally within this tolerance.
+  -back, --process_background     (with --mesh) Whether or not to process
+                                  lines that are in background.
+  -scale, --line_size_scaling INTEGER
+                                  (with --mesh) Factor by which the page
+                                  dimensions will be divided to get smallest
+                                  length of detected lines.
+  -copy, --copy_text [h|v]        (with --mesh) Specify direction in which
+                                  text will be copied over in a spanning cell.
+  -shift, --shift_text [l|r|t|b]  (with --mesh) Specify direction in which
+                                  text in a spanning cell should flow.
+  -l, --line_close_tol INTEGER    (with --mesh) Tolerance parameter used to
+                                  merge close vertical lines and close
+                                  horizontal lines.
+  -j, --joint_close_tol INTEGER   (with --mesh) Tolerance parameter used to
+                                  decide whether the detected lines and points
+                                  lie close to each other.
+  -block, --threshold_blocksize INTEGER
+                                  (with --mesh) For adaptive thresholding,
+                                  size of a pixel neighborhood that is used to
+                                  calculate a threshold value for the pixel:
+                                  3, 5, 7, and so on.
+  -const, --threshold_constant INTEGER
+                                  (with --mesh) For adaptive thresholding,
+                                  constant subtracted from the mean or
+                                  weighted mean.
+                                  Normally, it is positive but
+                                  may be zero or negative as well.
+  -I, --iterations INTEGER        (with --mesh) Number of times for
+                                  erosion/dilation is applied.
+  -G, --geometry_type [text|table|contour|joint|line]
+                                  Plot geometry found on pdf page for
+                                  debugging.
+
+                                  text: Plot text objects. (Useful
+                                  to get table_area and columns coordinates)
+                                  table: Plot parsed table.
+                                  contour (with
+                                  --mesh): Plot detected rectangles.
+                                  joint
+                                  (with --mesh): Plot detected line
+                                  intersections.
+                                  line (with --mesh): Plot
+                                  detected lines.
+  --help                          Show this message and exit.
+</pre>
+
 ## Dependencies

 The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -1 +1,112 @@
-import click
+# -*- coding: utf-8 -*-
+from pprint import pprint
+
+import click
+
+from .io import read_pdf
+from .plotting import plot_geometry
+from .utils import validate_input, remove_extra
+
+
+class Mutex(click.Option):
+    """click option that cross-checks parser-specific flags while parsing.
+
+    Options declared with ``cls=Mutex`` run ``validate_input`` on the
+    parsed options before click's regular handling continues.
+    """
+
+    def handle_parse_result(self, ctx, opts, args):
+        # NOTE(review): presumably ``opts`` only holds options that were
+        # actually given on the command line (defaults not yet applied), which
+        # is what lets validate_input spot conflicting flags -- confirm
+        # against the click version in use.
+        mesh = opts.get('mesh', False)
+        geometry_type = opts.get('geometry_type', False)
+        try:
+            validate_input(opts, mesh=mesh, geometry_type=geometry_type)
+        except ValueError as exc:
+            # Report bad flag combinations as a usage error instead of
+            # letting a bare ValueError traceback reach the CLI user.
+            raise click.UsageError(str(exc), ctx=ctx)
+        return super(Mutex, self).handle_parse_result(ctx, opts, args)
+
+
+# NOTE: no docstring is attached to ``cli`` on purpose -- click would print it
+# as the command description in ``--help``, changing the documented output.
+@click.command()
+@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
+              " to parse. Example: 1,3,4 or 1,4-end")
+@click.option("-o", "--output", help="Output filepath.")
+@click.option("-f", "--format",
+              type=click.Choice(["csv", "json", "excel", "html"]),
+              help="Output file format.")
+@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
+              " archive.")
+@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
+              " use Lattice method of parsing. Stream is used by default.")
+@click.option("-T", "--table_area", default=[], multiple=True,
+              help="Table areas (x1,y1,x2,y2) to process.\n"
+              " x1, y1 -> left-top and x2, y2 -> right-bottom")
+@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
+              " split text if it spans across multiple cells.")
+@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
+              " not to flag text which has uncommon size. (Useful to detect"
+              " super/subscripts)")
+@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
+              help="char_margin, line_margin, word_margin for PDFMiner.")
+@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
+              help="x-coordinates of column separators.")
+@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
+              " formed by combining text vertically within this tolerance.")
+@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
+              " be formed by combining text horizontally within this tolerance.")
+@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
+              help="(with --mesh) Whether or not to process lines that are in"
+              " background.")
+@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
+              help="(with --mesh) Factor by which the page dimensions will be"
+              " divided to get smallest length of detected lines.")
+@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
+              multiple=True, cls=Mutex, help="(with --mesh) Specify direction"
+              " in which text will be copied over in a spanning cell.")
+@click.option("-shift", "--shift_text", default=["l", "t"],
+              type=click.Choice(["l", "r", "t", "b"]), multiple=True, cls=Mutex,
+              help="(with --mesh) Specify direction in which text in a spanning"
+              " cell should flow.")
+@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
+              help="(with --mesh) Tolerance parameter used to merge close vertical"
+              " lines and close horizontal lines.")
+@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
+              help="(with --mesh) Tolerance parameter used to decide whether"
+              " the detected lines and points lie close to each other.")
+@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
+              help="(with --mesh) For adaptive thresholding, size of a pixel"
+              " neighborhood that is used to calculate a threshold value for"
+              " the pixel: 3, 5, 7, and so on.")
+@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
+              help="(with --mesh) For adaptive thresholding, constant subtracted"
+              " from the mean or weighted mean.\nNormally, it is positive but"
+              " may be zero or negative as well.")
+@click.option("-I", "--iterations", default=0, cls=Mutex,
+              help="(with --mesh) Number of times for erosion/dilation is"
+              " applied.")
+@click.option("-G", "--geometry_type",
+              type=click.Choice(["text", "table", "contour", "joint", "line"]),
+              help="Plot geometry found on pdf page for debugging.\n\n"
+              "text: Plot text objects. (Useful to get table_area and"
+              " columns coordinates)\ntable: Plot parsed table.\n"
+              "contour (with --mesh): Plot detected rectangles.\n"
+              "joint (with --mesh): Plot detected line"
+              " intersections.\nline (with --mesh): Plot detected lines.")
+@click.argument("filepath", type=click.Path(exists=True))
+def cli(*args, **kwargs):
+    # Options consumed by the CLI itself; whatever is left in kwargs is
+    # forwarded to read_pdf / plot_geometry.
+    pages = kwargs.pop("pages")
+    output = kwargs.pop("output")
+    f = kwargs.pop("format")
+    compress = kwargs.pop("zip")
+    mesh = kwargs.pop("mesh")
+    geometry_type = kwargs.pop("geometry_type")
+    filepath = kwargs.pop("filepath")
+
+    # Fail fast: an export needs both an output path and a format, so check
+    # them before parsing the PDF rather than after.
+    if geometry_type is None:
+        if output is None:
+            raise click.UsageError("Please specify an output filepath using --output")
+        if f is None:
+            raise click.UsageError("Please specify an output format using --format")
+
+    # Repeatable options are turned into lists; an empty one is passed on
+    # as None.
+    for name in ('table_area', 'columns', 'copy_text'):
+        kwargs[name] = list(kwargs[name]) or None
+    kwargs['shift_text'] = list(kwargs['shift_text'])
+
+    # Drop the options that belong to the parser that is not being used.
+    kwargs = remove_extra(kwargs, mesh=mesh)
+    if geometry_type is None:
+        tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
+        click.echo(tables)
+        tables.export(output, f=f, compress=compress)
+    else:
+        plot_geometry(filepath, pages=pages, mesh=mesh,
+                      geometry_type=geometry_type, **kwargs)
--- a/camelot/io.py
+++ b/camelot/io.py
@ -1,4 +1,5 @@
 from .handlers import PDFHandler
+from .utils import validate_input, remove_extra


 def read_pdf(filepath, pages='1', mesh=False, **kwargs):
@ -18,7 +19,7 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
        Whether or not to use Lattice method of parsing. Stream
        is used by default.
    table_area : list, optional (default: None)
-        List of table areas to analyze as strings of the form
+        List of table areas to process as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    columns^ : list, optional (default: None)
@ -78,17 +79,14 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
        PDFMiner margins. (char_margin, line_margin, word_margin)

        For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
-    debug : bool, optional (default: False)
-        Whether or not to return all text objects on the page
-        which can be used to generate a matplotlib plot, to get
-        values for table_area(s) and debugging.

    Returns
    -------
    tables : camelot.core.TableList

    """
-    # validate kwargs?
+    validate_input(kwargs, mesh=mesh)
    p = PDFHandler(filepath, pages)
+    kwargs = remove_extra(kwargs, mesh=mesh)
    tables, __ = p.parse(mesh=mesh, **kwargs)
    return tables
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -3,9 +3,10 @@ import matplotlib.pyplot as plt
 import matplotlib.patches as patches

 from .handlers import PDFHandler
+from .utils import validate_input, remove_extra


-def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
+def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
    """Plot geometry found on pdf page based on type specified,
    useful for debugging and playing with different parameters to get
    the best output.
@ -23,7 +24,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
    mesh : bool (default: False)
        Whether or not to use Lattice method of parsing. Stream
        is used by default.
-    geometry_type : str, optional (default: 'text')
+    geometry_type : str, optional (default: None)
        'text' : Plot text objects found on page, useful to get
                 table_area and columns coordinates.
        'table' : Plot parsed table.
@ -31,7 +32,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
        'joint'* : Plot detected line intersections.
        'line'* : Plot detected lines.
    table_area : list, optional (default: None)
-        List of table areas to analyze as strings of the form
+        List of table areas to process as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    columns^ : list, optional (default: None)
@ -91,15 +92,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
        PDFMiner margins. (char_margin, line_margin, word_margin)

        For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
-    debug : bool, optional (default: False)
-        Whether or not to return all text objects on the page
-        which can be used to generate a matplotlib plot, to get
-        values for table_area(s) and debugging.

    """
-    # validate kwargs?
+    validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
    p = PDFHandler(filepath, pages)
-    debug = True if geometry_type else False
+    kwargs = remove_extra(kwargs, mesh=mesh)
+    debug = True if geometry_type is not None else False
    kwargs.update({'debug': debug})
    __, geometry = p.parse(mesh=mesh, **kwargs)

@ -140,8 +138,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
                                     [cell.lb[1], cell.rb[1]])
            plt.show()
    elif geometry_type == 'contour':
-        if not mesh:
-            raise ValueError("Use mesh=True")
        for img, table_bbox in geometry.images:
            for t in table_bbox.keys():
                cv2.rectangle(img, (t[0], t[1]),
@ -149,8 +145,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
            plt.imshow(img)
            plt.show()
    elif geometry_type == 'joint':
-        if not mesh:
-            raise ValueError("Use mesh=True")
        for img, table_bbox in geometry.images:
            x_coord = []
            y_coord = []
@ -164,8 +158,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
            plt.imshow(img)
            plt.show()
    elif geometry_type == 'line':
-        if not mesh:
-            raise ValueError("Use mesh=True")
        for v_s, h_s in geometry.segments:
            for v in v_s:
                plt.plot([v[0], v[2]], [v[1], v[3]])
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -20,6 +20,53 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
                             LTTextLineVertical)


+# Keyword arguments that validate_input rejects, and remove_extra strips,
+# when mesh is True.
+stream_kwargs = [
+    'columns',
+    'row_close_tol',
+    'col_close_tol'
+]
+# Keyword arguments that validate_input rejects, and remove_extra strips,
+# when mesh is False.
+lattice_kwargs = [
+    'process_background',
+    'line_size_scaling',
+    'copy_text',
+    'shift_text',
+    'line_close_tol',
+    'joint_close_tol',
+    'threshold_blocksize',
+    'threshold_constant',
+    'iterations'
+]
+
+
+def validate_input(kwargs, mesh=False, geometry_type=False):
+    """Reject keyword arguments that do not fit the selected mesh setting.
+
+    Parameters
+    ----------
+    kwargs : dict
+        Keyword arguments supplied by the caller; only its keys are
+        inspected.
+    mesh : bool (default: False)
+        When truthy, keys listed in ``stream_kwargs`` are rejected;
+        otherwise keys listed in ``lattice_kwargs`` are rejected.
+    geometry_type : str, optional (default: False)
+        Geometry requested for plotting. 'contour', 'joint' and 'line'
+        are only accepted when ``mesh`` is truthy.
+
+    Raises
+    ------
+    ValueError
+        If a disallowed keyword is present, or if ``geometry_type``
+        needs ``mesh`` but ``mesh`` is falsy.
+
+    """
+    if mesh:
+        disallowed, mesh_state = stream_kwargs, True
+    else:
+        disallowed, mesh_state = lattice_kwargs, False
+
+    clashing = set(disallowed) & set(kwargs)
+    if clashing:
+        # Sorted so the error message is deterministic.
+        raise ValueError("{} can not be used with mesh set to {}".format(
+                         ",".join(sorted(clashing)), mesh_state))
+
+    mesh_only = ['contour', 'joint', 'line']
+    if geometry_type and not mesh and geometry_type in mesh_only:
+        raise ValueError("Use geometry_type={} with mesh set to True".format(
+                         geometry_type))
+
+
+def remove_extra(kwargs, mesh=False):
+    """Strip the keyword arguments that do not apply to the mesh setting.
+
+    Parameters
+    ----------
+    kwargs : dict
+        Keyword arguments to filter. The dict is modified in place.
+    mesh : bool (default: False)
+        When truthy, keys listed in ``stream_kwargs`` are removed;
+        otherwise keys listed in ``lattice_kwargs`` are removed.
+
+    Returns
+    -------
+    kwargs : dict
+        The same dict object that was passed in, after filtering.
+
+    """
+    unwanted = stream_kwargs if mesh else lattice_kwargs
+    # Iterate over the fixed key list rather than over the dict being
+    # changed: popping while iterating ``kwargs.keys()`` raises
+    # RuntimeError on Python 3, where keys() is a live view.
+    for key in unwanted:
+        kwargs.pop(key, None)
+    return kwargs
+
+
 # https://stackoverflow.com/a/22726782
 class TemporaryDirectory(object):
    def __enter__(self):
--- a/setup.py
+++ b/setup.py
@ -49,7 +49,12 @@ def setup_package():
                    author_email=AUTHOR_EMAIL,
                    license=LICENSE,
                    packages=['camelot'],
-                    install_requires=reqs)
+                    install_requires=reqs,
+                    entry_points={
+                        'console_scripts': [
+                            'camelot = camelot.cli:cli',
+                        ],
+                    })

    try:
        from setuptools import setup