Merge pull request #99 from socialcopsdev/cli

Add CLI
pull/2/head
Vinayak Mehta 2018-09-10 16:06:14 +05:30 committed by GitHub
commit 118aac47bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 254 additions and 23 deletions

View File

@ -4,6 +4,8 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
## Usage
### API
<pre>
>>> import camelot
>>> tables = camelot.read_pdf("foo.pdf")
@ -23,6 +25,82 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
>>> df = tables[0].df
</pre>
### Command-line interface
<pre>
Usage: camelot [OPTIONS] FILEPATH
Options:
-p, --pages TEXT Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
-o, --output TEXT Output filepath.
-f, --format [csv|json|excel|html]
Output file format.
-z, --zip Whether or not to create a ZIP archive.
-m, --mesh Whether or not to use Lattice method of
parsing. Stream is used by default.
-T, --table_area TEXT Table areas (x1,y1,x2,y2) to process.
x1, y1
-> left-top and x2, y2 -> right-bottom
-split, --split_text Whether or not to split text if it spans
across multiple cells.
-flag, --flag_size (inactive) Whether or not to flag text which
has uncommon size. (Useful to detect
super/subscripts)
-M, --margins <FLOAT FLOAT FLOAT>...
char_margin, line_margin, word_margin for
PDFMiner.
-C, --columns TEXT x-coordinates of column separators.
-r, --row_close_tol INTEGER Rows will be formed by combining text
vertically within this tolerance.
-c, --col_close_tol INTEGER Columns will be formed by combining text
horizontally within this tolerance.
-back, --process_background (with --mesh) Whether or not to process
lines that are in background.
-scale, --line_size_scaling INTEGER
(with --mesh) Factor by which the page
dimensions will be divided to get smallest
length of detected lines.
-copy, --copy_text [h|v] (with --mesh) Specify direction in which
text will be copied over in a spanning cell.
-shift, --shift_text [l|r|t|b] (with --mesh) Specify direction in which
text in a spanning cell should flow.
-l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to
merge close vertical lines and close
horizontal lines.
-j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to
decide whether the detected lines and points
lie close to each other.
-block, --threshold_blocksize INTEGER
(with --mesh) For adaptive thresholding,
size of a pixel neighborhood that is used to
calculate a threshold value for the pixel:
3, 5, 7, and so on.
-const, --threshold_constant INTEGER
(with --mesh) For adaptive thresholding,
constant subtracted from the mean or
weighted mean.
Normally, it is positive but
may be zero or negative as well.
-I, --iterations INTEGER (with --mesh) Number of times for
erosion/dilation is applied.
-G, --geometry_type [text|table|contour|joint|line]
Plot geometry found on pdf page for
debugging.
text: Plot text objects. (Useful
to get table_area and columns coordinates)
table: Plot parsed table.
contour (with
--mesh): Plot detected rectangles.
joint
(with --mesh): Plot detected line
intersections.
line (with --mesh): Plot
detected lines.
--help Show this message and exit.
</pre>
## Dependencies
The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).

View File

@ -1 +1,112 @@
import click
# -*- coding: utf-8 -*-
from pprint import pprint
import click
from .io import read_pdf
from .plotting import plot_geometry
from .utils import validate_input, remove_extra
class Mutex(click.Option):
    """Click option that checks parser-specific options for conflicts.

    Options declared with ``cls=Mutex`` run ``validate_input`` over the
    options seen on the command line before the regular click handling
    takes place, so that Stream-only and Lattice-only options are not
    mixed with the wrong ``--mesh`` setting.
    """

    def handle_parse_result(self, ctx, opts, args):
        """Validate the parsed options, then defer to ``click.Option``.

        ``mesh`` and ``geometry_type`` are looked up in ``opts`` and fall
        back to ``False`` when absent.  Any ``ValueError`` raised by
        ``validate_input`` propagates to the caller unchanged.
        """
        validate_input(opts,
                       mesh=opts.get('mesh', False),
                       geometry_type=opts.get('geometry_type', False))
        # Explicit two-argument super() keeps this Python 2.7 compatible.
        base = super(Mutex, self)
        return base.handle_parse_result(ctx, opts, args)
# Command-line entry point: extract tables from FILEPATH and export them,
# or (with -G/--geometry_type) plot the geometry found on the page instead.
#
# NOTE: this is deliberately documented with comments rather than a
# docstring -- click uses a command's docstring as its --help description,
# so adding one would change the help output.
@click.command()
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
              " to parse. Example: 1,3,4 or 1,4-end")
@click.option("-o", "--output", help="Output filepath.")
@click.option("-f", "--format",
              type=click.Choice(["csv", "json", "excel", "html"]),
              help="Output file format.")
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
              " archive.")
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
              " use Lattice method of parsing. Stream is used by default.")
@click.option("-T", "--table_area", default=[], multiple=True,
              help="Table areas (x1,y1,x2,y2) to process.\n"
              " x1, y1 -> left-top and x2, y2 -> right-bottom")
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
              " split text if it spans across multiple cells.")
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
              " not to flag text which has uncommon size. (Useful to detect"
              " super/subscripts)")
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
              help="char_margin, line_margin, word_margin for PDFMiner.")
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
              help="x-coordinates of column separators.")
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
              " formed by combining text vertically within this tolerance.")
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
              " be formed by combining text horizontally within this tolerance.")
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
              help="(with --mesh) Whether or not to process lines that are in"
              " background.")
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
              help="(with --mesh) Factor by which the page dimensions will be"
              " divided to get smallest length of detected lines.")
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
              multiple=True, cls=Mutex, help="(with --mesh) Specify direction"
              " in which text will be copied over in a spanning cell.")
@click.option("-shift", "--shift_text", default=["l", "t"],
              type=click.Choice(["l", "r", "t", "b"]), multiple=True, cls=Mutex,
              help="(with --mesh) Specify direction in which text in a spanning"
              " cell should flow.")
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
              help="(with --mesh) Tolerance parameter used to merge close vertical"
              " lines and close horizontal lines.")
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
              help="(with --mesh) Tolerance parameter used to decide whether"
              " the detected lines and points lie close to each other.")
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
              help="(with --mesh) For adaptive thresholding, size of a pixel"
              " neighborhood that is used to calculate a threshold value for"
              " the pixel: 3, 5, 7, and so on.")
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
              help="(with --mesh) For adaptive thresholding, constant subtracted"
              " from the mean or weighted mean.\nNormally, it is positive but"
              " may be zero or negative as well.")
@click.option("-I", "--iterations", default=0, cls=Mutex,
              help="(with --mesh) Number of times for erosion/dilation is"
              " applied.")
@click.option("-G", "--geometry_type",
              type=click.Choice(["text", "table", "contour", "joint", "line"]),
              help="Plot geometry found on pdf page for debugging.\n\n"
              "text: Plot text objects. (Useful to get table_area and"
              " columns coordinates)\ntable: Plot parsed table.\n"
              "contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
              " intersections.\nline (with --mesh): Plot detected lines.")
@click.argument("filepath", type=click.Path(exists=True))
def cli(*args, **kwargs):
    # Options consumed here; whatever is left in kwargs is forwarded to the
    # parser as keyword arguments.
    pages = kwargs.pop("pages")
    output = kwargs.pop("output")
    f = kwargs.pop("format")
    compress = kwargs.pop("zip")
    mesh = kwargs.pop("mesh")
    geometry_type = kwargs.pop("geometry_type")
    filepath = kwargs.pop("filepath")

    # Repeatable options are turned into lists; an empty selection is
    # forwarded as None.
    for name in ('table_area', 'columns', 'copy_text'):
        kwargs[name] = list(kwargs[name]) or None
    kwargs['shift_text'] = list(kwargs['shift_text'])

    # Drop the defaults of options that belong to the parser not in use.
    kwargs = remove_extra(kwargs, mesh=mesh)

    if geometry_type is not None:
        # Debug mode: only plot, no export (so --output/--format are optional).
        plot_geometry(filepath, pages=pages, mesh=mesh,
                      geometry_type=geometry_type, **kwargs)
        return

    # Fail fast: report missing export options before parsing the PDF so the
    # user is not made to wait for a result that cannot be written anywhere.
    if output is None:
        raise click.UsageError("Please specify an output filepath using --output")
    if f is None:
        raise click.UsageError("Please specify an output format using --format")

    tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
    click.echo(tables)
    tables.export(output, f=f, compress=compress)

View File

@ -1,4 +1,5 @@
from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
@ -18,7 +19,7 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
Whether or not to use Lattice method of parsing. Stream
is used by default.
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
columns^ : list, optional (default: None)
@ -78,17 +79,14 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
Returns
-------
tables : camelot.core.TableList
"""
# validate kwargs?
validate_input(kwargs, mesh=mesh)
p = PDFHandler(filepath, pages)
kwargs = remove_extra(kwargs, mesh=mesh)
tables, __ = p.parse(mesh=mesh, **kwargs)
return tables

View File

@ -3,9 +3,10 @@ import matplotlib.pyplot as plt
import matplotlib.patches as patches
from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
"""Plot geometry found on pdf page based on type specified,
useful for debugging and playing with different parameters to get
the best output.
@ -23,7 +24,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
mesh : bool (default: False)
Whether or not to use Lattice method of parsing. Stream
is used by default.
geometry_type : str, optional (default: 'text')
geometry_type : str, optional (default: None)
'text' : Plot text objects found on page, useful to get
table_area and columns coordinates.
'table' : Plot parsed table.
@ -31,7 +32,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
'joint'* : Plot detected line intersections.
'line'* : Plot detected lines.
table_area : list, optional (default: None)
List of table areas to analyze as strings of the form
List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space.
columns^ : list, optional (default: None)
@ -91,15 +92,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
"""
# validate kwargs?
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
p = PDFHandler(filepath, pages)
debug = True if geometry_type else False
kwargs = remove_extra(kwargs, mesh=mesh)
debug = True if geometry_type is not None else False
kwargs.update({'debug': debug})
__, geometry = p.parse(mesh=mesh, **kwargs)
@ -140,8 +138,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
[cell.lb[1], cell.rb[1]])
plt.show()
elif geometry_type == 'contour':
if not mesh:
raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images:
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
@ -149,8 +145,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img)
plt.show()
elif geometry_type == 'joint':
if not mesh:
raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images:
x_coord = []
y_coord = []
@ -164,8 +158,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img)
plt.show()
elif geometry_type == 'line':
if not mesh:
raise ValueError("Use mesh=True")
for v_s, h_s in geometry.segments:
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])

View File

@ -20,6 +20,53 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical)
# Keyword arguments that only apply when mesh is False (Stream parsing);
# they are rejected / stripped when mesh is True.
stream_kwargs = ['columns', 'row_close_tol', 'col_close_tol']

# Keyword arguments that only apply when mesh is True (Lattice parsing);
# they are rejected / stripped when mesh is False.
lattice_kwargs = [
    'process_background', 'line_size_scaling',
    'copy_text', 'shift_text',
    'line_close_tol', 'joint_close_tol',
    'threshold_blocksize', 'threshold_constant',
    'iterations',
]
def validate_input(kwargs, mesh=False, geometry_type=False):
    """Reject keyword arguments that do not fit the selected parser.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments supplied by the caller; only its keys are
        inspected.
    mesh : bool (default: False)
        When truthy, none of ``stream_kwargs`` may be present; otherwise
        none of ``lattice_kwargs`` may be present.
    geometry_type : str or False (default: False)
        Geometry requested for plotting, if any.

    Raises
    ------
    ValueError
        If ``kwargs`` contains a key reserved for the other parser, or if
        geometry_type is 'contour', 'joint' or 'line' while mesh is falsy.

    """
    # Pick the option names belonging to the parser that is NOT in use,
    # together with the mesh value to report in the error message.
    if mesh:
        forbidden, mesh_state = stream_kwargs, True
    else:
        forbidden, mesh_state = lattice_kwargs, False

    clashing = set(forbidden) & set(kwargs.keys())
    if clashing:
        offenders = ",".join(sorted(clashing))
        raise ValueError("{} can not be used with mesh set to {}".format(
            offenders, mesh_state))

    needs_mesh = ['contour', 'joint', 'line']
    if geometry_type and not mesh and geometry_type in needs_mesh:
        raise ValueError("Use geometry_type={} with mesh set to True".format(
            geometry_type))
def remove_extra(kwargs, mesh=False):
    """Strip keyword arguments that belong to the parser not in use.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to clean up. The dict is modified in place.
    mesh : bool (default: False)
        When truthy the keys listed in ``stream_kwargs`` are removed,
        otherwise the keys listed in ``lattice_kwargs`` are removed.

    Returns
    -------
    kwargs : dict
        The same dict object that was passed in, minus the removed keys.

    """
    unwanted = stream_kwargs if mesh else lattice_kwargs
    # Iterate over the fixed list of parser keys rather than over the dict
    # being modified: popping while iterating ``kwargs.keys()`` raises
    # RuntimeError on Python 3, where keys() is a live view.
    for key in unwanted:
        kwargs.pop(key, None)
    return kwargs
# https://stackoverflow.com/a/22726782
class TemporaryDirectory(object):
def __enter__(self):

View File

@ -49,7 +49,12 @@ def setup_package():
author_email=AUTHOR_EMAIL,
license=LICENSE,
packages=['camelot'],
install_requires=reqs)
install_requires=reqs,
entry_points={
'console_scripts': [
'camelot = camelot.cli:cli',
],
})
try:
from setuptools import setup