Merge pull request #99 from socialcopsdev/cli

Add CLI
pull/2/head
Vinayak Mehta 2018-09-10 16:06:14 +05:30 committed by GitHub
commit 118aac47bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 254 additions and 23 deletions

View File

@ -4,6 +4,8 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
## Usage ## Usage
### API
<pre> <pre>
>>> import camelot >>> import camelot
>>> tables = camelot.read_pdf("foo.pdf") >>> tables = camelot.read_pdf("foo.pdf")
@ -23,6 +25,82 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
>>> df = tables[0].df >>> df = tables[0].df
</pre> </pre>
### Command-line interface
<pre>
Usage: camelot [OPTIONS] FILEPATH
Options:
-p, --pages TEXT Comma-separated page numbers to parse.
Example: 1,3,4 or 1,4-end
-o, --output TEXT Output filepath.
-f, --format [csv|json|excel|html]
Output file format.
-z, --zip Whether or not to create a ZIP archive.
-m, --mesh Whether or not to use Lattice method of
parsing. Stream is used by default.
-T, --table_area TEXT Table areas (x1,y1,x2,y2) to process.
x1, y1
-> left-top and x2, y2 -> right-bottom
-split, --split_text Whether or not to split text if it spans
across multiple cells.
-flag, --flag_size (inactive) Whether or not to flag text which
has uncommon size. (Useful to detect
super/subscripts)
-M, --margins <FLOAT FLOAT FLOAT>...
char_margin, line_margin, word_margin for
PDFMiner.
-C, --columns TEXT x-coordinates of column separators.
-r, --row_close_tol INTEGER Rows will be formed by combining text
vertically within this tolerance.
-c, --col_close_tol INTEGER Columns will be formed by combining text
horizontally within this tolerance.
-back, --process_background (with --mesh) Whether or not to process
lines that are in background.
-scale, --line_size_scaling INTEGER
(with --mesh) Factor by which the page
dimensions will be divided to get smallest
length of detected lines.
-copy, --copy_text [h|v] (with --mesh) Specify direction in which
text will be copied over in a spanning cell.
-shift, --shift_text [l|r|t|b] (with --mesh) Specify direction in which
text in a spanning cell should flow.
-l, --line_close_tol INTEGER (with --mesh) Tolerance parameter used to
merge close vertical lines and close
horizontal lines.
-j, --joint_close_tol INTEGER (with --mesh) Tolerance parameter used to
decide whether the detected lines and points
lie close to each other.
-block, --threshold_blocksize INTEGER
(with --mesh) For adaptive thresholding,
size of a pixel neighborhood that is used to
calculate a threshold value for the pixel:
3, 5, 7, and so on.
-const, --threshold_constant INTEGER
(with --mesh) For adaptive thresholding,
constant subtracted from the mean or
weighted mean.
Normally, it is positive but
may be zero or negative as well.
-I, --iterations INTEGER (with --mesh) Number of times for
erosion/dilation is applied.
-G, --geometry_type [text|table|contour|joint|line]
Plot geometry found on pdf page for
debugging.
text: Plot text objects. (Useful
to get table_area and columns coordinates)
table: Plot parsed table.
contour (with
--mesh): Plot detected rectangles.
joint
(with --mesh): Plot detected line
intersections.
line (with --mesh): Plot
detected lines.
--help Show this message and exit.
</pre>
## Dependencies ## Dependencies
The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/). The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).

View File

@ -1 +1,112 @@
# -*- coding: utf-8 -*-
from pprint import pprint
import click import click
from .io import read_pdf
from .plotting import plot_geometry
from .utils import validate_input, remove_extra
class Mutex(click.Option):
    """Click option that validates parser-specific flags while parsing.

    Every option declared with ``cls=Mutex`` runs ``validate_input`` on the
    options parsed so far, so that Stream-only and Lattice-only options are
    rejected when they are combined with the wrong ``--mesh`` setting.
    """

    def handle_parse_result(self, ctx, opts, args):
        """Validate ``opts`` and then defer to ``click.Option``.

        Parameters
        ----------
        ctx : click.Context
            Context of the command being parsed.
        opts : dict
            Option values parsed from the command line.
        args : list
            Remaining positional arguments.

        Returns
        -------
        Whatever ``click.Option.handle_parse_result`` returns.

        Raises
        ------
        ValueError
            Propagated from ``validate_input`` when the options conflict.
        """
        # NOTE(review): validate_input raises ValueError, which click does
        # not appear to convert into a usage message -- confirm whether a
        # click.UsageError would be preferable here.
        validate_input(
            opts,
            mesh=opts.get('mesh', False),
            geometry_type=opts.get('geometry_type', False),
        )
        return super(Mutex, self).handle_parse_result(ctx, opts, args)
@click.command()
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
              " to parse. Example: 1,3,4 or 1,4-end")
@click.option("-o", "--output", help="Output filepath.")
@click.option("-f", "--format",
              type=click.Choice(["csv", "json", "excel", "html"]),
              help="Output file format.")
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
              " archive.")
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to"
              " use Lattice method of parsing. Stream is used by default.")
@click.option("-T", "--table_area", default=[], multiple=True,
              help="Table areas (x1,y1,x2,y2) to process.\n"
              " x1, y1 -> left-top and x2, y2 -> right-bottom")
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
              " split text if it spans across multiple cells.")
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
              " not to flag text which has uncommon size. (Useful to detect"
              " super/subscripts)")
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
              help="char_margin, line_margin, word_margin for PDFMiner.")
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
              help="x-coordinates of column separators.")
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
              " formed by combining text vertically within this tolerance.")
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
              " be formed by combining text horizontally within this tolerance.")
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
              help="(with --mesh) Whether or not to process lines that are in"
              " background.")
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
              help="(with --mesh) Factor by which the page dimensions will be"
              " divided to get smallest length of detected lines.")
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
              multiple=True, cls=Mutex, help="(with --mesh) Specify direction"
              " in which text will be copied over in a spanning cell.")
@click.option("-shift", "--shift_text", default=["l", "t"],
              type=click.Choice(["l", "r", "t", "b"]), multiple=True, cls=Mutex,
              help="(with --mesh) Specify direction in which text in a spanning"
              " cell should flow.")
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
              help="(with --mesh) Tolerance parameter used to merge close vertical"
              " lines and close horizontal lines.")
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
              help="(with --mesh) Tolerance parameter used to decide whether"
              " the detected lines and points lie close to each other.")
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
              help="(with --mesh) For adaptive thresholding, size of a pixel"
              " neighborhood that is used to calculate a threshold value for"
              " the pixel: 3, 5, 7, and so on.")
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
              help="(with --mesh) For adaptive thresholding, constant subtracted"
              " from the mean or weighted mean.\nNormally, it is positive but"
              " may be zero or negative as well.")
@click.option("-I", "--iterations", default=0, cls=Mutex,
              help="(with --mesh) Number of times for erosion/dilation is"
              " applied.")
@click.option("-G", "--geometry_type",
              type=click.Choice(["text", "table", "contour", "joint", "line"]),
              help="Plot geometry found on pdf page for debugging.\n\n"
              "text: Plot text objects. (Useful to get table_area and"
              " columns coordinates)\ntable: Plot parsed table.\n"
              "contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
              " intersections.\nline (with --mesh): Plot detected lines.")
@click.argument("filepath", type=click.Path(exists=True))
def cli(*args, **kwargs):
    # Entry point of the ``camelot`` command.
    #
    # Without --geometry_type the PDF is parsed with read_pdf, the resulting
    # tables are echoed and then exported to --output in --format.  With
    # --geometry_type the requested geometry is plotted instead and nothing
    # is exported.
    #
    # Deliberately documented with comments rather than a docstring: click
    # uses a command's docstring as its --help text, which would change the
    # help output.

    # Options consumed here; everything left in kwargs is forwarded to the
    # parser.
    pages = kwargs.pop("pages")
    output = kwargs.pop("output")
    f = kwargs.pop("format")
    compress = kwargs.pop("zip")
    mesh = kwargs.pop("mesh")
    geometry_type = kwargs.pop("geometry_type")
    filepath = kwargs.pop("filepath")

    # click hands back tuples for multiple=True options; the parser expects
    # lists, with None standing for "not given".
    for name in ("table_area", "columns", "copy_text"):
        kwargs[name] = list(kwargs[name]) or None
    kwargs["shift_text"] = list(kwargs["shift_text"])

    # Drop the options that belong to the parser that is not selected.
    kwargs = remove_extra(kwargs, mesh=mesh)

    if geometry_type is not None:
        plot_geometry(filepath, pages=pages, mesh=mesh,
                      geometry_type=geometry_type, **kwargs)
        return

    # Fail fast: report a missing --output/--format before parsing the PDF
    # rather than after the work has already been done.
    if output is None:
        raise click.UsageError("Please specify an output filepath using --output")
    if f is None:
        raise click.UsageError("Please specify an output format using --format")

    tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
    click.echo(tables)
    tables.export(output, f=f, compress=compress)

View File

@ -1,4 +1,5 @@
from .handlers import PDFHandler from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', mesh=False, **kwargs): def read_pdf(filepath, pages='1', mesh=False, **kwargs):
@ -18,7 +19,7 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
Whether or not to use Lattice method of parsing. Stream Whether or not to use Lattice method of parsing. Stream
is used by default. is used by default.
table_area : list, optional (default: None) table_area : list, optional (default: None)
List of table areas to analyze as strings of the form List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space. (x2, y2) -> right-bottom in pdf coordinate space.
columns^ : list, optional (default: None) columns^ : list, optional (default: None)
@ -78,17 +79,14 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
Returns Returns
------- -------
tables : camelot.core.TableList tables : camelot.core.TableList
""" """
# validate kwargs? validate_input(kwargs, mesh=mesh)
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
kwargs = remove_extra(kwargs, mesh=mesh)
tables, __ = p.parse(mesh=mesh, **kwargs) tables, __ = p.parse(mesh=mesh, **kwargs)
return tables return tables

View File

@ -3,9 +3,10 @@ import matplotlib.pyplot as plt
import matplotlib.patches as patches import matplotlib.patches as patches
from .handlers import PDFHandler from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs): def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
"""Plot geometry found on pdf page based on type specified, """Plot geometry found on pdf page based on type specified,
useful for debugging and playing with different parameters to get useful for debugging and playing with different parameters to get
the best output. the best output.
@ -23,7 +24,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
mesh : bool (default: False) mesh : bool (default: False)
Whether or not to use Lattice method of parsing. Stream Whether or not to use Lattice method of parsing. Stream
is used by default. is used by default.
geometry_type : str, optional (default: 'text') geometry_type : str, optional (default: None)
'text' : Plot text objects found on page, useful to get 'text' : Plot text objects found on page, useful to get
table_area and columns coordinates. table_area and columns coordinates.
'table' : Plot parsed table. 'table' : Plot parsed table.
@ -31,7 +32,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
'joint'* : Plot detected line intersections. 'joint'* : Plot detected line intersections.
'line'* : Plot detected lines. 'line'* : Plot detected lines.
table_area : list, optional (default: None) table_area : list, optional (default: None)
List of table areas to analyze as strings of the form List of table areas to process as strings of the form
x1,y1,x2,y2 where (x1, y1) -> left-top and x1,y1,x2,y2 where (x1, y1) -> left-top and
(x2, y2) -> right-bottom in pdf coordinate space. (x2, y2) -> right-bottom in pdf coordinate space.
columns^ : list, optional (default: None) columns^ : list, optional (default: None)
@ -91,15 +92,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
""" """
# validate kwargs? validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
debug = True if geometry_type else False kwargs = remove_extra(kwargs, mesh=mesh)
debug = True if geometry_type is not None else False
kwargs.update({'debug': debug}) kwargs.update({'debug': debug})
__, geometry = p.parse(mesh=mesh, **kwargs) __, geometry = p.parse(mesh=mesh, **kwargs)
@ -140,8 +138,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
[cell.lb[1], cell.rb[1]]) [cell.lb[1], cell.rb[1]])
plt.show() plt.show()
elif geometry_type == 'contour': elif geometry_type == 'contour':
if not mesh:
raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images: for img, table_bbox in geometry.images:
for t in table_bbox.keys(): for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]), cv2.rectangle(img, (t[0], t[1]),
@ -149,8 +145,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img) plt.imshow(img)
plt.show() plt.show()
elif geometry_type == 'joint': elif geometry_type == 'joint':
if not mesh:
raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images: for img, table_bbox in geometry.images:
x_coord = [] x_coord = []
y_coord = [] y_coord = []
@ -164,8 +158,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img) plt.imshow(img)
plt.show() plt.show()
elif geometry_type == 'line': elif geometry_type == 'line':
if not mesh:
raise ValueError("Use mesh=True")
for v_s, h_s in geometry.segments: for v_s, h_s in geometry.segments:
for v in v_s: for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]]) plt.plot([v[0], v[2]], [v[1], v[3]])

View File

@ -20,6 +20,53 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical) LTTextLineVertical)
# Keyword arguments that only apply to the Stream parser (mesh=False).
# validate_input rejects them and remove_extra drops them when mesh is set.
stream_kwargs = [
'columns',
'row_close_tol',
'col_close_tol'
]
# Keyword arguments that only apply to the Lattice parser (mesh=True).
# validate_input rejects them and remove_extra drops them when mesh is unset.
lattice_kwargs = [
'process_background',
'line_size_scaling',
'copy_text',
'shift_text',
'line_close_tol',
'joint_close_tol',
'threshold_blocksize',
'threshold_constant',
'iterations'
]
def validate_input(kwargs, mesh=False, geometry_type=False):
    """Reject keyword arguments and geometry types that do not fit the parser.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments supplied by the caller. Only the keys are
        inspected.
    mesh : bool (default: False)
        Whether the Lattice parser is selected. When true, Stream-only
        keys are rejected; otherwise Lattice-only keys are rejected.
    geometry_type : str, optional (default: False)
        Geometry to plot. 'contour', 'joint' and 'line' are only accepted
        together with a true ``mesh``.

    Raises
    ------
    ValueError
        If ``kwargs`` contains a key of the other parser, or if a
        mesh-only ``geometry_type`` is requested without ``mesh``.
    """
    # Keys belonging to the parser that is *not* in use.
    forbidden = stream_kwargs if mesh else lattice_kwargs
    clashes = set(forbidden).intersection(kwargs.keys())
    if clashes:
        raise ValueError("{} can not be used with mesh set to {}".format(
            ",".join(sorted(clashes)), bool(mesh)))

    mesh_only = ('contour', 'joint', 'line')
    if geometry_type and not mesh and geometry_type in mesh_only:
        raise ValueError("Use geometry_type={} with mesh set to True".format(
            geometry_type))
def remove_extra(kwargs, mesh=False):
    """Drop the keyword arguments that belong to the parser not in use.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to filter. The dict is modified in place.
    mesh : bool (default: False)
        Whether the Lattice parser is selected. When true, Stream-only
        keys are removed; otherwise Lattice-only keys are removed.

    Returns
    -------
    kwargs : dict
        The same dict object that was passed in, without the removed keys.
    """
    unwanted = stream_kwargs if mesh else lattice_kwargs
    # Walk the fixed key list instead of the dict itself: popping while
    # iterating over kwargs.keys() raises RuntimeError on Python 3, where
    # keys() is a live view of the dict.
    for key in unwanted:
        kwargs.pop(key, None)
    return kwargs
# https://stackoverflow.com/a/22726782 # https://stackoverflow.com/a/22726782
class TemporaryDirectory(object): class TemporaryDirectory(object):
def __enter__(self): def __enter__(self):

View File

@ -49,7 +49,12 @@ def setup_package():
author_email=AUTHOR_EMAIL, author_email=AUTHOR_EMAIL,
license=LICENSE, license=LICENSE,
packages=['camelot'], packages=['camelot'],
install_requires=reqs) install_requires=reqs,
entry_points={
'console_scripts': [
'camelot = camelot.cli:cli',
],
})
try: try:
from setuptools import setup from setuptools import setup