Add CLI
parent
fcef880e6c
commit
7bb1aee9b6
|
|
@ -1 +1,84 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
||||||
|
from .io import read_pdf
|
||||||
|
from .plotting import plot_geometry
|
||||||
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
|
class Mutex(click.Option):
    """Click option that rejects parser-specific flags used with the wrong parser.

    Before the option is processed, the parsed command-line options are
    checked with ``validate_input`` against the current values of ``mesh``
    and ``geometry_type``.
    """

    def handle_parse_result(self, ctx, opts, args):
        """Validate the option combination, then defer to ``click.Option``.

        Parameters
        ----------
        ctx : click.Context
        opts : dict
            Options parsed from the command line.
        args : list
            Remaining command-line arguments.

        Raises
        ------
        click.UsageError
            If ``validate_input`` rejects the combination of options.

        """
        mesh = opts.get('mesh', False)
        geometry_type = opts.get('geometry_type', False)
        try:
            validate_input(opts, mesh=mesh, geometry_type=geometry_type)
        except ValueError as err:
            # Report invalid combinations the way the rest of the CLI does
            # (a usage message) instead of leaking a Python traceback.
            raise click.UsageError(str(err), ctx=ctx)
        return super(Mutex, self).handle_parse_result(ctx, opts, args)
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.option("-p", "--pages", default="1", help="")
@click.option("-o", "--output", help="")
@click.option("-f", "--format",
              type=click.Choice(["csv", "json", "excel", "html"]), help="")
@click.option("-z", "--zip", is_flag=True, help="")
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to "
              "use Lattice method of parsing. Stream is used by default.")
@click.option("-G", "--geometry_type",
              type=click.Choice(["text", "table", "contour", "joint", "line"]),
              help="Plot geometry found on pdf page for debugging.")
@click.option("-T", "--table_area", default=[], multiple=True,
              help="")
@click.option("-split", "--split_text", is_flag=True, help="")
@click.option("-flag", "--flag_size", is_flag=True, help="")
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
              help="")
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
              help="")
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="")
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="")
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
              help="Use with --mesh")
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
              help="Use with --mesh")
@click.option("-copy", "--copy_text", default=[], cls=Mutex,
              help="Use with --mesh")
@click.option("-shift", "--shift_text", default=["l", "t"], cls=Mutex,
              help="Use with --mesh")
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
              help="Use with --mesh")
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
              help="Use with --mesh")
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
              help="Use with --mesh")
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
              help="Use with --mesh")
@click.option("-I", "--iterations", default=0, cls=Mutex,
              help="Use with --mesh")
@click.argument("filepath", type=click.Path(exists=True))
def cli(*args, **kwargs):
    """Extract tables from a PDF file, or plot its geometry for debugging."""
    # Options consumed by the CLI itself; everything left in ``kwargs`` is
    # forwarded to read_pdf / plot_geometry.
    pages = kwargs.pop("pages")
    output = kwargs.pop("output")
    f = kwargs.pop("format")
    compress = kwargs.pop("zip")
    mesh = kwargs.pop("mesh")
    geometry_type = kwargs.pop("geometry_type")
    filepath = kwargs.pop("filepath")

    # click hands back tuples for ``multiple=True`` options; forward None
    # when the option was not given.
    kwargs['table_area'] = list(kwargs['table_area']) or None
    kwargs['columns'] = list(kwargs['columns']) or None

    # Drop the defaults of options that belong to the parser not in use.
    kwargs = remove_extra(kwargs, mesh=mesh)

    if geometry_type is not None:
        plot_geometry(filepath, pages=pages, mesh=mesh,
                      geometry_type=geometry_type, **kwargs)
        return

    # Fail fast: check the export options before parsing the PDF so a
    # missing flag is reported without doing the parsing work first.
    if output is None:
        raise click.UsageError("Please specify an output filepath using --output")
    if f is None:
        raise click.UsageError("Please specify an output format using --format")

    tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
    click.echo(tables)
    tables.export(output, f=f, compress=compress)
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from .handlers import PDFHandler
|
from .handlers import PDFHandler
|
||||||
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
|
|
@ -78,17 +79,14 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
debug : bool, optional (default: False)
|
|
||||||
Whether or not to return all text objects on the page
|
|
||||||
which can be used to generate a matplotlib plot, to get
|
|
||||||
values for table_area(s) and debugging.
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
tables : camelot.core.TableList
|
tables : camelot.core.TableList
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# validate kwargs?
|
validate_input(kwargs, mesh=mesh)
|
||||||
p = PDFHandler(filepath, pages)
|
p = PDFHandler(filepath, pages)
|
||||||
|
kwargs = remove_extra(kwargs, mesh=mesh)
|
||||||
tables, __ = p.parse(mesh=mesh, **kwargs)
|
tables, __ = p.parse(mesh=mesh, **kwargs)
|
||||||
return tables
|
return tables
|
||||||
|
|
@ -3,9 +3,10 @@ import matplotlib.pyplot as plt
|
||||||
import matplotlib.patches as patches
|
import matplotlib.patches as patches
|
||||||
|
|
||||||
from .handlers import PDFHandler
|
from .handlers import PDFHandler
|
||||||
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
|
def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
|
||||||
"""Plot geometry found on pdf page based on type specified,
|
"""Plot geometry found on pdf page based on type specified,
|
||||||
useful for debugging and playing with different parameters to get
|
useful for debugging and playing with different parameters to get
|
||||||
the best output.
|
the best output.
|
||||||
|
|
@ -23,7 +24,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
|
||||||
mesh : bool (default: False)
|
mesh : bool (default: False)
|
||||||
Whether or not to use Lattice method of parsing. Stream
|
Whether or not to use Lattice method of parsing. Stream
|
||||||
is used by default.
|
is used by default.
|
||||||
geometry_type : str, optional (default: 'text')
|
geometry_type : str, optional (default: None)
|
||||||
'text' : Plot text objects found on page, useful to get
|
'text' : Plot text objects found on page, useful to get
|
||||||
table_area and columns coordinates.
|
table_area and columns coordinates.
|
||||||
'table' : Plot parsed table.
|
'table' : Plot parsed table.
|
||||||
|
|
@ -91,15 +92,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
|
||||||
PDFMiner margins. (char_margin, line_margin, word_margin)
|
PDFMiner margins. (char_margin, line_margin, word_margin)
|
||||||
|
|
||||||
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
For for information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
|
||||||
debug : bool, optional (default: False)
|
|
||||||
Whether or not to return all text objects on the page
|
|
||||||
which can be used to generate a matplotlib plot, to get
|
|
||||||
values for table_area(s) and debugging.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# validate kwargs?
|
validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
|
||||||
p = PDFHandler(filepath, pages)
|
p = PDFHandler(filepath, pages)
|
||||||
debug = True if geometry_type else False
|
kwargs = remove_extra(kwargs, mesh=mesh)
|
||||||
|
debug = True if geometry_type is not None else False
|
||||||
kwargs.update({'debug': debug})
|
kwargs.update({'debug': debug})
|
||||||
__, geometry = p.parse(mesh=mesh, **kwargs)
|
__, geometry = p.parse(mesh=mesh, **kwargs)
|
||||||
|
|
||||||
|
|
@ -140,8 +138,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
|
||||||
[cell.lb[1], cell.rb[1]])
|
[cell.lb[1], cell.rb[1]])
|
||||||
plt.show()
|
plt.show()
|
||||||
elif geometry_type == 'contour':
|
elif geometry_type == 'contour':
|
||||||
if not mesh:
|
|
||||||
raise ValueError("Use mesh=True")
|
|
||||||
for img, table_bbox in geometry.images:
|
for img, table_bbox in geometry.images:
|
||||||
for t in table_bbox.keys():
|
for t in table_bbox.keys():
|
||||||
cv2.rectangle(img, (t[0], t[1]),
|
cv2.rectangle(img, (t[0], t[1]),
|
||||||
|
|
@ -149,8 +145,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
|
||||||
plt.imshow(img)
|
plt.imshow(img)
|
||||||
plt.show()
|
plt.show()
|
||||||
elif geometry_type == 'joint':
|
elif geometry_type == 'joint':
|
||||||
if not mesh:
|
|
||||||
raise ValueError("Use mesh=True")
|
|
||||||
for img, table_bbox in geometry.images:
|
for img, table_bbox in geometry.images:
|
||||||
x_coord = []
|
x_coord = []
|
||||||
y_coord = []
|
y_coord = []
|
||||||
|
|
@ -164,8 +158,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
|
||||||
plt.imshow(img)
|
plt.imshow(img)
|
||||||
plt.show()
|
plt.show()
|
||||||
elif geometry_type == 'line':
|
elif geometry_type == 'line':
|
||||||
if not mesh:
|
|
||||||
raise ValueError("Use mesh=True")
|
|
||||||
for v_s, h_s in geometry.segments:
|
for v_s, h_s in geometry.segments:
|
||||||
for v in v_s:
|
for v in v_s:
|
||||||
plt.plot([v[0], v[2]], [v[1], v[3]])
|
plt.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,53 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
LTTextLineVertical)
|
LTTextLineVertical)
|
||||||
|
|
||||||
|
|
||||||
|
# Keyword arguments that validate_input rejects when mesh is True and that
# remove_extra strips when mesh is True.
stream_kwargs = ['columns', 'row_close_tol', 'col_close_tol']

# Keyword arguments that validate_input rejects when mesh is False and that
# remove_extra strips when mesh is False.
lattice_kwargs = [
    'process_background', 'line_size_scaling', 'copy_text',
    'shift_text', 'line_close_tol', 'joint_close_tol',
    'threshold_blocksize', 'threshold_constant', 'iterations',
]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_input(kwargs, mesh=False, geometry_type=False):
    """Reject keyword arguments that do not match the selected parser.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to check; only the keys are inspected.
    mesh : bool (default: False)
        When truthy, names listed in ``stream_kwargs`` are rejected;
        otherwise names listed in ``lattice_kwargs`` are rejected.
    geometry_type : str or bool, optional (default: False)
        When set to 'contour', 'joint' or 'line', ``mesh`` must be truthy.

    Raises
    ------
    ValueError
        If a rejected name is present in ``kwargs``, or if
        ``geometry_type`` requires ``mesh`` and it is not set.

    """
    if mesh:
        disallowed, mesh_label = stream_kwargs, True
    else:
        disallowed, mesh_label = lattice_kwargs, False

    clashing = set(kwargs.keys()).intersection(disallowed)
    if clashing:
        raise ValueError("{} can not be used with mesh set to {}".format(
            ",".join(sorted(clashing)), mesh_label))

    needs_mesh = ('contour', 'joint', 'line')
    if geometry_type and not mesh and geometry_type in needs_mesh:
        raise ValueError("Use geometry_type={} with mesh set to True".format(
            geometry_type))
|
||||||
|
|
||||||
|
|
||||||
|
def remove_extra(kwargs, mesh=False):
    """Strip keyword arguments that the selected parser does not accept.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to filter. The dict is modified in place.
    mesh : bool (default: False)
        When truthy, names listed in ``stream_kwargs`` are removed;
        otherwise names listed in ``lattice_kwargs`` are removed.

    Returns
    -------
    kwargs : dict
        The same dict object that was passed in, after filtering.

    """
    unwanted = stream_kwargs if mesh else lattice_kwargs
    # Iterate over the fixed list of names rather than over the dict being
    # modified: popping while iterating ``kwargs.keys()`` raises
    # "dictionary changed size during iteration" on Python 3.
    for key in unwanted:
        kwargs.pop(key, None)
    return kwargs
|
||||||
|
|
||||||
|
|
||||||
# https://stackoverflow.com/a/22726782
|
# https://stackoverflow.com/a/22726782
|
||||||
class TemporaryDirectory(object):
|
class TemporaryDirectory(object):
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
|
|
|
||||||
7
setup.py
7
setup.py
|
|
@ -49,7 +49,12 @@ def setup_package():
|
||||||
author_email=AUTHOR_EMAIL,
|
author_email=AUTHOR_EMAIL,
|
||||||
license=LICENSE,
|
license=LICENSE,
|
||||||
packages=['camelot'],
|
packages=['camelot'],
|
||||||
install_requires=reqs)
|
install_requires=reqs,
|
||||||
|
entry_points={
|
||||||
|
'console_scripts': [
|
||||||
|
'camelot = camelot.cli:cli',
|
||||||
|
],
|
||||||
|
})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue