pull/2/head
Vinayak Mehta 2018-09-10 15:16:41 +05:30
parent fcef880e6c
commit 7bb1aee9b6
5 changed files with 146 additions and 21 deletions

View File

@ -1 +1,84 @@
# -*- coding: utf-8 -*-
from pprint import pprint
import click import click
from .io import read_pdf
from .plotting import plot_geometry
from .utils import validate_input, remove_extra
class Mutex(click.Option):
    """Click option that validates parser-specific flags while parsing.

    Every option declared with ``cls=Mutex`` runs ``validate_input`` on the
    parsed options before handing control back to ``click.Option``.
    """

    def handle_parse_result(self, ctx, opts, args):
        """Validate ``opts`` and then delegate to ``click.Option``.

        Parameters
        ----------
        ctx : click.Context
        opts : dict
            Parsed options; ``mesh`` and ``geometry_type`` are looked up
            here and treated as False when absent.
        args : list

        Raises
        ------
        click.UsageError
            If ``validate_input`` rejects the combination of options.
        """
        mesh = opts.get('mesh', False)
        geometry_type = opts.get('geometry_type', False)
        try:
            validate_input(opts, mesh=mesh, geometry_type=geometry_type)
        except ValueError as err:
            # Report invalid flag combinations the same way ``cli`` reports
            # its own usage problems, instead of leaking a raw traceback.
            raise click.UsageError(str(err), ctx=ctx)
        return super(Mutex, self).handle_parse_result(ctx, opts, args)
# Command-line entry point: either parses tables from FILEPATH and exports
# them, or (when --geometry_type is given) plots page geometry for debugging.
# Deliberately documented with comments rather than a docstring, because
# click would use a docstring as the command's --help text.
#
# NOTE(review): --copy_text and --shift_text have list defaults but are not
# declared with multiple=True -- confirm how click parses a value passed on
# the command line for these two options.
@click.command()
@click.option("-p", "--pages", default="1", help="")
@click.option("-o", "--output", help="")
@click.option("-f", "--format",
              type=click.Choice(["csv", "json", "excel", "html"]), help="")
@click.option("-z", "--zip", is_flag=True, help="")
@click.option("-m", "--mesh", is_flag=True, help="Whether or not to "
              "use Lattice method of parsing. Stream is used by default.")
@click.option("-G", "--geometry_type",
              type=click.Choice(["text", "table", "contour", "joint", "line"]),
              help="Plot geometry found on pdf page for debugging.")
@click.option("-T", "--table_area", default=[], multiple=True,
              help="")
@click.option("-split", "--split_text", is_flag=True, help="")
@click.option("-flag", "--flag_size", is_flag=True, help="")
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
              help="")
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
              help="")
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="")
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="")
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
              help="Use with --mesh")
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
              help="Use with --mesh")
@click.option("-copy", "--copy_text", default=[], cls=Mutex,
              help="Use with --mesh")
@click.option("-shift", "--shift_text", default=["l", "t"], cls=Mutex,
              help="Use with --mesh")
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
              help="Use with --mesh")
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
              help="Use with --mesh")
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
              help="Use with --mesh")
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
              help="Use with --mesh")
@click.option("-I", "--iterations", default=0, cls=Mutex,
              help="Use with --mesh")
@click.argument("filepath", type=click.Path(exists=True))
def cli(*args, **kwargs):
    # Options consumed here; everything left in kwargs is forwarded to
    # read_pdf / plot_geometry as parser keyword arguments.
    pages = kwargs.pop("pages")
    output = kwargs.pop("output")
    f = kwargs.pop("format")
    compress = kwargs.pop("zip")
    mesh = kwargs.pop("mesh")
    geometry_type = kwargs.pop("geometry_type")
    filepath = kwargs.pop("filepath")

    # click returns tuples for multiple=True options; pass a list downstream,
    # or None when the option was not given at all.
    table_area = list(kwargs['table_area'])
    kwargs['table_area'] = None if not table_area else table_area
    columns = list(kwargs['columns'])
    kwargs['columns'] = None if not columns else columns

    # Drop the defaults that belong to the parser which is not selected.
    kwargs = remove_extra(kwargs, mesh=mesh)
    if geometry_type is None:
        tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
        click.echo(tables)
        if output is None:
            raise click.UsageError("Please specify an output filepath using --output")
        if f is None:
            raise click.UsageError("Please specify an output format using --format")
        tables.export(output, f=f, compress=compress)
    else:
        plot_geometry(filepath, pages=pages, mesh=mesh,
                      geometry_type=geometry_type, **kwargs)

View File

@ -1,4 +1,5 @@
from .handlers import PDFHandler from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', mesh=False, **kwargs): def read_pdf(filepath, pages='1', mesh=False, **kwargs):
@ -78,17 +79,14 @@ def read_pdf(filepath, pages='1', mesh=False, **kwargs):
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
Returns Returns
------- -------
tables : camelot.core.TableList tables : camelot.core.TableList
""" """
# validate kwargs? validate_input(kwargs, mesh=mesh)
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
kwargs = remove_extra(kwargs, mesh=mesh)
tables, __ = p.parse(mesh=mesh, **kwargs) tables, __ = p.parse(mesh=mesh, **kwargs)
return tables return tables

View File

@ -3,9 +3,10 @@ import matplotlib.pyplot as plt
import matplotlib.patches as patches import matplotlib.patches as patches
from .handlers import PDFHandler from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs): def plot_geometry(filepath, pages='1', mesh=False, geometry_type=None, **kwargs):
"""Plot geometry found on pdf page based on type specified, """Plot geometry found on pdf page based on type specified,
useful for debugging and playing with different parameters to get useful for debugging and playing with different parameters to get
the best output. the best output.
@ -23,7 +24,7 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
mesh : bool (default: False) mesh : bool (default: False)
Whether or not to use Lattice method of parsing. Stream Whether or not to use Lattice method of parsing. Stream
is used by default. is used by default.
geometry_type : str, optional (default: 'text') geometry_type : str, optional (default: None)
'text' : Plot text objects found on page, useful to get 'text' : Plot text objects found on page, useful to get
table_area and columns coordinates. table_area and columns coordinates.
'table' : Plot parsed table. 'table' : Plot parsed table.
@ -91,15 +92,12 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
PDFMiner margins. (char_margin, line_margin, word_margin) PDFMiner margins. (char_margin, line_margin, word_margin)
For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_. For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
debug : bool, optional (default: False)
Whether or not to return all text objects on the page
which can be used to generate a matplotlib plot, to get
values for table_area(s) and debugging.
""" """
# validate kwargs? validate_input(kwargs, mesh=mesh, geometry_type=geometry_type)
p = PDFHandler(filepath, pages) p = PDFHandler(filepath, pages)
debug = True if geometry_type else False kwargs = remove_extra(kwargs, mesh=mesh)
debug = True if geometry_type is not None else False
kwargs.update({'debug': debug}) kwargs.update({'debug': debug})
__, geometry = p.parse(mesh=mesh, **kwargs) __, geometry = p.parse(mesh=mesh, **kwargs)
@ -140,8 +138,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
[cell.lb[1], cell.rb[1]]) [cell.lb[1], cell.rb[1]])
plt.show() plt.show()
elif geometry_type == 'contour': elif geometry_type == 'contour':
if not mesh:
raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images: for img, table_bbox in geometry.images:
for t in table_bbox.keys(): for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]), cv2.rectangle(img, (t[0], t[1]),
@ -149,8 +145,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img) plt.imshow(img)
plt.show() plt.show()
elif geometry_type == 'joint': elif geometry_type == 'joint':
if not mesh:
raise ValueError("Use mesh=True")
for img, table_bbox in geometry.images: for img, table_bbox in geometry.images:
x_coord = [] x_coord = []
y_coord = [] y_coord = []
@ -164,8 +158,6 @@ def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwarg
plt.imshow(img) plt.imshow(img)
plt.show() plt.show()
elif geometry_type == 'line': elif geometry_type == 'line':
if not mesh:
raise ValueError("Use mesh=True")
for v_s, h_s in geometry.segments: for v_s, h_s in geometry.segments:
for v in v_s: for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]]) plt.plot([v[0], v[2]], [v[1], v[3]])

View File

@ -20,6 +20,53 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical) LTTextLineVertical)
# Keyword arguments that cannot be combined with mesh=True: validate_input
# rejects them and remove_extra drops them in that mode.
stream_kwargs = ['columns', 'row_close_tol', 'col_close_tol']

# Keyword arguments that cannot be combined with mesh=False: validate_input
# rejects them and remove_extra drops them in that mode.
lattice_kwargs = [
    'process_background', 'line_size_scaling', 'copy_text',
    'shift_text', 'line_close_tol', 'joint_close_tol',
    'threshold_blocksize', 'threshold_constant', 'iterations',
]
def validate_input(kwargs, mesh=False, geometry_type=False):
    """Reject keyword arguments that do not fit the selected mode.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments supplied by the caller; only the keys are checked.
    mesh : bool, optional (default: False)
        Selects which list of names is forbidden: ``stream_kwargs`` when
        truthy, ``lattice_kwargs`` otherwise.
    geometry_type : str, optional (default: False)
        When truthy, 'contour', 'joint' and 'line' are only accepted
        together with a truthy ``mesh``.

    Raises
    ------
    ValueError
        If a forbidden keyword is present, or if ``geometry_type`` needs
        ``mesh`` to be set.
    """
    if mesh:
        forbidden, mesh_label = stream_kwargs, True
    else:
        forbidden, mesh_label = lattice_kwargs, False

    clashing = set(forbidden) & set(kwargs.keys())
    if clashing:
        names = ",".join(sorted(clashing))
        raise ValueError(
            "{} can not be used with mesh set to {}".format(names, mesh_label))

    mesh_only = ('contour', 'joint', 'line')
    if geometry_type and not mesh and geometry_type in mesh_only:
        raise ValueError(
            "Use geometry_type={} with mesh set to True".format(geometry_type))
def remove_extra(kwargs, mesh=False):
    """Drop the keyword arguments that do not apply to the selected mode.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to filter; modified in place.
    mesh : bool, optional (default: False)
        When truthy the names in ``stream_kwargs`` are removed, otherwise
        the names in ``lattice_kwargs`` are removed.

    Returns
    -------
    kwargs : dict
        The same dict object that was passed in, after filtering.
    """
    unwanted = stream_kwargs if mesh else lattice_kwargs
    # Iterate over the fixed list of names rather than over the dict itself:
    # popping while iterating ``kwargs.keys()`` raises RuntimeError on
    # Python 3, where ``keys()`` is a live view of the dict.
    for key in unwanted:
        kwargs.pop(key, None)
    return kwargs
# https://stackoverflow.com/a/22726782 # https://stackoverflow.com/a/22726782
class TemporaryDirectory(object): class TemporaryDirectory(object):
def __enter__(self): def __enter__(self):

View File

@ -49,7 +49,12 @@ def setup_package():
author_email=AUTHOR_EMAIL, author_email=AUTHOR_EMAIL,
license=LICENSE, license=LICENSE,
packages=['camelot'], packages=['camelot'],
install_requires=reqs) install_requires=reqs,
entry_points={
'console_scripts': [
'camelot = camelot.cli:cli',
],
})
try: try:
from setuptools import setup from setuptools import setup