Fix CLI
parent
7aaa7b2460
commit
959a252aa3
192
camelot/cli.py
192
camelot/cli.py
|
|
@ -5,10 +5,20 @@ import click
|
||||||
|
|
||||||
from . import __version__
|
from . import __version__
|
||||||
from .io import read_pdf
|
from .io import read_pdf
|
||||||
from .utils import validate_input, remove_extra
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
class Config(object):
|
||||||
|
def __init__(self):
|
||||||
|
self.config = {}
|
||||||
|
|
||||||
|
def set_config(self, key, value):
|
||||||
|
self.config[key] = value
|
||||||
|
|
||||||
|
|
||||||
|
pass_config = click.make_pass_decorator(Config)
|
||||||
|
|
||||||
|
|
||||||
|
@click.group()
|
||||||
@click.version_option(version=__version__)
|
@click.version_option(version=__version__)
|
||||||
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
|
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
|
||||||
" to parse. Example: 1,3,4 or 1,4-end")
|
" to parse. Example: 1,3,4 or 1,4-end")
|
||||||
|
|
@ -18,9 +28,6 @@ from .utils import validate_input, remove_extra
|
||||||
help="Output file format.")
|
help="Output file format.")
|
||||||
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
|
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
|
||||||
" archive.")
|
" archive.")
|
||||||
@click.option("-T", "--table_area", default=[], multiple=True,
|
|
||||||
help="Table areas (x1,y1,x2,y2) to process.\n"
|
|
||||||
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
|
||||||
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
|
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
|
||||||
" split text if it spans across multiple cells.")
|
" split text if it spans across multiple cells.")
|
||||||
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
|
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
|
||||||
|
|
@ -28,86 +35,121 @@ from .utils import validate_input, remove_extra
|
||||||
" super/subscripts)")
|
" super/subscripts)")
|
||||||
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
|
||||||
help="char_margin, line_margin, word_margin for PDFMiner.")
|
help="char_margin, line_margin, word_margin for PDFMiner.")
|
||||||
@click.option("-G", "--geometry_type",
|
@click.pass_context
|
||||||
|
def cli(ctx, *args, **kwargs):
|
||||||
|
ctx.obj = Config()
|
||||||
|
for key, value in kwargs.iteritems():
|
||||||
|
ctx.obj.set_config(key, value)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('lattice')
|
||||||
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
|
@click.option("-back", "--process_background", is_flag=True,
|
||||||
|
help="(with --mesh) Whether or not to process lines that are in"
|
||||||
|
" background.")
|
||||||
|
@click.option("-scale", "--line_size_scaling", default=15,
|
||||||
|
help="(with --mesh) Factor by which the page dimensions will be"
|
||||||
|
" divided to get smallest length of detected lines.")
|
||||||
|
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
|
||||||
|
multiple=True, help="(with --mesh) Specify direction"
|
||||||
|
" in which text will be copied over in a spanning cell.")
|
||||||
|
@click.option("-shift", "--shift_text", default=["l", "t"],
|
||||||
|
type=click.Choice(["", "l", "r", "t", "b"]), multiple=True,
|
||||||
|
help="(with --mesh) Specify direction in which text in a spanning"
|
||||||
|
" cell should flow.")
|
||||||
|
@click.option("-l", "--line_close_tol", default=2,
|
||||||
|
help="(with --mesh) Tolerance parameter used to merge close vertical"
|
||||||
|
" lines and close horizontal lines.")
|
||||||
|
@click.option("-j", "--joint_close_tol", default=2,
|
||||||
|
help="(with --mesh) Tolerance parameter used to decide whether"
|
||||||
|
" the detected lines and points lie close to each other.")
|
||||||
|
@click.option("-block", "--threshold_blocksize", default=15,
|
||||||
|
help="(with --mesh) For adaptive thresholding, size of a pixel"
|
||||||
|
" neighborhood that is used to calculate a threshold value for"
|
||||||
|
" the pixel: 3, 5, 7, and so on.")
|
||||||
|
@click.option("-const", "--threshold_constant", default=-2,
|
||||||
|
help="(with --mesh) For adaptive thresholding, constant subtracted"
|
||||||
|
" from the mean or weighted mean.\nNormally, it is positive but"
|
||||||
|
" may be zero or negative as well.")
|
||||||
|
@click.option("-I", "--iterations", default=0,
|
||||||
|
help="(with --mesh) Number of times for erosion/dilation is"
|
||||||
|
" applied.")
|
||||||
|
@click.option("-plot", "--plot_type",
|
||||||
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
type=click.Choice(["text", "table", "contour", "joint", "line"]),
|
||||||
help="Plot geometry found on pdf page for debugging.\n\n"
|
help="Plot geometry found on PDF page for debugging.")
|
||||||
"text: Plot text objects. (Useful to get table_area and"
|
|
||||||
" columns coordinates)\ntable: Plot parsed table.\n"
|
|
||||||
"contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line"
|
|
||||||
" intersections.\nline (with --mesh): Plot detected lines.")
|
|
||||||
@click.argument("filepath", type=click.Path(exists=True))
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
def cli(*args, **kwargs):
|
@pass_config
|
||||||
pages = kwargs.pop("pages")
|
def lattice(c, *args, **kwargs):
|
||||||
output = kwargs.pop("output")
|
"""Use lines between text to generate table."""
|
||||||
f = kwargs.pop("format")
|
conf = c.config
|
||||||
compress = kwargs.pop("zip")
|
pages = conf.pop("pages")
|
||||||
mesh = kwargs.pop("mesh")
|
output = conf.pop("output")
|
||||||
geometry_type = kwargs.pop("geometry_type")
|
f = conf.pop("format")
|
||||||
|
compress = conf.pop("zip")
|
||||||
|
plot_type = kwargs.pop('plot_type')
|
||||||
filepath = kwargs.pop("filepath")
|
filepath = kwargs.pop("filepath")
|
||||||
|
kwargs.update(conf)
|
||||||
|
|
||||||
|
table_area = list(kwargs['table_area'])
|
||||||
|
kwargs['table_area'] = None if not table_area else table_area
|
||||||
|
copy_text = list(kwargs['copy_text'])
|
||||||
|
kwargs['copy_text'] = None if not copy_text else copy_text
|
||||||
|
kwargs['shift_text'] = list(kwargs['shift_text'])
|
||||||
|
|
||||||
|
tables = read_pdf(filepath, pages=pages, flavor='lattice', **kwargs)
|
||||||
|
click.echo(tables)
|
||||||
|
if plot_type is not None:
|
||||||
|
for table in tables:
|
||||||
|
table.plot(plot_type)
|
||||||
|
else:
|
||||||
|
if output is None:
|
||||||
|
raise click.UsageError("Please specify output filepath using --output")
|
||||||
|
if f is None:
|
||||||
|
raise click.UsageError("Please specify output format using --format")
|
||||||
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command('stream')
|
||||||
|
@click.option("-T", "--table_area", default=[], multiple=True,
|
||||||
|
help="Table areas (x1,y1,x2,y2) to process.\n"
|
||||||
|
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
||||||
|
@click.option("-C", "--columns", default=[], multiple=True,
|
||||||
|
help="x-coordinates of column separators.")
|
||||||
|
@click.option("-r", "--row_close_tol", default=2, help="Rows will be"
|
||||||
|
" formed by combining text vertically within this tolerance.")
|
||||||
|
@click.option("-c", "--col_close_tol", default=0, help="Columns will"
|
||||||
|
" be formed by combining text horizontally within this tolerance.")
|
||||||
|
@click.option("-plot", "--plot_type",
|
||||||
|
type=click.Choice(["text", "table"]),
|
||||||
|
help="Plot geometry found on PDF page for debugging.")
|
||||||
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
|
@pass_config
|
||||||
|
def stream(c, *args, **kwargs):
|
||||||
|
"""Use spaces between text to generate table."""
|
||||||
|
conf = c.config
|
||||||
|
pages = conf.pop("pages")
|
||||||
|
output = conf.pop("output")
|
||||||
|
f = conf.pop("format")
|
||||||
|
compress = conf.pop("zip")
|
||||||
|
plot_type = kwargs.pop('plot_type')
|
||||||
|
filepath = kwargs.pop("filepath")
|
||||||
|
kwargs.update(conf)
|
||||||
|
|
||||||
table_area = list(kwargs['table_area'])
|
table_area = list(kwargs['table_area'])
|
||||||
kwargs['table_area'] = None if not table_area else table_area
|
kwargs['table_area'] = None if not table_area else table_area
|
||||||
columns = list(kwargs['columns'])
|
columns = list(kwargs['columns'])
|
||||||
kwargs['columns'] = None if not columns else columns
|
kwargs['columns'] = None if not columns else columns
|
||||||
copy_text = list(kwargs['copy_text'])
|
|
||||||
kwargs['copy_text'] = None if not copy_text else copy_text
|
|
||||||
kwargs['shift_text'] = list(kwargs['shift_text'])
|
|
||||||
|
|
||||||
kwargs = remove_extra(kwargs, mesh=mesh)
|
tables = read_pdf(filepath, pages=pages, flavor='stream', **kwargs)
|
||||||
tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs)
|
|
||||||
click.echo(tables)
|
click.echo(tables)
|
||||||
|
if plot_type is not None:
|
||||||
|
for table in tables:
|
||||||
|
table.plot(plot_type)
|
||||||
|
else:
|
||||||
if output is None:
|
if output is None:
|
||||||
raise click.UsageError("Please specify an output filepath using --output")
|
raise click.UsageError("Please specify output filepath using --output")
|
||||||
if f is None:
|
if f is None:
|
||||||
raise click.UsageError("Please specify an output format using --format")
|
raise click.UsageError("Please specify output format using --format")
|
||||||
tables.export(output, f=f, compress=compress)
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
@click.option("-T", "--table_area", default=[], multiple=True,
|
|
||||||
help="Table areas (x1,y1,x2,y2) to process.\n"
|
|
||||||
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
|
||||||
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
|
|
||||||
help="(with --mesh) Whether or not to process lines that are in"
|
|
||||||
" background.")
|
|
||||||
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
|
|
||||||
help="(with --mesh) Factor by which the page dimensions will be"
|
|
||||||
" divided to get smallest length of detected lines.")
|
|
||||||
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
|
|
||||||
multiple=True, cls=Mutex, help="(with --mesh) Specify direction"
|
|
||||||
" in which text will be copied over in a spanning cell.")
|
|
||||||
@click.option("-shift", "--shift_text", default=["l", "t"],
|
|
||||||
type=click.Choice(["", "l", "r", "t", "b"]), multiple=True, cls=Mutex,
|
|
||||||
help="(with --mesh) Specify direction in which text in a spanning"
|
|
||||||
" cell should flow.")
|
|
||||||
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
|
|
||||||
help="(with --mesh) Tolerance parameter used to merge close vertical"
|
|
||||||
" lines and close horizontal lines.")
|
|
||||||
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
|
|
||||||
help="(with --mesh) Tolerance parameter used to decide whether"
|
|
||||||
" the detected lines and points lie close to each other.")
|
|
||||||
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
|
|
||||||
help="(with --mesh) For adaptive thresholding, size of a pixel"
|
|
||||||
" neighborhood that is used to calculate a threshold value for"
|
|
||||||
" the pixel: 3, 5, 7, and so on.")
|
|
||||||
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
|
|
||||||
help="(with --mesh) For adaptive thresholding, constant subtracted"
|
|
||||||
" from the mean or weighted mean.\nNormally, it is positive but"
|
|
||||||
" may be zero or negative as well.")
|
|
||||||
@click.option("-I", "--iterations", default=0, cls=Mutex,
|
|
||||||
help="(with --mesh) Number of times for erosion/dilation is"
|
|
||||||
" applied.")
|
|
||||||
def lattice(*args, **kwargs):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@click.option("-T", "--table_area", default=[], multiple=True,
|
|
||||||
help="Table areas (x1,y1,x2,y2) to process.\n"
|
|
||||||
" x1, y1 -> left-top and x2, y2 -> right-bottom")
|
|
||||||
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
|
|
||||||
help="x-coordinates of column separators.")
|
|
||||||
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
|
|
||||||
" formed by combining text vertically within this tolerance.")
|
|
||||||
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
|
|
||||||
" be formed by combining text horizontally within this tolerance.")
|
|
||||||
def stream(*args, **kwargs):
|
|
||||||
pass
|
|
||||||
|
|
@ -21,7 +21,7 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
class Lattice(BaseParser):
|
class Lattice(BaseParser):
|
||||||
"""Lattice method of parsing looks for lines between text
|
"""Lattice method of parsing looks for lines between text
|
||||||
to form a table.
|
to generate table.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -83,7 +83,7 @@ class Lattice(BaseParser):
|
||||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||||
split_text=False, flag_size=False, line_close_tol=2,
|
split_text=False, flag_size=False, line_close_tol=2,
|
||||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||||
iterations=0, margins=(1.0, 0.5, 0.1)):
|
iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_size_scaling = line_size_scaling
|
self.line_size_scaling = line_size_scaling
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ logger = setup_logging(__name__)
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
"""Stream method of parsing looks for spaces between text
|
"""Stream method of parsing looks for spaces between text
|
||||||
to form a table.
|
to generate table.
|
||||||
|
|
||||||
If you want to specify columns when specifying multiple table
|
If you want to specify columns when specifying multiple table
|
||||||
areas, make sure that the length of both lists are equal.
|
areas, make sure that the length of both lists are equal.
|
||||||
|
|
@ -51,7 +51,7 @@ class Stream(BaseParser):
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, split_text=False,
|
def __init__(self, table_area=None, columns=None, split_text=False,
|
||||||
flag_size=False, row_close_tol=2, col_close_tol=0,
|
flag_size=False, row_close_tol=2, col_close_tol=0,
|
||||||
margins=(1.0, 0.5, 0.1)):
|
margins=(1.0, 0.5, 0.1), **kwargs):
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue