diff --git a/camelot/cli.py b/camelot/cli.py index 822bd44..0704b66 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -5,10 +5,20 @@ import click from . import __version__ from .io import read_pdf -from .utils import validate_input, remove_extra -@click.command() +class Config(object): + def __init__(self): + self.config = {} + + def set_config(self, key, value): + self.config[key] = value + + +pass_config = click.make_pass_decorator(Config) + + +@click.group() @click.version_option(version=__version__) @click.option("-p", "--pages", default="1", help="Comma-separated page numbers" " to parse. Example: 1,3,4 or 1,4-end") @@ -18,9 +28,6 @@ from .utils import validate_input, remove_extra help="Output file format.") @click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP" " archive.") -@click.option("-T", "--table_area", default=[], multiple=True, - help="Table areas (x1,y1,x2,y2) to process.\n" - " x1, y1 -> left-top and x2, y2 -> right-bottom") @click.option("-split", "--split_text", is_flag=True, help="Whether or not to" " split text if it spans across multiple cells.") @click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or" @@ -28,86 +35,121 @@ from .utils import validate_input, remove_extra " super/subscripts)") @click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1), help="char_margin, line_margin, word_margin for PDFMiner.") -@click.option("-G", "--geometry_type", +@click.pass_context +def cli(ctx, *args, **kwargs): + ctx.obj = Config() + for key, value in kwargs.iteritems(): + ctx.obj.set_config(key, value) + + +@cli.command('lattice') +@click.option("-T", "--table_area", default=[], multiple=True, + help="Table areas (x1,y1,x2,y2) to process.\n" + " x1, y1 -> left-top and x2, y2 -> right-bottom") +@click.option("-back", "--process_background", is_flag=True, + help="(with --mesh) Whether or not to process lines that are in" + " background.") +@click.option("-scale", "--line_size_scaling", default=15, + help="(with --mesh) Factor by which the page dimensions will be" + " divided to get smallest length of detected lines.") +@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]), + multiple=True, help="(with --mesh) Specify direction" + " in which text will be copied over in a spanning cell.") +@click.option("-shift", "--shift_text", default=["l", "t"], + type=click.Choice(["", "l", "r", "t", "b"]), multiple=True, + help="(with --mesh) Specify direction in which text in a spanning" + " cell should flow.") +@click.option("-l", "--line_close_tol", default=2, + help="(with --mesh) Tolerance parameter used to merge close vertical" + " lines and close horizontal lines.") +@click.option("-j", "--joint_close_tol", default=2, + help="(with --mesh) Tolerance parameter used to decide whether" + " the detected lines and points lie close to each other.") +@click.option("-block", "--threshold_blocksize", default=15, + help="(with --mesh) For adaptive thresholding, size of a pixel" + " neighborhood that is used to calculate a threshold value for" + " the pixel: 3, 5, 7, and so on.") +@click.option("-const", "--threshold_constant", default=-2, + help="(with --mesh) For adaptive thresholding, constant subtracted" + " from the mean or weighted mean.\nNormally, it is positive but" + " may be zero or negative as well.") +@click.option("-I", "--iterations", default=0, + help="(with --mesh) Number of times for erosion/dilation is" + " applied.") +@click.option("-plot", "--plot_type", type=click.Choice(["text", "table", "contour", "joint", "line"]), - help="Plot geometry found on pdf page for debugging.\n\n" - "text: Plot text objects. (Useful to get table_area and" - " columns coordinates)\ntable: Plot parsed table.\n" - "contour (with --mesh): Plot detected rectangles.\njoint (with --mesh): Plot detected line" - " intersections.\nline (with --mesh): Plot detected lines.") + help="Plot geometry found on PDF page for debugging.") @click.argument("filepath", type=click.Path(exists=True)) -def cli(*args, **kwargs): - pages = kwargs.pop("pages") - output = kwargs.pop("output") - f = kwargs.pop("format") - compress = kwargs.pop("zip") - mesh = kwargs.pop("mesh") - geometry_type = kwargs.pop("geometry_type") +@pass_config +def lattice(c, *args, **kwargs): + """Use lines between text to generate table.""" + conf = c.config + pages = conf.pop("pages") + output = conf.pop("output") + f = conf.pop("format") + compress = conf.pop("zip") + plot_type = kwargs.pop('plot_type') filepath = kwargs.pop("filepath") + kwargs.update(conf) + + table_area = list(kwargs['table_area']) + kwargs['table_area'] = None if not table_area else table_area + copy_text = list(kwargs['copy_text']) + kwargs['copy_text'] = None if not copy_text else copy_text + kwargs['shift_text'] = list(kwargs['shift_text']) + + tables = read_pdf(filepath, pages=pages, flavor='lattice', **kwargs) + click.echo(tables) + if plot_type is not None: + for table in tables: + table.plot(plot_type) + else: + if output is None: + raise click.UsageError("Please specify output filepath using --output") + if f is None: + raise click.UsageError("Please specify output format using --format") + tables.export(output, f=f, compress=compress) + + +@cli.command('stream') +@click.option("-T", "--table_area", default=[], multiple=True, + help="Table areas (x1,y1,x2,y2) to process.\n" + " x1, y1 -> left-top and x2, y2 -> right-bottom") +@click.option("-C", "--columns", default=[], multiple=True, + help="x-coordinates of column separators.") +@click.option("-r", "--row_close_tol", default=2, help="Rows will be" + " formed by combining text vertically within this tolerance.") +@click.option("-c", "--col_close_tol", default=0, help="Columns will" + " be formed by combining text horizontally within this tolerance.") +@click.option("-plot", "--plot_type", + type=click.Choice(["text", "table"]), + help="Plot geometry found on PDF page for debugging.") +@click.argument("filepath", type=click.Path(exists=True)) +@pass_config +def stream(c, *args, **kwargs): + """Use spaces between text to generate table.""" + conf = c.config + pages = conf.pop("pages") + output = conf.pop("output") + f = conf.pop("format") + compress = conf.pop("zip") + plot_type = kwargs.pop('plot_type') + filepath = kwargs.pop("filepath") + kwargs.update(conf) table_area = list(kwargs['table_area']) kwargs['table_area'] = None if not table_area else table_area columns = list(kwargs['columns']) kwargs['columns'] = None if not columns else columns - copy_text = list(kwargs['copy_text']) - kwargs['copy_text'] = None if not copy_text else copy_text - kwargs['shift_text'] = list(kwargs['shift_text']) - kwargs = remove_extra(kwargs, mesh=mesh) - tables = read_pdf(filepath, pages=pages, mesh=mesh, **kwargs) + tables = read_pdf(filepath, pages=pages, flavor='stream', **kwargs) click.echo(tables) - if output is None: - raise click.UsageError("Please specify an output filepath using --output") - if f is None: - raise click.UsageError("Please specify an output format using --format") - tables.export(output, f=f, compress=compress) - - -@click.option("-T", "--table_area", default=[], multiple=True, - help="Table areas (x1,y1,x2,y2) to process.\n" - " x1, y1 -> left-top and x2, y2 -> right-bottom") -@click.option("-back", "--process_background", is_flag=True, cls=Mutex, - help="(with --mesh) Whether or not to process lines that are in" - " background.") -@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex, - help="(with --mesh) Factor by which the page dimensions will be" - " divided to get smallest length of detected lines.") -@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]), - multiple=True, cls=Mutex, help="(with --mesh) Specify direction" - " in which text will be copied over in a spanning cell.") -@click.option("-shift", "--shift_text", default=["l", "t"], - type=click.Choice(["", "l", "r", "t", "b"]), multiple=True, cls=Mutex, - help="(with --mesh) Specify direction in which text in a spanning" - " cell should flow.") -@click.option("-l", "--line_close_tol", default=2, cls=Mutex, - help="(with --mesh) Tolerance parameter used to merge close vertical" - " lines and close horizontal lines.") -@click.option("-j", "--joint_close_tol", default=2, cls=Mutex, - help="(with --mesh) Tolerance parameter used to decide whether" - " the detected lines and points lie close to each other.") -@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex, - help="(with --mesh) For adaptive thresholding, size of a pixel" - " neighborhood that is used to calculate a threshold value for" - " the pixel: 3, 5, 7, and so on.") -@click.option("-const", "--threshold_constant", default=-2, cls=Mutex, - help="(with --mesh) For adaptive thresholding, constant subtracted" - " from the mean or weighted mean.\nNormally, it is positive but" - " may be zero or negative as well.") -@click.option("-I", "--iterations", default=0, cls=Mutex, - help="(with --mesh) Number of times for erosion/dilation is" - " applied.") -def lattice(*args, **kwargs): - pass - - -@click.option("-T", "--table_area", default=[], multiple=True, - help="Table areas (x1,y1,x2,y2) to process.\n" - " x1, y1 -> left-top and x2, y2 -> right-bottom") -@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex, - help="x-coordinates of column separators.") -@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be" - " formed by combining text vertically within this tolerance.") -@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will" - " be formed by combining text horizontally within this tolerance.") -def stream(*args, **kwargs): - pass \ No newline at end of file + if plot_type is not None: + for table in tables: + table.plot(plot_type) + else: + if output is None: + raise click.UsageError("Please specify output filepath using --output") + if f is None: + raise click.UsageError("Please specify output format using --format") + tables.export(output, f=f, compress=compress) \ No newline at end of file diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 18ee0fc..4023efc 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -21,7 +21,7 @@ logger = setup_logging(__name__) class Lattice(BaseParser): """Lattice method of parsing looks for lines between text - to form a table. + to generate table. Parameters ---------- @@ -83,7 +83,7 @@ class Lattice(BaseParser): line_size_scaling=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, line_close_tol=2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, - iterations=0, margins=(1.0, 0.5, 0.1)): + iterations=0, margins=(1.0, 0.5, 0.1), **kwargs): self.table_area = table_area self.process_background = process_background self.line_size_scaling = line_size_scaling diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 2fcee66..9e2aca9 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -16,7 +16,7 @@ logger = setup_logging(__name__) class Stream(BaseParser): """Stream method of parsing looks for spaces between text - to form a table. + to generate table. If you want to specify columns when specifying multiple table areas, make sure that the length of both lists are equal. @@ -51,7 +51,7 @@ class Stream(BaseParser): """ def __init__(self, table_area=None, columns=None, split_text=False, flag_size=False, row_close_tol=2, col_close_tol=0, - margins=(1.0, 0.5, 0.1)): + margins=(1.0, 0.5, 0.1), **kwargs): self.table_area = table_area self.columns = columns self._validate_columns()