pull/2/head
Vinayak Mehta 2018-09-23 12:45:01 +05:30
parent 7aaa7b2460
commit 959a252aa3
3 changed files with 124 additions and 82 deletions

View File

@ -5,10 +5,20 @@ import click
from . import __version__ from . import __version__
from .io import read_pdf from .io import read_pdf
from .utils import validate_input, remove_extra
class Config(object):
    """Container for the options given to the top-level ``cli`` group.

    The group callback stores one instance on the click context so that
    the subcommands can read the shared options from ``config``.

    Attributes
    ----------
    config : dict
        Maps option name to the value parsed from the command line.

    """

    def __init__(self):
        # A fresh dict per instance; nothing is shared between instances.
        self.config = {}

    def set_config(self, key, value):
        """Store ``value`` under ``key``, replacing any earlier value.

        Parameters
        ----------
        key : str
            Option name.
        value : object
            Parsed option value.

        """
        self.config[key] = value
# Decorator for subcommands: hands the Config object stored on the click
# context (``ctx.obj``, set by the ``cli`` group callback) to the command
# as its first positional argument.
pass_config = click.make_pass_decorator(Config)
@click.group()
@click.version_option(version=__version__) @click.version_option(version=__version__)
@click.option("-p", "--pages", default="1", help="Comma-separated page numbers" @click.option("-p", "--pages", default="1", help="Comma-separated page numbers"
" to parse. Example: 1,3,4 or 1,4-end") " to parse. Example: 1,3,4 or 1,4-end")
@ -18,9 +28,6 @@ from .utils import validate_input, remove_extra
help="Output file format.") help="Output file format.")
@click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP" @click.option("-z", "--zip", is_flag=True, help="Whether or not to create a ZIP"
" archive.") " archive.")
@click.option("-T", "--table_area", default=[], multiple=True,
help="Table areas (x1,y1,x2,y2) to process.\n"
" x1, y1 -> left-top and x2, y2 -> right-bottom")
@click.option("-split", "--split_text", is_flag=True, help="Whether or not to" @click.option("-split", "--split_text", is_flag=True, help="Whether or not to"
" split text if it spans across multiple cells.") " split text if it spans across multiple cells.")
@click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or" @click.option("-flag", "--flag_size", is_flag=True, help="(inactive) Whether or"
@ -28,86 +35,121 @@ from .utils import validate_input, remove_extra
" super/subscripts)") " super/subscripts)")
@click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1), @click.option("-M", "--margins", nargs=3, default=(1.0, 0.5, 0.1),
help="char_margin, line_margin, word_margin for PDFMiner.") help="char_margin, line_margin, word_margin for PDFMiner.")
@click.option("-G", "--geometry_type", @click.pass_context
def cli(ctx, *args, **kwargs):
    # NOTE: intentionally no docstring -- click would print it as the
    # group's --help text, which would change the CLI output.
    #
    # Stash every group-level option on a Config object so the subcommands
    # can retrieve them through the pass_config decorator.
    ctx.obj = Config()
    # dict.items() works on both Python 2 and 3; dict.iteritems() exists
    # only on Python 2 and raises AttributeError on Python 3.
    for key, value in kwargs.items():
        ctx.obj.set_config(key, value)
@cli.command('lattice')
@click.option("-T", "--table_area", default=[], multiple=True,
              help="Table areas (x1,y1,x2,y2) to process.\n"
              " x1, y1 -> left-top and x2, y2 -> right-bottom")
@click.option("-back", "--process_background", is_flag=True,
              help="Whether or not to process lines that are in"
              " background.")
@click.option("-scale", "--line_size_scaling", default=15,
              help="Factor by which the page dimensions will be"
              " divided to get smallest length of detected lines.")
@click.option("-copy", "--copy_text", default=[],
              type=click.Choice(["h", "v"]), multiple=True,
              help="Specify direction"
              " in which text will be copied over in a spanning cell.")
@click.option("-shift", "--shift_text", default=["l", "t"],
              type=click.Choice(["", "l", "r", "t", "b"]), multiple=True,
              help="Specify direction in which text in a spanning"
              " cell should flow.")
@click.option("-l", "--line_close_tol", default=2,
              help="Tolerance parameter used to merge close vertical"
              " lines and close horizontal lines.")
@click.option("-j", "--joint_close_tol", default=2,
              help="Tolerance parameter used to decide whether"
              " the detected lines and points lie close to each other.")
@click.option("-block", "--threshold_blocksize", default=15,
              help="For adaptive thresholding, size of a pixel"
              " neighborhood that is used to calculate a threshold value for"
              " the pixel: 3, 5, 7, and so on.")
@click.option("-const", "--threshold_constant", default=-2,
              help="For adaptive thresholding, constant subtracted"
              " from the mean or weighted mean.\nNormally, it is positive but"
              " may be zero or negative as well.")
@click.option("-I", "--iterations", default=0,
              help="Number of times for erosion/dilation is"
              " applied.")
@click.option("-plot", "--plot_type",
              type=click.Choice(["text", "table", "contour", "joint", "line"]),
              help="Plot geometry found on PDF page for debugging.")
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def lattice(c, *args, **kwargs):
    """Use lines between text to generate table."""
    # NOTE: the docstring above is shown by click as the command's help
    # text, so it is kept to that single line; details live in comments.
    #
    # c      -- Config object injected by pass_config; c.config holds the
    #           options parsed by the parent ``cli`` group.
    # kwargs -- options and arguments declared on this subcommand.

    # Work on a copy so the shared group configuration is not mutated.
    conf = dict(c.config)
    pages = conf.pop("pages")
    output = conf.pop("output")
    f = conf.pop("format")
    compress = conf.pop("zip")
    plot_type = kwargs.pop('plot_type')
    filepath = kwargs.pop("filepath")

    # Exporting needs both a destination and a format. Check before the
    # PDF is parsed so a missing flag fails immediately instead of after
    # all the parsing work has been done.
    if plot_type is None:
        if output is None:
            raise click.UsageError(
                "Please specify output filepath using --output")
        if f is None:
            raise click.UsageError(
                "Please specify output format using --format")

    # The remaining group-level options are forwarded to read_pdf together
    # with the lattice-specific ones.
    kwargs.update(conf)

    # click delivers ``multiple=True`` options as tuples; convert them to
    # lists and pass None instead of an empty list when nothing was given.
    kwargs['table_area'] = list(kwargs['table_area']) or None
    kwargs['copy_text'] = list(kwargs['copy_text']) or None
    kwargs['shift_text'] = list(kwargs['shift_text'])

    tables = read_pdf(filepath, pages=pages, flavor='lattice', **kwargs)
    click.echo(tables)
    if plot_type is not None:
        for table in tables:
            table.plot(plot_type)
    else:
        tables.export(output, f=f, compress=compress)
@cli.command('stream')
@click.option("-T", "--table_area", default=[], multiple=True,
              help="Table areas (x1,y1,x2,y2) to process.\n"
              " x1, y1 -> left-top and x2, y2 -> right-bottom")
@click.option("-C", "--columns", default=[], multiple=True,
              help="x-coordinates of column separators.")
@click.option("-r", "--row_close_tol", default=2, help="Rows will be"
              " formed by combining text vertically within this tolerance.")
@click.option("-c", "--col_close_tol", default=0, help="Columns will"
              " be formed by combining text horizontally within this"
              " tolerance.")
@click.option("-plot", "--plot_type",
              type=click.Choice(["text", "table"]),
              help="Plot geometry found on PDF page for debugging.")
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def stream(c, *args, **kwargs):
    """Use spaces between text to generate table."""
    # NOTE: the docstring above is shown by click as the command's help
    # text, so it is kept to that single line; details live in comments.
    #
    # c      -- Config object injected by pass_config; c.config holds the
    #           options parsed by the parent ``cli`` group.
    # kwargs -- options and arguments declared on this subcommand.

    # Work on a copy so the shared group configuration is not mutated.
    conf = dict(c.config)
    pages = conf.pop("pages")
    output = conf.pop("output")
    f = conf.pop("format")
    compress = conf.pop("zip")
    plot_type = kwargs.pop('plot_type')
    filepath = kwargs.pop("filepath")

    # Exporting needs both a destination and a format. Check before the
    # PDF is parsed so a missing flag fails immediately instead of after
    # all the parsing work has been done.
    if plot_type is None:
        if output is None:
            raise click.UsageError(
                "Please specify output filepath using --output")
        if f is None:
            raise click.UsageError(
                "Please specify output format using --format")

    # The remaining group-level options are forwarded to read_pdf together
    # with the stream-specific ones.
    kwargs.update(conf)

    # click delivers ``multiple=True`` options as tuples; convert them to
    # lists and pass None instead of an empty list when nothing was given.
    kwargs['table_area'] = list(kwargs['table_area']) or None
    kwargs['columns'] = list(kwargs['columns']) or None

    tables = read_pdf(filepath, pages=pages, flavor='stream', **kwargs)
    click.echo(tables)
    if plot_type is not None:
        for table in tables:
            table.plot(plot_type)
    else:
        tables.export(output, f=f, compress=compress)
@click.option("-back", "--process_background", is_flag=True, cls=Mutex,
help="(with --mesh) Whether or not to process lines that are in"
" background.")
@click.option("-scale", "--line_size_scaling", default=15, cls=Mutex,
help="(with --mesh) Factor by which the page dimensions will be"
" divided to get smallest length of detected lines.")
@click.option("-copy", "--copy_text", default=[], type=click.Choice(["h", "v"]),
multiple=True, cls=Mutex, help="(with --mesh) Specify direction"
" in which text will be copied over in a spanning cell.")
@click.option("-shift", "--shift_text", default=["l", "t"],
type=click.Choice(["", "l", "r", "t", "b"]), multiple=True, cls=Mutex,
help="(with --mesh) Specify direction in which text in a spanning"
" cell should flow.")
@click.option("-l", "--line_close_tol", default=2, cls=Mutex,
help="(with --mesh) Tolerance parameter used to merge close vertical"
" lines and close horizontal lines.")
@click.option("-j", "--joint_close_tol", default=2, cls=Mutex,
help="(with --mesh) Tolerance parameter used to decide whether"
" the detected lines and points lie close to each other.")
@click.option("-block", "--threshold_blocksize", default=15, cls=Mutex,
help="(with --mesh) For adaptive thresholding, size of a pixel"
" neighborhood that is used to calculate a threshold value for"
" the pixel: 3, 5, 7, and so on.")
@click.option("-const", "--threshold_constant", default=-2, cls=Mutex,
help="(with --mesh) For adaptive thresholding, constant subtracted"
" from the mean or weighted mean.\nNormally, it is positive but"
" may be zero or negative as well.")
@click.option("-I", "--iterations", default=0, cls=Mutex,
help="(with --mesh) Number of times for erosion/dilation is"
" applied.")
def lattice(*args, **kwargs):
pass
@click.option("-T", "--table_area", default=[], multiple=True,
help="Table areas (x1,y1,x2,y2) to process.\n"
" x1, y1 -> left-top and x2, y2 -> right-bottom")
@click.option("-C", "--columns", default=[], multiple=True, cls=Mutex,
help="x-coordinates of column separators.")
@click.option("-r", "--row_close_tol", default=2, cls=Mutex, help="Rows will be"
" formed by combining text vertically within this tolerance.")
@click.option("-c", "--col_close_tol", default=0, cls=Mutex, help="Columns will"
" be formed by combining text horizontally within this tolerance.")
def stream(*args, **kwargs):
pass

View File

@ -21,7 +21,7 @@ logger = setup_logging(__name__)
class Lattice(BaseParser): class Lattice(BaseParser):
"""Lattice method of parsing looks for lines between text """Lattice method of parsing looks for lines between text
to form a table. to generate table.
Parameters Parameters
---------- ----------
@ -83,7 +83,7 @@ class Lattice(BaseParser):
line_size_scaling=15, copy_text=None, shift_text=['l', 't'], line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, line_close_tol=2, split_text=False, flag_size=False, line_close_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, margins=(1.0, 0.5, 0.1)): iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
self.table_area = table_area self.table_area = table_area
self.process_background = process_background self.process_background = process_background
self.line_size_scaling = line_size_scaling self.line_size_scaling = line_size_scaling

View File

@ -16,7 +16,7 @@ logger = setup_logging(__name__)
class Stream(BaseParser): class Stream(BaseParser):
"""Stream method of parsing looks for spaces between text """Stream method of parsing looks for spaces between text
to form a table. to generate table.
If you want to specify columns when specifying multiple table If you want to specify columns when specifying multiple table
areas, make sure that the length of both lists are equal. areas, make sure that the length of both lists are equal.
@ -51,7 +51,7 @@ class Stream(BaseParser):
""" """
def __init__(self, table_area=None, columns=None, split_text=False, def __init__(self, table_area=None, columns=None, split_text=False,
flag_size=False, row_close_tol=2, col_close_tol=0, flag_size=False, row_close_tol=2, col_close_tol=0,
margins=(1.0, 0.5, 0.1)): margins=(1.0, 0.5, 0.1), **kwargs):
self.table_area = table_area self.table_area = table_area
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()