# camelot-py/camelot/cli.py

# -*- coding: utf-8 -*-

import logging

import click
try:
    import matplotlib.pyplot as plt
except ImportError:
    _HAS_MPL = False
else:
    _HAS_MPL = True

from . import __version__, read_pdf, plot


# Logger named 'camelot'; its threshold is set to INFO at import time of
# this module.
logger = logging.getLogger('camelot')
logger.setLevel(logging.INFO)


class Config(object):
    """Plain key/value holder for options collected by the CLI group.

    The values live in the public ``config`` dict, which the subcommands
    read (and pop from) directly.
    """

    def __init__(self):
        # Start with no options recorded.
        self.config = dict()

    def set_config(self, key, value):
        """Record ``value`` under ``key``, replacing any earlier value."""
        self.config.update({key: value})


# Decorator for subcommand callbacks: hands the Config object stored on the
# Click context to the callback as its first positional argument.
pass_config = click.make_pass_decorator(Config)


@click.group()
@click.version_option(version=__version__)
# --quiet takes an explicit value (e.g. ``-q True``). It is parsed as a
# boolean so that ``-q False`` really yields False; without a type the raw
# string 'False' was passed on, and any non-empty string is truthy.
@click.option('-q', '--quiet', is_flag=False, type=click.BOOL,
              help='Suppress logs and warnings.')
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
              ' Example: 1,3,4 or 1,4-end.')
@click.option('-pw', '--password', help='Password for decryption.')
@click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format',
              type=click.Choice(['csv', 'json', 'excel', 'html']),
              help='Output file format.')
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
@click.option('-split', '--split_text', is_flag=True,
              help='Split text that spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
              ' font size. Useful to detect super/subscripts.')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
              help='PDFMiner char_margin, line_margin and word_margin.')
@click.pass_context
def cli(ctx, *args, **kwargs):
    """Camelot: PDF Table Extraction for Humans"""
    # The docstring above doubles as the group's --help text, so it is kept
    # to a single line; implementation notes live in comments instead.
    #
    # Every group-level option is copied onto a fresh Config stored in
    # ctx.obj. The 'lattice' and 'stream' subcommands receive that object
    # through the pass_config decorator and read the options from it.
    ctx.obj = Config()
    for key, value in kwargs.items():
        ctx.obj.set_config(key, value)


@cli.command('lattice')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
              help='Process background lines.')
@click.option('-scale', '--line_size_scaling', default=15,
              help='Line size scaling factor. The larger the value,'
              ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
              multiple=True, help='Direction in which text in a spanning cell'
              ' will be copied over.')
@click.option('-shift', '--shift_text', default=['l', 't'],
              type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
              help='Direction in which text in a spanning cell will flow.')
@click.option('-l', '--line_close_tol', default=2,
              help='Tolerance parameter used to merge close vertical'
              ' and horizontal lines.')
@click.option('-j', '--joint_close_tol', default=2,
              help='Tolerance parameter used to decide whether'
              ' the detected lines and points lie close to each other.')
@click.option('-block', '--threshold_blocksize', default=15,
              help='For adaptive thresholding, size of a pixel'
              ' neighborhood that is used to calculate a threshold value for'
              ' the pixel. Example: 3, 5, 7, and so on.')
@click.option('-const', '--threshold_constant', default=-2,
              help='For adaptive thresholding, constant subtracted'
              ' from the mean or weighted mean. Normally, it is positive but'
              ' may be zero or negative as well.')
@click.option('-I', '--iterations', default=0,
              help='Number of times for erosion/dilation will be applied.')
@click.option('-plot', '--plot_type',
              type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
              help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@pass_config
def lattice(c, *args, **kwargs):
    """Use lines between text to parse the table."""
    # Options consumed by this command itself are taken out of the shared
    # group configuration; whatever remains there is forwarded to read_pdf.
    group_opts = c.config
    page_spec = group_opts.pop('pages')
    out_path = group_opts.pop('output')
    out_format = group_opts.pop('format')
    make_archive = group_opts.pop('zip')
    silent = group_opts.pop('quiet')
    plot_kind = kwargs.pop('plot_type')
    pdf_path = kwargs.pop('filepath')
    kwargs.update(group_opts)

    # Options declared with multiple=True arrive as sequences; an empty one
    # is forwarded as None. shift_text is always forwarded as a list.
    kwargs['table_areas'] = list(kwargs['table_areas']) or None
    kwargs['copy_text'] = list(kwargs['copy_text']) or None
    kwargs['shift_text'] = list(kwargs['shift_text'])

    # Plotting needs matplotlib; exporting needs both a path and a format.
    plotting = plot_kind is not None
    if plotting:
        if not _HAS_MPL:
            raise ImportError('matplotlib is required for plotting.')
    elif out_path is None:
        raise click.UsageError('Please specify output file path using --output')
    elif out_format is None:
        raise click.UsageError('Please specify output file format using --format')

    tables = read_pdf(
        pdf_path,
        pages=page_spec,
        flavor='lattice',
        suppress_stdout=silent,
        **kwargs
    )
    click.echo('Found {} tables'.format(tables.n))

    if not plotting:
        tables.export(out_path, f=out_format, compress=make_archive)
        return

    # One figure per detected table, shown one after another.
    for table in tables:
        plot(table, kind=plot_kind)
        plt.show()


@cli.command('stream')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True,
              help='X coordinates of column separators.')
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
              ' used to combine text vertically, to generate rows.')
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
              ' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type',
              type=click.Choice(['text', 'grid', 'contour', 'textedge']),
              help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@pass_config
def stream(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    # Pull the options this command handles itself out of the shared group
    # configuration; the rest of that configuration goes to read_pdf.
    shared = c.config
    pages, destination, fmt, zipped, quiet = [
        shared.pop(name)
        for name in ('pages', 'output', 'format', 'zip', 'quiet')
    ]
    kind = kwargs.pop('plot_type')
    source_pdf = kwargs.pop('filepath')
    kwargs.update(shared)

    # Options declared with multiple=True arrive as sequences; an empty one
    # is forwarded as None.
    for name in ('table_areas', 'columns'):
        kwargs[name] = list(kwargs[name]) or None

    # Plotting needs matplotlib; exporting needs both a path and a format.
    want_plot = kind is not None
    if want_plot:
        if not _HAS_MPL:
            raise ImportError('matplotlib is required for plotting.')
    elif destination is None:
        raise click.UsageError('Please specify output file path using --output')
    elif fmt is None:
        raise click.UsageError('Please specify output file format using --format')

    tables = read_pdf(
        source_pdf,
        pages=pages,
        flavor='stream',
        suppress_stdout=quiet,
        **kwargs
    )
    click.echo('Found {} tables'.format(tables.n))

    if not want_plot:
        tables.export(destination, f=fmt, compress=zipped)
        return

    # One figure per detected table, shown one after another.
    for table in tables:
        plot(table, kind=kind)
        plt.show()


@cli.command('examples')
def examples(*arg, **kwargs):
    """Usage example"""
    # Canned interpreter session showing the Python API; it is printed
    # verbatim and none of it is executed.
    usage_text = """
    >>> import camelot
    >>> tables = camelot.read_pdf('foo.pdf')
    >>> tables
    <TableList n=1>
    >>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
    >>> tables[0]
    <Table shape=(7, 7)>
    >>> tables[0].parsing_report
    {
        'accuracy': 99.02,
        'whitespace': 12.24,
        'order': 1,
        'page': 1
    }
    >>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
    >>> tables[0].df # get a pandas DataFrame!

    |-------|-----------|---------------|--------------|-----------|------------|-----------|
    | Cycle | KI (1/km) | Distance (mi) | Percent      |           |            |           |
    |  Name |           |               | Fuel Savings |           |            |           |
    |-------|-----------|---------------|--------------|-----------|------------|-----------|
    |       |           |               |   Improved   | Decreased | Eliminate  | Decreased |
    |       |           |               |     Speed    |   Accel   |   Stops    |  Idle     |
    |-------|-----------|---------------|--------------|-----------|------------|-----------|
    | 2012_2|   3.30    |     1.3       |     5.9%     |    9.5%   |   29.2%    |  17.4%    |
    |-------|-----------|---------------|--------------|-----------|------------|-----------|
    | 2145_1|   0.68    |    11.2       |     2.4%     |    0.1%   |    9.5%    |   2.7%    |
    |-------|-----------|---------------|--------------|-----------|------------|-----------|
    | 4234_1|   0.59    |    58.7       |     8.5%     |    1.3%   |    8.5%    |   3.3%    |
    |-------|-----------|---------------|--------------|-----------|------------|-----------|
    | 2032_2|   0.17    |    57.8       |     21.7%    |    0.3%   |    2.7%    |   1.2%    |
    |-------|-----------|---------------|--------------|-----------|------------|-----------|
    | 4171_1|   0.07    |   173.9       |     58.1%    |    1.6%   |    2.1%    |   0.5%    |
    |-------|-----------|---------------|--------------|-----------|------------|-----------|

    """
    print(usage_text)