# -*- coding: utf-8 -*-
"""Click-based command-line interface.

Defines the ``cli`` group, which collects the options shared by every
sub-command, and the ``lattice``, ``stream`` and ``examples`` sub-commands.
"""

import logging

import click
try:
    import matplotlib.pyplot as plt
except ImportError:
    _HAS_MPL = False
else:
    _HAS_MPL = True

from . import __version__, read_pdf, plot


logger = logging.getLogger('camelot')
logger.setLevel(logging.INFO)


class Config(object):
    """Holder for the group-level option values.

    ``cli`` stores one instance on the click context; sub-commands receive
    it through the ``pass_config`` decorator and read ``config`` directly.
    """

    def __init__(self):
        # Maps option name -> parsed value.
        self.config = {}

    def set_config(self, key, value):
        """Store ``value`` under ``key`` in ``config``."""
        self.config[key] = value


pass_config = click.make_pass_decorator(Config)


def _run_flavor(flavor, conf, kwargs, optional_list_keys):
    """Run ``read_pdf`` for one flavor, then plot or export the result.

    Shared implementation of the ``lattice`` and ``stream`` commands.

    Parameters
    ----------
    flavor : str
        Value forwarded to ``read_pdf`` as ``flavor``.
    conf : dict
        Group-level options collected by ``cli``. The keys ``pages``,
        ``output``, ``format``, ``zip`` and ``quiet`` are popped from it (the
        dict is mutated); whatever remains is forwarded to ``read_pdf``.
    kwargs : dict
        Options of the sub-command itself. ``plot_type`` and ``filepath`` are
        popped from it; the rest is forwarded to ``read_pdf``.
    optional_list_keys : iterable of str
        Keys of ``kwargs`` holding ``multiple=True`` options. Each value is
        converted to a list, or to ``None`` when no value was supplied.

    Raises
    ------
    ImportError
        If plotting was requested but matplotlib could not be imported.
    click.UsageError
        If no plot was requested and ``--output`` or ``--format`` is missing.
    """
    # Options handled here rather than passed through to read_pdf.
    pages = conf.pop('pages')
    output = conf.pop('output')
    f = conf.pop('format')
    compress = conf.pop('zip')
    quiet = conf.pop('quiet')
    plot_type = kwargs.pop('plot_type')
    filepath = kwargs.pop('filepath')

    # Everything still left in the group-level config goes to read_pdf
    # together with the sub-command's own options.
    kwargs.update(conf)
    for key in optional_list_keys:
        # click yields a tuple for multiple=True options; an empty one
        # becomes None.
        kwargs[key] = list(kwargs[key]) or None

    # Validate before the extraction is started.
    if plot_type is not None:
        if not _HAS_MPL:
            raise ImportError('matplotlib is required for plotting.')
    else:
        if output is None:
            raise click.UsageError(
                'Please specify output file path using --output')
        if f is None:
            raise click.UsageError(
                'Please specify output file format using --format')

    tables = read_pdf(filepath, pages=pages, flavor=flavor,
                      suppress_stdout=quiet, **kwargs)
    click.echo('Found {} tables'.format(tables.n))
    if plot_type is not None:
        for table in tables:
            plot(table, kind=plot_type)
            plt.show()
    else:
        tables.export(output, f=f, compress=compress)


# NOTE: the docstrings of the click commands below are printed verbatim by
# ``--help``, so they are kept short and explanations live in comments.
#
# ``--quiet`` is a boolean switch. It used to be declared with
# ``is_flag=False``, which made it require a value: ``camelot -q lattice ...``
# consumed the sub-command name as that value, and any string (even "False")
# was passed on as a truthy ``suppress_stdout``.
@click.group()
@click.version_option(version=__version__)
@click.option('-q', '--quiet', is_flag=True,
              help='Suppress logs and warnings.')
@click.option('-p', '--pages', default='1',
              help='Comma-separated page numbers.'
              ' Example: 1,3,4 or 1,4-end.')
@click.option('-pw', '--password', help='Password for decryption.')
@click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format',
              type=click.Choice(['csv', 'json', 'excel', 'html']),
              help='Output file format.')
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
@click.option('-split', '--split_text', is_flag=True,
              help='Split text that spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True,
              help='Flag text based on'
              ' font size. Useful to detect super/subscripts.')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
              help='PDFMiner char_margin, line_margin and word_margin.')
@click.pass_context
def cli(ctx, *args, **kwargs):
    """Camelot: PDF Table Extraction for Humans"""
    # Stash every group-level option on the context object so that the
    # sub-commands can pick them up through ``pass_config``.
    ctx.obj = Config()
    for key, value in kwargs.items():
        ctx.obj.set_config(key, value)


@cli.command('lattice')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
              help='Process background lines.')
@click.option('-scale', '--line_size_scaling', default=15,
              help='Line size scaling factor. The larger the value,'
              ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[],
              type=click.Choice(['h', 'v']), multiple=True,
              help='Direction in which text in a spanning cell'
              ' will be copied over.')
@click.option('-shift', '--shift_text', default=['l', 't'],
              type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
              help='Direction in which text in a spanning cell will flow.')
@click.option('-l', '--line_close_tol', default=2,
              help='Tolerance parameter used to merge close vertical'
              ' and horizontal lines.')
@click.option('-j', '--joint_close_tol', default=2,
              help='Tolerance parameter used to decide whether'
              ' the detected lines and points lie close to each other.')
@click.option('-block', '--threshold_blocksize', default=15,
              help='For adaptive thresholding, size of a pixel'
              ' neighborhood that is used to calculate a threshold value for'
              ' the pixel. Example: 3, 5, 7, and so on.')
@click.option('-const', '--threshold_constant', default=-2,
              help='For adaptive thresholding, constant subtracted'
              ' from the mean or weighted mean. Normally, it is positive but'
              ' may be zero or negative as well.')
@click.option('-I', '--iterations', default=0,
              help='Number of times for erosion/dilation will be applied.')
@click.option('-plot', '--plot_type',
              type=click.Choice(['text', 'grid', 'contour', 'joint', 'line']),
              help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@pass_config
def lattice(c, *args, **kwargs):
    """Use lines between text to parse the table."""
    # shift_text is always forwarded as a list, even when empty.
    kwargs['shift_text'] = list(kwargs['shift_text'])
    _run_flavor('lattice', c.config, kwargs, ('table_areas', 'copy_text'))


@cli.command('stream')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True,
              help='X coordinates of column separators.')
@click.option('-r', '--row_close_tol', default=2,
              help='Tolerance parameter'
              ' used to combine text vertically, to generate rows.')
@click.option('-c', '--col_close_tol', default=0,
              help='Tolerance parameter'
              ' used to combine text horizontally, to generate columns.')
@click.option('-plot', '--plot_type',
              type=click.Choice(['text', 'grid', 'contour', 'textedge']),
              help='Plot elements found on PDF page for visual debugging.')
@click.argument('filepath', type=click.Path(exists=True))
@pass_config
def stream(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    _run_flavor('stream', c.config, kwargs, ('table_areas', 'columns'))


@cli.command('examples')
def examples(*arg, **kwargs):
    """Usage example"""
    # NOTE(review): the REPL echo lines that would normally follow
    # ``>>> tables`` and ``>>> tables[0]`` are absent from the text below;
    # confirm against the project README whether they should be restored.
    sample = """
>>> import camelot
>>> tables = camelot.read_pdf('foo.pdf')
>>> tables
>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
>>> tables[0]
>>> tables[0].parsing_report
{
    'accuracy': 99.02,
    'whitespace': 12.24,
    'order': 1,
    'page': 1
}
>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
>>> tables[0].df # get a pandas DataFrame!
|-------|-----------|---------------|--------------|-----------|------------|-----------|
| Cycle | KI (1/km) | Distance (mi) | Percent      |           |            |           |
| Name  |           |               | Fuel Savings |           |            |           |
|-------|-----------|---------------|--------------|-----------|------------|-----------|
|       |           |               | Improved     | Decreased | Eliminate  | Decreased |
|       |           |               | Speed        | Accel     | Stops      | Idle      |
|-------|-----------|---------------|--------------|-----------|------------|-----------|
| 2012_2| 3.30      | 1.3           | 5.9%         | 9.5%      | 29.2%      | 17.4%     |
|-------|-----------|---------------|--------------|-----------|------------|-----------|
| 2145_1| 0.68      | 11.2          | 2.4%         | 0.1%      | 9.5%       | 2.7%      |
|-------|-----------|---------------|--------------|-----------|------------|-----------|
| 4234_1| 0.59      | 58.7          | 8.5%         | 1.3%      | 8.5%       | 3.3%      |
|-------|-----------|---------------|--------------|-----------|------------|-----------|
| 2032_2| 0.17      | 57.8          | 21.7%        | 0.3%      | 2.7%       | 1.2%      |
|-------|-----------|---------------|--------------|-----------|------------|-----------|
| 4171_1| 0.07      | 173.9         | 58.1%        | 1.6%      | 2.1%       | 0.5%      |
|-------|-----------|---------------|--------------|-----------|------------|-----------|
"""
    print(sample)