#!/usr/bin/env python2
from __future__ import print_function
import os
import sys
import time
import logging

from docopt import docopt
from PyPDF2 import PdfFileReader

from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream


# NOTE(review): the angle-bracket placeholders in the three usage strings
# below (<method>, <args>, <file> and the option-argument names such as
# <pageno>) were missing from the copy of this file under review and have
# been reconstructed so that docopt yields the keys the code reads.
# Confirm the exact placeholder names against the upstream script.
doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10  [default: 1]
 -f, --format <format>     Output format. (csv,tsv,html,json,xlsx)
                           [default: csv]
 -l, --log                 Print log to file.
 -V, --verbose             Verbose.
 -o, --output <directory>  Output directory.

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""

lattice_doc = """
Lattice method looks for lines between data to form a table.

usage:
 camelot lattice [options] [--] <file>

options:
 -F, --fill <fill>    Fill data in horizontal and/or vertical spanning
                      cells. Example: -F h, -F v, -F hv
 -s, --scale <scale>  Scaling factor. Large scaling factor leads to
                      smaller lines being detected. [default: 15]
 -j, --jtol <jtol>    Tolerance to account for when comparing joint
                      and line coordinates. [default: 2]
 -m, --mtol <mtol>    Tolerance to account for when merging lines
                      which are very close. [default: 2]
 -i, --invert         Invert pdf image to make sure that lines are
                      in foreground.
 -d, --debug <debug>  Debug by visualizing pdf geometry.
                      (contour,line,joint,table) Example: -d table
"""

stream_doc = """
Stream method looks for spaces between data to form a table.

usage:
 camelot stream [options] [--] <file>

options:
 -n, --ncols <ncols>      Number of columns. [default: 0]
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                          grouped together to form a word. [default: 2.0]
 -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                          grouped together to form a textbox.
                          [default: 0.5]
 -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                          if distance between words is greater than word
                          margin. [default: 0.1]
 -d, --debug              Debug by visualizing textboxes.
"""

# Output formats understood by write_to_disk().
_FORMATS = ('csv', 'tsv', 'html', 'json', 'xlsx')


def convert_to_html(table):
    """Render one table as an HTML ``<table>`` fragment.

    Parameters
    ----------
    table : iterable of iterable of str
        Rows of cells. Every cell must already be a string because the
        cells are joined directly into the markup.

    Returns
    -------
    str
        The markup, one ``<tr>`` per row and one ``<td>`` per cell,
        terminated by a newline.
    """
    # NOTE(review): the tag names were missing from the reviewed copy and
    # are reconstructed; the original may have carried extra attributes
    # (e.g. a border) -- confirm. Cell text is inserted verbatim, without
    # HTML escaping, as before.
    parts = ['<table>\n']
    for row in table:
        parts.append('  <tr>\n')
        for data in row:
            parts.extend(['    <td>', data, '</td>\n'])
        parts.append('  </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)


def write_to_disk(data, f='csv', output=None, filename=None):
    """Write extracted tables into the ``output`` directory.

    Parameters
    ----------
    data : mapping
        Maps a page key to that page's tables; ``data[page][i]`` must be
        a table (rows of cells) for every ``i`` in
        ``range(len(data[page]))``.
    f : str
        One of 'csv', 'tsv', 'html', 'json', 'xlsx'.
    output : str
        Directory the files are written to.
    filename : str
        Path of the source PDF; its base name (without extension) names
        the html/json/xlsx output file.

    Raises
    ------
    ValueError
        If ``output`` or ``filename`` is None, or ``f`` is not a
        supported format.
    """
    if filename is None or output is None:
        raise ValueError("both 'output' and 'filename' must be given")
    if f not in _FORMATS:
        raise ValueError('unsupported output format: {0!r}'.format(f))

    froot, __ = os.path.splitext(os.path.basename(filename))

    # Tables are addressed by position (data[page][index]) rather than by
    # iterating data[page] directly, because only positional access is
    # known to be supported by the per-page container.
    if f in ('csv', 'tsv'):
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page in sorted(data):
            for index in range(len(data[page])):
                dsvname = '{0}_table_{1}.{2}'.format(page, index + 1, f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    writer.writerows(data[page][index])
    elif f == 'html':
        htmlname = '{}.html'.format(froot)
        # Open once in write mode: appending per table made every rerun
        # pile new tables on top of the previous run's output.
        with open(os.path.join(output, htmlname), 'w') as htmlfile:
            for page in sorted(data):
                for index in range(len(data[page])):
                    htmlfile.write(convert_to_html(data[page][index]))
    elif f == 'json':
        import json
        jsonname = '{}.json'.format(froot)
        with open(os.path.join(output, jsonname), 'w') as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
        # Only the optional import is guarded, so an ImportError raised
        # while saving is not mistaken for a missing package.
        try:
            from pyexcel_xlsx import save_data
        except ImportError:
            print("xlsx output requires the 'pyexcel-xlsx' package "
                  "(pip install pyexcel-xlsx)")
            return
        from collections import OrderedDict
        xlsx_data = OrderedDict()
        for page in sorted(data):
            for index in range(len(data[page])):
                sheet_name = '{0}_table_{1}'.format(page, index + 1)
                xlsx_data[sheet_name] = list(data[page][index])
        save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)


def _parse_pages(spec, filename):
    """Turn a ``--pages`` value into a list of {'start', 'end'} dicts.

    ``spec`` is either the literal 'all' (every page of ``filename``) or
    a comma-separated list of page numbers and 'a-b' ranges.
    Raises ValueError when a page number is not an integer.
    """
    if spec == 'all':
        # Close the PDF as soon as the page count is known.
        with open(filename, 'rb') as pdf:
            total = PdfFileReader(pdf, strict=False).getNumPages()
        return [{'start': 1, 'end': total}]

    pages = []
    for part in spec.split(','):
        if '-' in part:
            first, last = part.split('-')
        else:
            first = last = part
        pages.append({'start': int(first), 'end': int(last)})
    return pages


def main():
    """Parse the command line, extract the tables and write them out."""
    start_time = time.time()

    args = docopt(doc, version='0.1', options_first=True)
    method = args['<method>']
    if method not in ('lattice', 'stream'):
        sys.exit("'{0}' is not a camelot method. "
                 "See 'camelot -h'.".format(method))
    if args['--format'] not in _FORMATS:
        sys.exit("'{0}' is not a supported output format. "
                 "See 'camelot -h'.".format(args['--format']))

    # Second pass: let the method-specific usage string parse the rest.
    argv = [method] + args['<args>']
    method_doc = lattice_doc if method == 'lattice' else stream_doc
    args.update(docopt(method_doc, argv=argv))

    vprint = print if args['--verbose'] else lambda *a, **k: None
    filename = args['<file>']
    filedir = os.path.dirname(filename)

    # NOTE(review): assumes logging is only configured when --log is
    # given (both original branches made the same basicConfig call, with
    # --output only relocating the log file) -- confirm.
    if args['--log']:
        logname = os.path.splitext(filename)[0] + '.log'
        if args['--output']:
            logname = os.path.join(
                args['--output'], os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', level=logging.DEBUG)

    try:
        pages = _parse_pages(args['--pages'], filename)
    except ValueError:
        sys.exit("invalid page list: '{0}'".format(args['--pages']))

    try:
        if method == 'lattice':
            extractor = Lattice(Pdf(filename, pagenos=pages, clean=True),
                                fill=args['--fill'],
                                scale=int(args['--scale']),
                                jtol=int(args['--jtol']),
                                mtol=int(args['--mtol']),
                                invert=args['--invert'],
                                debug=args['--debug'],
                                verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_geometry(args['--debug'])
        else:
            extractor = Stream(Pdf(filename, pagenos=pages,
                                   char_margin=float(args['--cmargin']),
                                   line_margin=float(args['--lmargin']),
                                   word_margin=float(args['--wmargin']),
                                   clean=True),
                               ncolumns=int(args['--ncols']),
                               columns=args['--columns'],
                               ytol=int(args['--ytol']),
                               debug=args['--debug'],
                               verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_text()
    except Exception as e:
        # Top-level boundary: record the traceback, tell the user, and
        # signal failure through a non-zero exit status.
        logging.exception('%s', e)
        print('camelot: extraction failed: {0}'.format(e), file=sys.stderr)
        sys.exit(1)

    if data is None:
        print("See 'camelot <method> -h' for various parameters "
              "you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'], output=output,
                      filename=filename)

    elapsed = time.time() - start_time
    vprint("finished in", elapsed, "seconds")
    logging.info("Time taken: %s seconds", elapsed)


if __name__ == '__main__':
    main()