#!/usr/bin/env python2 import os import re import csv import sys import glob import time import shutil import logging import zipfile import tempfile from docopt import docopt from werkzeug.utils import secure_filename from PyPDF2 import PdfFileWriter, PdfFileReader from lattice import lattice from stream import stream doc = """ camelot parses tables from PDFs! usage: camelot.py [options] [...] options: -h, --help Show this screen. -v, --version Show version. -p, --pages Comma-separated list of page numbers. Example: -p 1,3-6,10 [default: 1] -f, --format Output format. (csv,xlsx) [default: csv] -l, --log Print log to file. -o, --output Output directory. camelot methods: lattice Looks for lines between data. stream Looks for spaces between data. See 'camelot -h' for more information on a specific method. """ lattice_doc = """ Lattice method looks for lines between data to form a table. usage: camelot.py lattice [options] [--] options: -F, --fill Fill data in horizontal and/or vertical spanning cells. Example: -F h, -F v, -F hv -s, --scale Scaling factor. Large scaling factor leads to smaller lines being detected. [default: 15] -j, --jtol Tolerance to account for when comparing joint and line coordinates. [default: 2] -m, --mtol Tolerance to account for when merging lines which are very close. [default: 2] -i, --invert Invert pdf image to make sure that lines are in foreground. -d, --debug Debug by visualizing pdf geometry. (contour,line,joint,table) Example: -d table """ stream_doc = """ Stream method looks for spaces between data to form a table. usage: camelot.py stream [options] [--] options: -n, --ncols Number of columns. [default: 0] -c, --columns Comma-separated list of column x-coordinates. Example: -c 10.1,20.2,30.3 -M, --cmargin Char margin. Chars closer than cmargin are grouped together to form a word. [default: 2.0] -L, --lmargin Line margin. Lines closer than lmargin are grouped together to form a textbox. [default: 0.5] -W, --wmargin Word margin. Insert blank spaces between chars if distance between words is greater than word margin. [default: 0.1] -d, --debug Debug by visualizing textboxes. """ pno = re.compile(r'\d+') def filesort(filepath): filename = os.path.basename(filepath) num = pno.findall(filename) if len(num) == 2: return (int(num[0]), int(num[1])) else: return (int(num[0]), 0) if __name__ == '__main__': start_time = time.time() tmpdir = tempfile.mkdtemp() args = docopt(doc, version='0.1', options_first=True) argv = [args['']] + args[''] if args[''] == 'lattice': args.update(docopt(lattice_doc, argv=argv)) elif args[''] == 'stream': args.update(docopt(stream_doc, argv=argv)) if args['--pages']: if args['--pages'] == ['all']: p = args['--pages'] else: p = [] for r in args['--pages'].split(','): if '-' in r: a, b = r.split('-') a, b = int(a), int(b) p.extend([str(i) for i in range(a, b + 1)]) else: p.extend([str(r)]) else: p = ['1'] p = sorted(set(p)) fname = os.path.basename(args['']) fname = secure_filename(fname) fdir = os.path.dirname(args['']) froot, fext = os.path.splitext(fname) if fext.lower() != '.pdf': print "camelot can parse only pdfs right now" shutil.rmtree(tmpdir) sys.exit() logfname = os.path.join(tmpdir, froot + '.log') logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG) shutil.copy(args[''], os.path.join(tmpdir, fname)) print "separating pdf into pages" print if p == ['all']: infile = PdfFileReader(open(os.path.join(tmpdir, fname), 'rb')) for i in range(infile.getNumPages()): p = infile.getPage(i) outfile = PdfFileWriter() outfile.addPage(p) with open(os.path.join(tmpdir, 'pg-%d.pdf' % (i + 1)), 'wb') as f: outfile.write(f) else: for page in p: infile = PdfFileReader(open(os.path.join(tmpdir, fname), 'rb')) p = infile.getPage(int(page) - 1) outfile = PdfFileWriter() outfile.addPage(p) with open(os.path.join(tmpdir, 'pg-%s.pdf' % page), 'wb') as f: outfile.write(f) glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf'))) if args[''] == 'lattice': print "using the lattice method" for g in glob_pdf: g_fname = os.path.basename(g) print "working on", g_fname g_froot, __ = os.path.splitext(g) try: data = lattice(g, f=args['--fill'], s=int(args['--scale']), jtol=int(args['--jtol']), mtol=int(args['--mtol']), invert=args['--invert'], debug=args['--debug']) if data is None: print continue for k in sorted(data.keys()): csvfile = g_froot + '_%s.csv' % k with open(csvfile, 'w') as outfile: writer = csv.writer(outfile) for d in data[k]: writer.writerow([c.encode('utf-8') for c in d]) print "saved as", os.path.basename(csvfile) print except Exception: logging.exception("") print "couldn't parse", g_fname, "see log for more info" print elif args[''] == 'stream': print "using the stream method" for g in glob_pdf: g_fname = os.path.basename(g) print "working on", g_fname g_froot, __ = os.path.splitext(g) try: data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'], char_margin=float(args['--cmargin']), line_margin=float(args['--lmargin']), word_margin=float(args['--wmargin']), debug=args['--debug']) if data is None: print continue csvfile = g_froot + '.csv' with open(csvfile, 'w') as outfile: writer = csv.writer(outfile) for d in data: writer.writerow([c.encode('utf-8') for c in d]) print "saved as", os.path.basename(csvfile) print except Exception: logging.exception("") print "couldn't parse", g_fname, "see log for more info" print if args['--log']: if args['--output']: shutil.copy(logfname, args['--output']) else: shutil.copy(logfname, fdir) if args['--debug'] not in [None, False]: print "See 'camelot -h' for various parameters you can tweak." shutil.rmtree(tmpdir) sys.exit() glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort) if args['--format'] == 'csv': if len(glob_csv) == 1: if args['--output']: shutil.copy(glob_csv[0], args['--output']) else: shutil.copy(glob_csv[0], fdir) else: zipname = froot + '.zip' zippath = os.path.join(tmpdir, zipname) print "zipping 'em up" with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip: for g in glob_csv: myzip.write(g, os.path.join(froot, os.path.basename(g))) if args['--output']: shutil.copy(zippath, args['--output']) else: shutil.copy(zippath, fdir) print elif args['--format'] == 'xlsx': from pyexcel_xlsx import save_data from collections import OrderedDict data = OrderedDict() for c in glob_csv: c_fname = os.path.basename(c) c_froot, __ = os.path.splitext(c) print "adding", c_fname, "to excel file" with open(c, 'r') as csvfile: reader = csv.reader(csvfile) c_froot, __ = os.path.splitext(c_fname) data.update({c_froot: [row for row in reader]}) xlsxname = froot + '.xlsx' xlsxpath = os.path.join(tmpdir, xlsxname) save_data(xlsxpath, data) if args['--output']: shutil.copy(xlsxpath, args['--output']) else: shutil.copy(xlsxpath, fdir) print print "saved as", xlsxname print "cleaning up..." shutil.rmtree(tmpdir) print "finished in", time.time() - start_time, "seconds" logging.info("Time taken for " + fname + ": " + str(time.time() - start_time) + " seconds")