#!/usr/bin/env python2
# Command-line front end: splits a PDF into single-page PDFs, runs the
# selected table-extraction method (lattice or stream) on every page and
# delivers the result as csv, zip-of-csv or xlsx.
#
# NOTE: print is always called with ONE parenthesised string, so the output
# is the same whether it is parsed as a Python 2 statement or as a function.
import os
import re
import csv
import sys
import glob
import time
import shutil
import logging
import zipfile
import tempfile
import subprocess

from docopt import docopt
from werkzeug.utils import secure_filename

from lattice import lattice
from stream import stream


# docopt specifications.  Every option line must start on its own line with
# a dash, and option arguments / positionals must be spelled as <name>;
# the keys used in main() are taken verbatim from these strings.
doc = """
camelot parses tables from PDFs!

usage:
 camelot.py [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10 [default: 1]
 -f, --format <format>     Output format. (csv,xlsx) [default: csv]
 -l, --log                 Print log to file.
 -o, --output <directory>  Output directory.

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""

lattice_doc = """
Lattice method looks for lines between data to form a table.

usage:
 camelot.py lattice [options] [--] <file>

options:
 -F, --fill <fill>      Fill data in horizontal and/or vertical spanning
                        cells. Example: -F h, -F v, -F hv
 -s, --scale <scale>    Scaling factor. Large scaling factor leads to
                        smaller lines being detected. [default: 15]
 -j, --jtol <jtol>      Tolerance to account for when comparing joint
                        and line coordinates. [default: 2]
 -m, --mtol <mtol>      Tolerance to account for when merging lines
                        which are very close. [default: 2]
 -i, --invert           Invert pdf image to make sure that lines are
                        in foreground.
 -d, --debug <debug>    Debug by visualizing pdf geometry.
                        (contour,line,joint,table) Example: -d table
"""

stream_doc = """
Stream method looks for spaces between data to form a table.

usage:
 camelot.py stream [options] [--] <file>

options:
 -n, --ncols <ncols>      Number of columns. [default: 0]
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                          grouped together to form a word. [default: 2.0]
 -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                          grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                          if distance between words is greater than word
                          margin. [default: 0.1]
 -d, --debug              Debug by visualizing textboxes.
"""

# Matches each run of digits in a file name (page number, table number).
pno = re.compile(r'\d+')


def filesort(filepath):
    """Return a numeric sort key for an intermediate file path.

    The key is built from the digit runs in the base name: exactly two
    runs give ``(first, second)``; any other count gives ``(first, 0)``.
    Raises IndexError when the base name contains no digits at all.
    """
    nums = [int(n) for n in pno.findall(os.path.basename(filepath))]
    if len(nums) == 2:
        return (nums[0], nums[1])
    return (nums[0], 0)


def _parse_pages(spec):
    """Expand a --pages value into a sorted list of unique page strings.

    ``spec`` is the raw option string, e.g. ``'1,3-6,10'`` or ``'all'``.
    An empty/None value selects page 1; ``'all'`` is returned as
    ``['all']`` so the caller can split the whole document in one go.
    """
    if not spec:
        return ['1']
    # docopt hands the option over as a plain string, so compare against
    # the string 'all' rather than a list.
    if spec == 'all':
        return ['all']
    pages = []
    for part in spec.split(','):
        if '-' in part:
            first, last = part.split('-')
            pages.extend(str(i) for i in range(int(first), int(last) + 1))
        else:
            pages.append(str(part))
    return sorted(set(pages))


def _write_csv(path, rows):
    """Write ``rows`` (iterables of text cells) to ``path`` as UTF-8 csv."""
    with open(path, 'w') as outfile:
        writer = csv.writer(outfile)
        for row in rows:
            writer.writerow([cell.encode('utf-8') for cell in row])
    print("saved as %s" % os.path.basename(path))
    print("")


def _run_lattice(pdf_pages, args):
    """Run the lattice method on each page; one csv per returned table key."""
    print("using the lattice method")
    for g in pdf_pages:
        g_fname = os.path.basename(g)
        g_froot, __ = os.path.splitext(g)
        print("converting %s to image" % g_fname)
        # Argument list instead of a joined shell string: no shell is
        # involved, so paths containing spaces or metacharacters are safe.
        subprocess.call(['convert', '-density', '300', g,
                         '-depth', '8', g_froot + '.png'])
        try:
            data = lattice(g, f=args['--fill'], s=int(args['--scale']),
                           jtol=int(args['--jtol']),
                           mtol=int(args['--mtol']),
                           invert=args['--invert'], debug=args['--debug'])
            if data is None:
                print("")
                continue
            for k in sorted(data.keys()):
                _write_csv(g_froot + '_%s.csv' % k, data[k])
        except Exception:
            # Best effort per page: record the traceback, move on.
            logging.exception("")
            print("couldn't parse %s see log for more info" % g_fname)
            print("")


def _run_stream(pdf_pages, args):
    """Run the stream method on each page; one csv per page."""
    print("using the stream method")
    for g in pdf_pages:
        g_fname = os.path.basename(g)
        g_froot, __ = os.path.splitext(g)
        try:
            data = stream(g, ncolumns=int(args['--ncols']),
                          columns=args['--columns'],
                          char_margin=float(args['--cmargin']),
                          line_margin=float(args['--lmargin']),
                          word_margin=float(args['--wmargin']),
                          debug=args['--debug'])
            if data is None:
                print("")
                continue
            _write_csv(g_froot + '.csv', data)
        except Exception:
            # Best effort per page: record the traceback, move on.
            logging.exception("")
            print("couldn't parse %s see log for more info" % g_fname)
            print("")


def _export_csv(csv_paths, froot, tmpdir, outdir):
    """Copy a single csv to ``outdir``; otherwise zip all of them first."""
    if len(csv_paths) == 1:
        shutil.copy(csv_paths[0], outdir)
        return
    zippath = os.path.join(tmpdir, froot + '.zip')
    print("zipping 'em up")
    with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
        for path in csv_paths:
            myzip.write(path, os.path.join(froot, os.path.basename(path)))
    shutil.copy(zippath, outdir)
    print("")


def _export_xlsx(csv_paths, froot, tmpdir, outdir):
    """Collect every csv into one workbook (a sheet per csv) in ``outdir``."""
    # Imported here so pyexcel_xlsx is only required for xlsx output.
    from pyexcel_xlsx import save_data
    from collections import OrderedDict
    sheets = OrderedDict()
    for path in csv_paths:
        c_fname = os.path.basename(path)
        print("adding %s to excel file" % c_fname)
        c_froot, __ = os.path.splitext(c_fname)
        with open(path, 'r') as csvfile:
            sheets[c_froot] = list(csv.reader(csvfile))
    xlsxname = froot + '.xlsx'
    xlsxpath = os.path.join(tmpdir, xlsxname)
    save_data(xlsxpath, sheets)
    shutil.copy(xlsxpath, outdir)
    print("")
    print("saved as %s" % xlsxname)


def main():
    """Parse the command line, extract tables and deliver the output.

    All intermediate files live in a temporary directory which is removed
    on every exit path, including docopt's own exits (--help, --version,
    usage errors) and unexpected exceptions.
    """
    start_time = time.time()
    tmpdir = tempfile.mkdtemp()
    try:
        # options_first stops global option parsing at <method>; the rest
        # of the command line is re-parsed with the method's own usage.
        args = docopt(doc, version='0.1', options_first=True)
        method = args['<method>']
        argv = [method] + args['<args>']
        if method == 'lattice':
            args.update(docopt(lattice_doc, argv=argv))
        elif method == 'stream':
            args.update(docopt(stream_doc, argv=argv))
        else:
            sys.exit("unknown method '%s', see 'camelot.py -h'" % method)

        pages = _parse_pages(args['--pages'])

        fname = secure_filename(os.path.basename(args['<file>']))
        # dirname() is '' for a bare file name, which shutil.copy cannot
        # use as a destination; fall back to the current directory.
        fdir = os.path.dirname(args['<file>']) or os.curdir
        outdir = args['--output'] or fdir
        froot, fext = os.path.splitext(fname)
        if fext.lower() != '.pdf':
            print("camelot can parse only pdfs right now")
            sys.exit()

        logfname = os.path.join(tmpdir, froot + '.log')
        logging.basicConfig(filename=logfname, filemode='w',
                            level=logging.DEBUG)

        pdf_copy = os.path.join(tmpdir, fname)
        shutil.copy(args['<file>'], pdf_copy)
        print("separating pdf into pages")
        print("")
        if pages == ['all']:
            # '%d' is passed through literally for pdfseparate to expand.
            subprocess.call(['pdfseparate', pdf_copy,
                             os.path.join(tmpdir, 'pg-%d.pdf')])
        else:
            for page in pages:
                subprocess.call(['pdfseparate', '-f', page, '-l', page,
                                 pdf_copy,
                                 os.path.join(tmpdir, 'pg-%s.pdf' % page)])

        glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))
        if method == 'lattice':
            _run_lattice(glob_pdf, args)
        else:
            _run_stream(glob_pdf, args)

        if args['--log']:
            shutil.copy(logfname, outdir)

        # --debug is None (lattice, not given) or False (stream, not
        # given) when unused; anything else means a debug run, which
        # produces no output files.
        if args['--debug'] not in [None, False]:
            print("See 'camelot <method> -h' for various parameters "
                  "you can tweak.")
            sys.exit()

        glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')),
                          key=filesort)
        if args['--format'] == 'csv':
            _export_csv(glob_csv, froot, tmpdir, outdir)
        elif args['--format'] == 'xlsx':
            _export_xlsx(glob_csv, froot, tmpdir, outdir)

        print("cleaning up...")
    finally:
        # ignore_errors so a cleanup problem never masks the real error.
        shutil.rmtree(tmpdir, ignore_errors=True)

    print("finished in %s seconds" % (time.time() - start_time))
    # NOTE(review): the log file lived in tmpdir and any --log copy was
    # made earlier, so this record does not reach the copied log.
    logging.info("Time taken for " + fname + ": " +
                 str(time.time() - start_time) + " seconds")


if __name__ == '__main__':
    main()