camelot-py/camelot.py

#!/usr/bin/env python2
import os
import re
import csv
import sys
import glob
import time
import shutil
import logging
import zipfile
import tempfile
import subprocess
from docopt import docopt
from werkzeug.utils import secure_filename

from lattice import lattice
from stream import stream


# Usage text for the top-level command line. The __main__ block passes it to
# docopt with options_first=True, so everything after <method> is collected
# in <args> and re-parsed against the method-specific usage text below.
# NOTE: this string is parsed by docopt at runtime; editing it changes the
# accepted options and their defaults.
doc = """
camelot parses tables from PDFs!

usage:
 camelot.py [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10  [default: 1]
 -f, --format <format>     Output format. (csv,xlsx) [default: csv]
 -l, --log                 Print log to file.
 -o, --output <directory>  Output directory.

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""

# Usage text for the `lattice` method; the __main__ block parses it with
# docopt only when <method> is 'lattice'. The __main__ block converts
# --scale, --jtol and --mtol with int() before calling lattice().
# Here -d/--debug takes a value (unlike the stream method, where it is a flag).
lattice_doc = """
Lattice method looks for lines between data to form a table.

usage:
 camelot.py lattice [options] [--] <file>

options:
 -F, --fill <fill>      Fill data in horizontal and/or vertical spanning
                        cells. Example: -F h, -F v, -F hv
 -s, --scale <scale>    Scaling factor. Large scaling factor leads to
                        smaller lines being detected. [default: 15]
 -j, --jtol <jtol>      Tolerance to account for when comparing joint
                        and line coordinates. [default: 2]
 -m, --mtol <mtol>      Tolerance to account for when merging lines
                        which are very close. [default: 2]
 -i, --invert           Invert pdf image to make sure that lines are
                        in foreground.
 -d, --debug <debug>    Debug by visualizing pdf geometry.
                        (contour,line,joint,table) Example: -d table
"""

# Usage text for the `stream` method; the __main__ block parses it with
# docopt only when <method> is 'stream'. The __main__ block converts --ncols
# with int() and --cmargin, --lmargin, --wmargin with float() before calling
# stream(). Here -d/--debug is a plain flag.
stream_doc = """
Stream method looks for spaces between data to form a table.

usage:
 camelot.py stream [options] [--] <file>

options:
 -n, --ncols <ncols>      Number of columns. [default: 0]
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                          grouped together to form a word. [default: 2.0]
 -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                          grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                          if distance between words is greater than word
                          margin. [default: 0.1]
 -d, --debug              Debug by visualizing textboxes.
"""

# Matches every run of decimal digits in a file name.
pno = re.compile(r'\d+')


def filesort(filepath):
    """Return a numeric sort key for an output file path.

    Only the base name is inspected, so digits in directory names are
    ignored. The script names its csv files 'pg-<page>.csv' (stream) or
    'pg-<page>_<key>.csv' (lattice); sorting on the integers instead of the
    raw strings puts 'pg-2' before 'pg-10'.

    Parameters
    ----------
    filepath : str
        Path (or bare name) of the file.

    Returns
    -------
    tuple of (int, int)
        (first number, second number) found in the base name. The second
        element is 0 when the name holds a single number, and the key is
        (0, 0) when the name holds no digits at all, so such files sort
        first instead of raising IndexError.
    """
    filename = os.path.basename(filepath)
    nums = [int(n) for n in pno.findall(filename)]
    if not nums:
        # No digits in the name: return a valid, comparable key.
        return (0, 0)
    if len(nums) == 1:
        return (nums[0], 0)
    # Two or more numbers: the first two are used, any further ones are
    # ignored rather than discarding the second one.
    return (nums[0], nums[1])


if __name__ == '__main__':
    start_time = time.time()
    tmpdir = tempfile.mkdtemp()

    args = docopt(doc, version='0.1', options_first=True)
    argv = [args['<method>']] + args['<args>']
    if args['<method>'] == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif args['<method>'] == 'stream':
        args.update(docopt(stream_doc, argv=argv))

    if args['--pages']:
        if args['--pages'] == ['all']:
            p = args['--pages']
        else:
            p = []
            for r in args['--pages'].split(','):
                if '-' in r:
                    a, b = r.split('-')
                    a, b = int(a), int(b)
                    p.extend([str(i) for i in range(a, b + 1)])
                else:
                    p.extend([str(r)])
    else:
        p = ['1']
    p = sorted(set(p))

    fname = os.path.basename(args['<file>'])
    fname = secure_filename(fname)
    fdir = os.path.dirname(args['<file>'])
    froot, fext = os.path.splitext(fname)
    if fext.lower() != '.pdf':
        print "camelot can parse only pdfs right now"
        sys.exit()

    logfname = os.path.join(tmpdir, froot + '.log')
    logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG)

    shutil.copy(args['<file>'], os.path.join(tmpdir, fname))
    print "separating pdf into pages"
    print
    if p == ['all']:
        subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir,
                        'pg-%d.pdf')])
    else:
        for page in p:
            subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname),
                            os.path.join(tmpdir, 'pg-%s.pdf' % page)])

    glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))
    if args['<method>'] == 'lattice':
        print "using the lattice method"
        for g in glob_pdf:
            g_fname = os.path.basename(g)
            g_froot, __ = os.path.splitext(g)
            print "converting %s to image" % g_fname
            os.system(' '.join(['convert', '-density', '300',
                               g, '-depth', '8', g_froot + '.png']))
            try:
                data = lattice(g, f=args['--fill'], s=int(args['--scale']),
                               jtol=int(args['--jtol']), mtol=int(args['--mtol']),
                               invert=args['--invert'], debug=args['--debug'])
                if data is None:
                    print
                    print "See 'camelot lattice -h' for various parameters you can tweak."
                    sys.exit()
                for k in sorted(data.keys()):
                    csvfile = g_froot + '_%s.csv' % k
                    with open(csvfile, 'w') as outfile:
                        writer = csv.writer(outfile)
                        for d in data[k]:
                            writer.writerow([c.encode('utf-8') for c in d])
                        print "saved as", os.path.basename(csvfile)
                print
            except Exception:
                logging.exception("")
                print "couldn't parse", g_fname, "see log for more info"
                print
    elif args['<method>'] == 'stream':
        print "using the stream method"
        for g in glob_pdf:
            g_fname = os.path.basename(g)
            g_froot, __ = os.path.splitext(g)
            try:
                data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'],
                              char_margin=float(args['--cmargin']),
                              line_margin=float(args['--lmargin']),
                              word_margin=float(args['--wmargin']),
                              debug=args['--debug'])
                if data is None:
                    print
                    print "See 'camelot stream -h' for various parameters you can tweak."
                    sys.exit()
                csvfile = g_froot + '.csv'
                with open(csvfile, 'w') as outfile:
                    writer = csv.writer(outfile)
                    for d in data:
                        writer.writerow([c.encode('utf-8') for c in d])
                    print "saved as", os.path.basename(csvfile)
                    print
            except Exception:
                logging.exception("")
                print "couldn't parse", g_fname, "see log for more info"
                print

    glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort)
    if args['--format'] == 'csv':
        if len(glob_csv) == 1:
            if args['--output']:
                shutil.copy(glob_csv[0], args['--output'])
                if args['--log']:
                    shutil.copy(logfname, args['--output'])
            else:
                shutil.copy(glob_csv[0], fdir)
                if args['--log']:
                    shutil.copy(zippath, fdir)
        else:
            zipname = froot + '.zip'
            zippath = os.path.join(tmpdir, zipname)
            print "zipping 'em up"
            with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
                for g in glob_csv:
                    myzip.write(g, os.path.join(froot, os.path.basename(g)))
            if args['--output']:
                shutil.copy(zippath, args['--output'])
                if args['--log']:
                    shutil.copy(logfname, args['--output'])
            else:
                shutil.copy(zippath, fdir)
                if args['--log']:
                    shutil.copy(zippath, fdir)
            print
    elif args['--format'] == 'xlsx':
        from pyexcel_xlsx import save_data
        from collections import OrderedDict
        data = OrderedDict()
        for c in glob_csv:
            c_fname = os.path.basename(c)
            c_froot, __ = os.path.splitext(c)
            print "adding", c_fname, "to excel file"
            with open(c, 'r') as csvfile:
                reader = csv.reader(csvfile)
                c_froot, __ = os.path.splitext(c_fname)
                data.update({c_froot: [row for row in reader]})
        xlsxname = froot + '.xlsx'
        xlsxpath = os.path.join(tmpdir, xlsxname)
        save_data(xlsxpath, data)
        if args['--output']:
            shutil.copy(xlsxpath, args['--output'])
            if args['--log']:
                shutil.copy(logfname, args['--output'])
        else:
            shutil.copy(xlsxpath, fdir)
            if args['--log']:
                shutil.copy(zippath, fdir)
        print
        print "saved as", xlsxname

    print "cleaning up..."
    shutil.rmtree(tmpdir)

    print "finished in", time.time() - start_time, "seconds"
    logging.info("Time taken for " + fname + ": " +
                 str(time.time() - start_time) + " seconds")