camelot-py/tools/camelot

#!/usr/bin/env python2
import os
import sys
import time
import logging

from docopt import docopt
from PyPDF2 import PdfFileReader

from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream


# Top-level usage text. The script entry point hands this string to docopt
# (with options_first=True) to parse the global options and the extraction
# method; everything after <method> is collected in <args> and re-parsed
# against the method-specific usage text.
# NOTE: the entry point also accepts `--pages all` (every page of the
# document), which is not mentioned in the help text below.
doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10  [default: 1]
 -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
 -l, --log                 Print log to file.
 -o, --output <directory>  Output directory.

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""

# Usage text for the 'lattice' method. The script entry point parses it with
# docopt when <method> is 'lattice'. The --scale, --jtol and --mtol values
# are converted with int() before being passed to Lattice; --fill, --invert
# and --debug are passed through as parsed.
lattice_doc = """
Lattice method looks for lines between data to form a table.

usage:
 camelot lattice [options] [--] <file>

options:
 -F, --fill <fill>    Fill data in horizontal and/or vertical spanning
                      cells. Example: -F h, -F v, -F hv
 -s, --scale <scale>  Scaling factor. Large scaling factor leads to
                      smaller lines being detected. [default: 15]
 -j, --jtol <jtol>    Tolerance to account for when comparing joint
                      and line coordinates. [default: 2]
 -m, --mtol <mtol>    Tolerance to account for when merging lines
                      which are very close. [default: 2]
 -i, --invert         Invert pdf image to make sure that lines are
                      in foreground.
 -d, --debug <debug>  Debug by visualizing pdf geometry.
                      (contour,line,joint,table) Example: -d table
"""

# Usage text for the 'stream' method. The script entry point parses it with
# docopt when <method> is 'stream'. --ncols and --ytol are converted with
# int(), the three margins with float(); --columns is passed to Stream as the
# raw string. Unlike the lattice method, --debug is a plain flag here.
stream_doc = """
Stream method looks for spaces between data to form a table.

usage:
 camelot stream [options] [--] <file>

options:
 -n, --ncols <ncols>      Number of columns. [default: 0]
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                          grouped together to form a word. [default: 2.0]
 -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                          grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                          if distance between words is greater than word
                          margin. [default: 0.1]
 -d, --debug              Debug by visualizing textboxes.
"""


def convert_to_html(table):
    """Render a table as an HTML ``<table>`` fragment.

    Parameters
    ----------
    table : iterable
        Rows of the table; each row is an iterable of cell strings.

    Returns
    -------
    str
        A ``<table border="1">`` element with one ``<tr>`` per row and one
        ``<td>`` per cell, terminated by a newline.  Cell text is
        HTML-escaped (``&``, ``<`` and ``>``).
    """
    # html.escape only exists on Python 3; cgi.escape is the Python 2
    # equivalent.  Imported locally so the function is self-contained.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    # Collect the fragments and join once instead of rebuilding the whole
    # string for every cell.
    parts = ['<table border="1">\n']
    for row in table:
        parts.append(' <tr>\n')
        for cell in row:
            # Escape markup characters so that cell text such as "a < b"
            # cannot break (or inject into) the surrounding HTML.
            parts.extend(['  <td>', escape(cell, quote=False), '</td>\n'])
        parts.append(' </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)


def write_to_disk(data, f='csv', output=None, filename=None):
    """Write extracted tables to files inside an output directory.

    Parameters
    ----------
    data : dict
        Maps a page key to that page's tables.  ``data[page]`` is indexed
        by 0-based table position; each table is an iterable of rows.
    f : str
        Output format: 'csv', 'tsv', 'html', 'json' or 'xlsx'.
        csv/tsv produce one ``<page>_table_<n>.<f>`` file per table; the
        other formats produce a single file named after ``filename``.
    output : str
        Directory the files are written to.
    filename : str
        Path of the source document.  Its base name, minus extension, is
        used to name the html, json and xlsx files.

    Raises
    ------
    ValueError
        If ``output`` or ``filename`` is None, or ``f`` is not one of the
        supported formats.
    """
    if output is None or filename is None:
        raise ValueError('output and filename must both be provided')
    if f not in ('csv', 'tsv', 'html', 'json', 'xlsx'):
        # Fail loudly rather than silently writing nothing.
        raise ValueError('unsupported output format: {0!r}'.format(f))

    froot = os.path.splitext(os.path.basename(filename))[0]

    if f in ('csv', 'tsv'):
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page in sorted(data):
            for index in range(len(data[page])):
                dsvname = '{0}_table_{1}.{2}'.format(page, index + 1, f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    writer.writerows(data[page][index])
    elif f == 'html':
        htmlname = '{}.html'.format(froot)
        # Open once in write mode: appending per table meant that running
        # the tool twice duplicated every table in the existing file.
        with open(os.path.join(output, htmlname), 'w') as htmlfile:
            for page in sorted(data):
                for index in range(len(data[page])):
                    htmlfile.write(convert_to_html(data[page][index]))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    else:
        # xlsx support is optional; only the import itself is guarded so
        # that errors raised while saving are not mistaken for a missing
        # dependency.
        try:
            from pyexcel_xlsx import save_data
        except ImportError:
            print("xlsx output requires the pyexcel-xlsx package "
                  "(pip install pyexcel-xlsx)")
            return
        from collections import OrderedDict
        # One sheet per table, kept in page/table order.
        xlsx_data = OrderedDict()
        for page in sorted(data):
            for index in range(len(data[page])):
                sheet_name = '{0}_table_{1}'.format(page, index + 1)
                xlsx_data[sheet_name] = list(data[page][index])
        save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)


def parse_page_ranges(pages):
    """Convert a page specification into a list of page ranges.

    Parameters
    ----------
    pages : str
        Comma-separated page numbers and/or ``start-end`` ranges,
        e.g. ``'1,3-6,10'``.  The special value ``'all'`` is not handled
        here; the caller resolves it against the document.

    Returns
    -------
    list of dict
        One ``{'start': int, 'end': int}`` entry per comma-separated item;
        a single page number yields a range with equal start and end.

    Raises
    ------
    ValueError
        If an item is not an integer or a ``start-end`` pair of integers.
    """
    ranges = []
    for part in pages.split(','):
        if '-' in part:
            first, last = part.split('-')
        else:
            first = last = part
        ranges.append({'start': int(first), 'end': int(last)})
    return ranges


def main():
    """Parse the command line, extract tables and write them to disk."""
    start_time = time.time()

    # First pass: global options plus the method name.  Second pass: the
    # remaining arguments against the chosen method's own usage text.
    args = docopt(doc, version='0.1', options_first=True)
    method = args['<method>']
    argv = [method] + args['<args>']
    if method == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif method == 'stream':
        args.update(docopt(stream_doc, argv=argv))
    else:
        # Without this, an unknown method crashed later with a KeyError
        # on args['<file>'].
        sys.exit("'{0}' is not a camelot method. "
                 "See 'camelot -h'.".format(method))

    filename = args['<file>']
    filedir = os.path.dirname(filename)

    if args['--log']:
        # The log sits next to the input file unless an output directory
        # was given.
        logname = os.path.splitext(filename)[0] + '.log'
        if args['--output']:
            logname = os.path.join(args['--output'],
                                   os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', level=logging.DEBUG)

    if args['--pages'] == 'all':
        # Count the pages inside the with-block so the handle is closed
        # as soon as the number is known.
        with open(filename, 'rb') as pdf_file:
            num_pages = PdfFileReader(pdf_file, strict=False).getNumPages()
        p = [{'start': 1, 'end': num_pages}]
    else:
        try:
            p = parse_page_ranges(args['--pages'])
        except ValueError:
            sys.exit("Invalid page specification '{0}'. "
                     "Example: -p 1,3-6,10".format(args['--pages']))

    try:
        if method == 'lattice':
            extractor = Lattice(Pdf(filename, pagenos=p, clean=True),
                                fill=args['--fill'],
                                scale=int(args['--scale']),
                                jtol=int(args['--jtol']),
                                mtol=int(args['--mtol']),
                                invert=args['--invert'],
                                debug=args['--debug'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_geometry(args['--debug'])
        else:
            extractor = Stream(Pdf(filename, pagenos=p,
                                   char_margin=float(args['--cmargin']),
                                   line_margin=float(args['--lmargin']),
                                   word_margin=float(args['--wmargin']),
                                   clean=True),
                               ncolumns=int(args['--ncols']),
                               columns=args['--columns'],
                               ytol=int(args['--ytol']),
                               debug=args['--debug'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_text()
    except Exception as e:
        # Top-level boundary: record the traceback in the log (if one is
        # configured) and exit with a non-zero status and a visible
        # message.  repr() keeps the message safe for non-ASCII text.
        logging.exception(e)
        sys.exit('camelot: table extraction failed: {0!r}'.format(e))

    if data is None:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)

    elapsed = time.time() - start_time
    print("finished in {0} seconds".format(elapsed))
    logging.info("Time taken: %s seconds", elapsed)


if __name__ == '__main__':
    main()