camelot-py/tools/camelot

#!/usr/bin/env python2
from __future__ import print_function
import os
import sys
import time
import logging

from docopt import docopt
from PyPDF2 import PdfFileReader

from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream


# Top-level usage text handed to docopt by the entry point. It is parsed with
# options_first=True, so the options listed here must precede <method>;
# everything after <method> is collected into <args> and re-parsed against
# the method-specific usage text.
# NOTE(review): the entry point's page parser also accepts the literal value
# 'all' for --pages, which this help text does not mention.
doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10  [default: 1]
 -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
 -l, --log                 Print log to file.
 -V, --verbose             Verbose.
 -o, --output <directory>  Output directory.

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""

# docopt usage text for the 'lattice' method. The entry point parses the
# method name plus the remaining command-line arguments against this text and
# merges the result into the top-level argument dict.
# Here --debug takes a value naming the geometry to plot.
lattice_doc = """
Lattice method looks for lines between data to form a table.

usage:
 camelot lattice [options] [--] <file>

options:
 -F, --fill <fill>    Fill data in horizontal and/or vertical spanning
                      cells. Example: -F h, -F v, -F hv
 -s, --scale <scale>  Scaling factor. Large scaling factor leads to
                      smaller lines being detected. [default: 15]
 -j, --jtol <jtol>    Tolerance to account for when comparing joint
                      and line coordinates. [default: 2]
 -m, --mtol <mtol>    Tolerance to account for when merging lines
                      which are very close. [default: 2]
 -i, --invert         Invert pdf image to make sure that lines are
                      in foreground.
 -d, --debug <debug>  Debug by visualizing pdf geometry.
                      (contour,line,joint,table) Example: -d table
"""

# docopt usage text for the 'stream' method. The entry point parses the
# method name plus the remaining command-line arguments against this text and
# merges the result into the top-level argument dict.
# Unlike the lattice usage text, --debug is a plain flag here (no value).
stream_doc = """
Stream method looks for spaces between data to form a table.

usage:
 camelot stream [options] [--] <file>

options:
 -n, --ncols <ncols>      Number of columns. [default: 0]
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                          grouped together to form a word. [default: 2.0]
 -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                          grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                          if distance between words is greater than word
                          margin. [default: 0.1]
 -d, --debug              Debug by visualizing textboxes.
"""


def convert_to_html(table):
    """Render a table as an HTML ``<table>`` fragment.

    ``table`` is an iterable of rows and each row an iterable of cell
    strings. Every cell becomes one ``<td>``; the characters ``&``, ``<``
    and ``>`` in cell text are escaped so that extracted text cannot break
    the generated markup.

    Returns the fragment as a single string ending in a newline.
    """
    # Local import keeps the function usable on both Python 2 and Python 3.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    # Collect the pieces and join once instead of re-building the string for
    # every fragment.
    parts = ['<table border="1">\n']
    for row in table:
        parts.append(' <tr>\n')
        for data in row:
            parts.extend(['  <td>', escape(data, quote=False), '</td>\n'])
        parts.append(' </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)


def write_to_disk(data, f='csv', output=None, filename=None):
    """Write extracted tables to files inside the ``output`` directory.

    Arguments:
        data: mapping of page key -> list of tables, where each table is a
            list of rows and each row a list of cell values.
        f: output format. 'csv' and 'tsv' write one file per table named
            '<page>_table_<n>.<f>'; 'html', 'json' and 'xlsx' write a single
            file named after the input file. Any other value writes nothing.
        output: directory the files are written to.
        filename: path of the input file; its base name without extension
            names the html/json/xlsx output.

    Raises:
        ValueError: if ``output`` or ``filename`` is None.
    """
    if output is None or filename is None:
        raise ValueError("both 'output' and 'filename' must be given")

    fname = os.path.basename(filename)
    froot, __ = os.path.splitext(fname)
    if f in ['csv', 'tsv']:
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page in sorted(data):
            for number, table in enumerate(data[page], 1):
                dsvname = '{0}_table_{1}.{2}'.format(page, number, f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    for row in table:
                        writer.writerow(row)
    elif f == 'html':
        htmlname = '{}.html'.format(froot)
        # Open once in write mode: appending would pile new tables onto the
        # output of an earlier run for the same input file.
        with open(os.path.join(output, htmlname), 'w') as htmlfile:
            for page in sorted(data):
                for table in data[page]:
                    htmlfile.write(convert_to_html(table))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
        # pyexcel_xlsx is an optional dependency; only the import is guarded
        # so that unrelated errors while saving are not hidden.
        try:
            from pyexcel_xlsx import save_data
        except ImportError:
            print("link to install docs")
            return
        from collections import OrderedDict
        # One sheet per table, in page order.
        xlsx_data = OrderedDict()
        for page in sorted(data):
            for number, table in enumerate(data[page], 1):
                sheet_name = '{0}_table_{1}'.format(page, number)
                xlsx_data[sheet_name] = list(table)
        save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)


if __name__ == '__main__':
    # Command-line entry point: parse the arguments, extract tables with the
    # chosen method and write them to disk.
    start_time = time.time()

    # Parsing happens in two stages. The top-level usage text is parsed with
    # options_first=True so everything after <method> lands in <args>; that
    # remainder is then re-parsed against the method-specific usage text.
    args = docopt(doc, version='0.1', options_first=True)
    method = args['<method>']
    argv = [method] + args['<args>']
    if method == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif method == 'stream':
        args.update(docopt(stream_doc, argv=argv))
    else:
        # Only the method-specific parse provides '<file>'; stop with a
        # readable message (exit status 1) instead of failing on a KeyError.
        sys.exit("'{0}' is not a camelot method. "
                 "See 'camelot --help'.".format(method))

    vprint = print if args['--verbose'] else lambda *a, **k: None
    filename = args['<file>']
    filedir = os.path.dirname(filename)
    logname, __ = os.path.splitext(filename)
    logname += '.log'

    if args['--log']:
        # The log file sits next to the input file unless an output directory
        # was given, in which case it goes there.
        if args['--output']:
            logname = os.path.join(args['--output'], os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', level=logging.DEBUG)

    # Turn the --pages value into a list of inclusive page ranges.
    p = []
    if args['--pages'] == 'all':
        # Keep the file open while PyPDF2 reads the page count, then close it.
        with open(filename, 'rb') as pdf_file:
            infile = PdfFileReader(pdf_file, strict=False)
            p.append({'start': 1, 'end': infile.getNumPages()})
    else:
        for r in args['--pages'].split(','):
            if '-' in r:
                a, b = r.split('-')
                p.append({'start': int(a), 'end': int(b)})
            else:
                p.append({'start': int(r), 'end': int(r)})

    data = None
    try:
        if method == 'lattice':
            extractor = Lattice(Pdf(filename, pagenos=p, clean=True),
                                fill=args['--fill'],
                                scale=int(args['--scale']),
                                jtol=int(args['--jtol']),
                                mtol=int(args['--mtol']),
                                invert=args['--invert'],
                                debug=args['--debug'],
                                verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_geometry(args['--debug'])
        else:
            extractor = Stream(Pdf(filename, pagenos=p,
                                   char_margin=float(args['--cmargin']),
                                   line_margin=float(args['--lmargin']),
                                   word_margin=float(args['--wmargin']),
                                   clean=True),
                               ncolumns=int(args['--ncols']),
                               columns=args['--columns'],
                               ytol=int(args['--ytol']),
                               debug=args['--debug'],
                               verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_text()
    except Exception as e:
        # Top-level boundary: record the traceback and report failure to the
        # calling shell with a non-zero exit status.
        logging.exception(e)
        sys.exit(1)

    if data is None:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        # Default to writing next to the input file.
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)

    elapsed = time.time() - start_time
    vprint("finished in", elapsed, "seconds")
    logging.info("Time taken: " + str(elapsed) + " seconds")