camelot-py/tools/camelot

220 lines
8.3 KiB
Python
Executable File

#!/usr/bin/env python2
from __future__ import print_function
import os
import sys
import time
import logging
from docopt import docopt
from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream
# Help texts parsed by docopt.  Layout is significant, not cosmetic:
#   * only *indented* lines following "usage:" / "options:" belong to that
#     section, so every usage pattern, option and continuation line must
#     start with whitespace;
#   * an option and its description must be separated by at least two
#     spaces, otherwise the description words are read as option arguments
#     and "[default: ...]" values are ignored.

# Top-level interface: global options plus the extraction method to run.
# Everything after <method> is handed to the method-specific parser below.
doc = """
Camelot: PDF parsing made simpler!
usage:
    camelot [options] <method> [<args>...]
options:
    -h, --help                Show this screen.
    -v, --version             Show version.
    -p, --pages <pageno>      Comma-separated list of page numbers.
                              Example: -p 1,3-6,10 [default: 1]
    -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
    -l, --log                 Print log to file.
    -V, --verbose             Verbose.
    -o, --output <directory>  Output directory.
camelot methods:
    lattice  Looks for lines between data.
    stream   Looks for spaces between data.
See 'camelot <method> -h' for more information on a specific method.
"""

# Options accepted by 'camelot lattice'.
lattice_doc = """
Lattice method looks for lines between data to form a table.
usage:
    camelot lattice [options] [--] <file>
options:
    -F, --fill <fill>    Fill data in horizontal and/or vertical spanning
                         cells. Example: -F h, -F v, -F hv
    -s, --scale <scale>  Scaling factor. Large scaling factor leads to
                         smaller lines being detected. [default: 15]
    -j, --jtol <jtol>    Tolerance to account for when comparing joint
                         and line coordinates. [default: 2]
    -m, --mtol <mtol>    Tolerance to account for when merging lines
                         which are very close. [default: 2]
    -i, --invert         Invert pdf image to make sure that lines are
                         in foreground.
    -d, --debug <debug>  Debug by visualizing pdf geometry.
                         (contour,line,joint,table) Example: -d table
"""

# Options accepted by 'camelot stream'.
stream_doc = """
Stream method looks for spaces between data to form a table.
usage:
    camelot stream [options] [--] <file>
options:
    -n, --ncols <ncols>      Number of columns. [default: 0]
    -c, --columns <columns>  Comma-separated list of column x-coordinates.
                             Example: -c 10.1,20.2,30.3
    -y, --ytol <ytol>        Tolerance to account for when grouping rows
                             together. [default: 2]
    -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                             grouped together to form a word. [default: 2.0]
    -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                             grouped together to form a textbox. [default: 0.5]
    -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                             if distance between words is greater than word
                             margin. [default: 0.1]
    -d, --debug              Debug by visualizing textboxes.
"""
def convert_to_html(table):
    """Render a table as an HTML ``<table>`` fragment.

    ``table`` is an iterable of rows and each row an iterable of cell
    strings.  Every row becomes a ``<tr>`` and every cell a ``<td>``.
    The characters ``&``, ``<`` and ``>`` in cell text are escaped so that
    extracted text cannot break or inject markup.

    Returns the fragment as a single string ending in a newline.
    """
    # Collect the pieces and join once instead of rebuilding the whole
    # string for every cell.
    parts = ['<table border="1">\n']
    for row in table:
        parts.append(' <tr>\n')
        for cell in row:
            # '&' must be replaced first, otherwise the entities produced
            # for '<' and '>' would be escaped a second time.
            text = (cell.replace('&', '&amp;')
                        .replace('<', '&lt;')
                        .replace('>', '&gt;'))
            parts.extend([' <td>', text, '</td>\n'])
        parts.append(' </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)
def write_to_disk(data, f='csv', output=None, filename=None):
    """Write extracted tables to ``output`` in the requested format.

    data     -- dict mapping a page key to the list of tables found on
                that page; each table is a list of rows.  Pages are
                written in sorted key order.
    f        -- one of 'csv', 'tsv', 'html', 'json', 'xlsx'.
    output   -- directory the files are written to.
    filename -- path of the source document; its base name without
                extension names the html, json and xlsx files.

    csv/tsv write one '<page>_table_<n>.<f>' file per table; html, json
    and xlsx write a single file that is overwritten on every call.

    Raises ValueError when ``output`` or ``filename`` is None or when
    ``f`` is not a supported format.
    """
    if filename is None or output is None:
        raise ValueError("both 'filename' and 'output' must be given")
    if f not in ('csv', 'tsv', 'html', 'json', 'xlsx'):
        # Previously an unknown format silently wrote nothing.
        raise ValueError('unsupported output format: {0!r}'.format(f))

    froot = os.path.splitext(os.path.basename(filename))[0]

    if f in ('csv', 'tsv'):
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page in sorted(data):
            for number, table in enumerate(data[page], 1):
                dsvname = '{0}_table_{1}.{2}'.format(page, number, f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    writer.writerows(table)
    elif f == 'html':
        htmlname = '{}.html'.format(froot)
        # Open once in write mode: appending per table left the tables of
        # earlier runs in the file.
        with open(os.path.join(output, htmlname), 'w') as htmlfile:
            for page in sorted(data):
                for table in data[page]:
                    htmlfile.write(convert_to_html(table))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    else:  # 'xlsx'
        # Only the optional dependency is guarded, so an ImportError raised
        # while actually saving is not mistaken for a missing package.
        try:
            from pyexcel_xlsx import save_data
        except ImportError:
            # TODO: replace this placeholder with the real install link.
            print("link to install docs")
            return
        from collections import OrderedDict
        xlsx_data = OrderedDict()
        for page in sorted(data):
            for number, table in enumerate(data[page], 1):
                sheet_name = '{0}_table_{1}'.format(page, number)
                xlsx_data[sheet_name] = list(table)
        save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
def _parse_page_ranges(spec):
    """Convert a page spec such as '1,3-6,10' into a list of page ranges.

    Each comma-separated item is either a single page number or a
    'first-last' pair.  Returns one {'start': int, 'end': int} dict per
    item, in the order given.  Raises ValueError when an item is not made
    of integers.
    """
    ranges = []
    for item in spec.split(','):
        first, dash, last = item.partition('-')
        if not dash:
            # A lone page number is a range that starts and ends on itself.
            last = first
        ranges.append({'start': int(first), 'end': int(last)})
    return ranges


def main():
    """Command-line entry point: parse options, extract tables, write them.

    Exits with a message when the method or the --pages value is invalid,
    and with status 1 when extraction raises.
    """
    start_time = time.time()

    # First pass reads the global options; the remaining arguments are
    # re-parsed against the help text of the chosen method.
    args = docopt(doc, version='0.1', options_first=True)
    method = args['<method>']
    argv = [method] + args['<args>']
    if method == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif method == 'stream':
        args.update(docopt(stream_doc, argv=argv))
    else:
        # Without this the script failed later with KeyError: '<file>'.
        sys.exit("'{0}' is not a camelot method. "
                 "See 'camelot --help'.".format(method))

    vprint = print if args['--verbose'] else lambda *a, **k: None
    filename = args['<file>']
    filedir = os.path.dirname(filename)

    if args['--log']:
        # The log sits next to the input file unless an output directory
        # was requested.
        logname = os.path.splitext(filename)[0] + '.log'
        if args['--output']:
            logname = os.path.join(
                args['--output'], os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', level=logging.DEBUG)

    if args['--pages'] == 'all':
        # Close the PDF again once the page count is known.
        with open(filename, 'rb') as pdf:
            total = PdfFileReader(pdf, strict=False).getNumPages()
        pages = [{'start': 1, 'end': total}]
    else:
        try:
            pages = _parse_page_ranges(args['--pages'])
        except ValueError:
            sys.exit("Invalid --pages value '{0}'. "
                     "Example: -p 1,3-6,10".format(args['--pages']))

    try:
        if method == 'lattice':
            extractor = Lattice(Pdf(filename, pagenos=pages, clean=True),
                                fill=args['--fill'],
                                scale=int(args['--scale']),
                                jtol=int(args['--jtol']),
                                mtol=int(args['--mtol']),
                                invert=args['--invert'],
                                debug=args['--debug'],
                                verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_geometry(args['--debug'])
        else:
            extractor = Stream(Pdf(filename, pagenos=pages,
                                   char_margin=float(args['--cmargin']),
                                   line_margin=float(args['--lmargin']),
                                   word_margin=float(args['--wmargin']),
                                   clean=True),
                               ncolumns=int(args['--ncols']),
                               columns=args['--columns'],
                               ytol=int(args['--ytol']),
                               debug=args['--debug'],
                               verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_text()
    except Exception as e:
        # Top-level boundary: record the traceback and signal failure to
        # the calling shell with a non-zero status.
        logging.exception(e)
        sys.exit(1)

    if data is None:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)

    vprint("finished in", time.time() - start_time, "seconds")
    logging.info("Time taken: " + str(time.time() - start_time) + " seconds")


if __name__ == '__main__':
    main()