220 lines
8.3 KiB
Python
Executable File
220 lines
8.3 KiB
Python
Executable File
#!/usr/bin/env python2
|
|
from __future__ import print_function
|
|
import os
|
|
import sys
|
|
import time
|
|
import logging
|
|
|
|
from docopt import docopt
|
|
from PyPDF2 import PdfFileReader
|
|
|
|
from camelot.pdf import Pdf
|
|
from camelot.lattice import Lattice
|
|
from camelot.stream import Stream
|
|
|
|
|
|
# Top-level usage text handed to docopt by the entry point. docopt parses this
# string at runtime, so the layout is significant: usage and option lines are
# indented, each option starts its own line, and an option is separated from
# its description by at least two spaces.
doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10  [default: 1]
 -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
 -l, --log                 Print log to file.
 -V, --verbose             Verbose.
 -o, --output <directory>  Output directory.

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""
|
|
|
|
# Usage text for the 'lattice' sub-command, parsed by docopt at runtime.
# Option lines are indented and separated from their descriptions by at least
# two spaces; continuation lines of a description must not start with '-'.
lattice_doc = """
Lattice method looks for lines between data to form a table.

usage:
 camelot lattice [options] [--] <file>

options:
 -F, --fill <fill>      Fill data in horizontal and/or vertical spanning
                        cells. Example: -F h, -F v, -F hv
 -s, --scale <scale>    Scaling factor. Large scaling factor leads to
                        smaller lines being detected. [default: 15]
 -j, --jtol <jtol>      Tolerance to account for when comparing joint
                        and line coordinates. [default: 2]
 -m, --mtol <mtol>      Tolerance to account for when merging lines
                        which are very close. [default: 2]
 -i, --invert           Invert pdf image to make sure that lines are
                        in foreground.
 -d, --debug <debug>    Debug by visualizing pdf geometry.
                        (contour,line,joint,table) Example: -d table
"""
|
|
|
|
# Usage text for the 'stream' sub-command, parsed by docopt at runtime.
# Option lines are indented and separated from their descriptions by at least
# two spaces; continuation lines of a description must not start with '-'.
stream_doc = """
Stream method looks for spaces between data to form a table.

usage:
 camelot stream [options] [--] <file>

options:
 -n, --ncols <ncols>      Number of columns. [default: 0]
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                          grouped together to form a word. [default: 2.0]
 -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                          grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                          if distance between words is greater than word
                          margin. [default: 0.1]
 -d, --debug              Debug by visualizing textboxes.
"""
|
|
|
|
|
|
def convert_to_html(table):
    """Render one table as an HTML ``<table>`` fragment.

    Parameters
    ----------
    table : iterable of iterables of str
        Rows of the table; every cell must be a string.

    Returns
    -------
    str
        The HTML markup, one ``<tr>`` per row and one ``<td>`` per cell,
        terminated by a newline.
    """
    # Collect the pieces and join once instead of rebuilding the whole
    # string for every cell.
    parts = ['<table border="1">\n']
    for row in table:
        parts.append(' <tr>\n')
        for data in row:
            # Cell text must not be interpreted as markup. '&' is replaced
            # first so the entities produced below are not escaped twice.
            escaped = (data.replace('&', '&amp;')
                           .replace('<', '&lt;')
                           .replace('>', '&gt;'))
            parts.extend((' <td>', escaped, '</td>\n'))
        parts.append(' </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)
|
|
|
|
|
|
def write_to_disk(data, f='csv', output=None, filename=None):
    """Write extracted tables into the ``output`` directory.

    Parameters
    ----------
    data : dict
        Maps a page key to the list of tables found on that page; each
        table is an iterable of rows.
    f : str
        Output format: 'csv', 'tsv', 'html', 'json' or 'xlsx'.
    output : str
        Directory the files are written to.
    filename : str
        Path of the source file; its base name (without extension) names
        the html, json and xlsx outputs.

    Raises
    ------
    ValueError
        If ``output`` or ``filename`` is None, or ``f`` is not a supported
        format.

    Notes
    -----
    csv/tsv output produces one file per table, named
    ``<page>_table_<n>.<ext>``. html, json and xlsx produce a single file
    that is overwritten on every call.
    """
    if filename is None or output is None:
        raise ValueError("both 'filename' and 'output' must be provided")

    froot, __ = os.path.splitext(os.path.basename(filename))

    if f in ('csv', 'tsv'):
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page in sorted(data):
            for index, table in enumerate(data[page], 1):
                dsvname = '{0}_table_{1}.{2}'.format(page, index, f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    writer.writerows(table)
    elif f == 'html':
        htmlname = '{}.html'.format(froot)
        # Open once in write mode: appending per table left the tables of
        # earlier runs in the file, duplicating them on every rerun.
        with open(os.path.join(output, htmlname), 'w') as htmlfile:
            for page in sorted(data):
                for table in data[page]:
                    htmlfile.write(convert_to_html(table))
    elif f == 'json':
        import json
        jsonname = '{}.json'.format(froot)
        with open(os.path.join(output, jsonname), 'w') as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
        # Only the optional dependency is guarded, so an ImportError raised
        # while saving is not mistaken for a missing package.
        try:
            from pyexcel_xlsx import save_data
        except ImportError:
            print("link to install docs")
            return
        from collections import OrderedDict
        xlsx_data = OrderedDict()
        for page in sorted(data):
            for index, table in enumerate(data[page], 1):
                sheet_name = '{0}_table_{1}'.format(page, index)
                xlsx_data[sheet_name] = list(table)
        save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
    else:
        raise ValueError("unsupported output format: {0!r}".format(f))
|
|
|
|
|
|
def parse_pages(pages, filename=None):
    """Turn the ``--pages`` option into a list of page ranges.

    Parameters
    ----------
    pages : str
        Either 'all' or a comma-separated list of page numbers and
        ``start-end`` ranges, e.g. '1,3-6,10'.
    filename : str, optional
        Path of the PDF; only read when ``pages`` is 'all', to find the
        number of pages.

    Returns
    -------
    list of dict
        One ``{'start': int, 'end': int}`` entry per requested range.

    Raises
    ------
    ValueError
        If an entry is not an integer or a ``start-end`` pair.
    """
    if pages == 'all':
        # Close the file as soon as the page count is known.
        with open(filename, 'rb') as pdf_file:
            total = PdfFileReader(pdf_file, strict=False).getNumPages()
        return [{'start': 1, 'end': total}]

    ranges = []
    for part in pages.split(','):
        if '-' in part:
            first, last = part.split('-')
        else:
            first = last = part
        ranges.append({'start': int(first), 'end': int(last)})
    return ranges


# Command-line entry point: parse the arguments, run the selected extraction
# method on the PDF and write the resulting tables to disk.
if __name__ == '__main__':
    start_time = time.time()

    # First pass only identifies the method; the method's own usage text
    # then parses the remaining arguments.
    args = docopt(doc, version='0.1', options_first=True)
    method = args['<method>']
    argv = [method] + args['<args>']
    if method == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif method == 'stream':
        args.update(docopt(stream_doc, argv=argv))
    else:
        # Without this guard an unknown method failed later with a KeyError
        # on '<file>'.
        sys.exit("'{0}' is not a camelot method. See 'camelot -h'.".format(
            method))

    vprint = print if args['--verbose'] else lambda *a, **k: None
    filename = args['<file>']
    filedir = os.path.dirname(filename)

    if args['--log']:
        # The log is named after the input file and lives next to it unless
        # an output directory was given.
        logname = os.path.splitext(filename)[0] + '.log'
        if args['--output']:
            logname = os.path.join(args['--output'], os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', level=logging.DEBUG)

    pages = parse_pages(args['--pages'], filename)

    try:
        if method == 'lattice':
            extractor = Lattice(Pdf(filename, pagenos=pages, clean=True),
                                fill=args['--fill'],
                                scale=int(args['--scale']),
                                jtol=int(args['--jtol']),
                                mtol=int(args['--mtol']),
                                invert=args['--invert'],
                                debug=args['--debug'],
                                verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_geometry(args['--debug'])
        else:
            extractor = Stream(Pdf(filename, pagenos=pages,
                                   char_margin=float(args['--cmargin']),
                                   line_margin=float(args['--lmargin']),
                                   word_margin=float(args['--wmargin']),
                                   clean=True),
                               ncolumns=int(args['--ncols']),
                               columns=args['--columns'],
                               ytol=int(args['--ytol']),
                               debug=args['--debug'],
                               verbose=args['--verbose'])
            data = extractor.get_tables()
            if args['--debug']:
                extractor.plot_text()
    except Exception as e:
        # Log the exception itself: 'e.message' is deprecated and empty for
        # many exception types. Exit non-zero so callers can see the failure.
        logging.exception(e)
        sys.exit(1)

    if data is None:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)

    elapsed = time.time() - start_time
    vprint("finished in", elapsed, "seconds")
    logging.info("Time taken: " + str(elapsed) + " seconds")
|