259 lines
9.5 KiB
Python
Executable File
259 lines
9.5 KiB
Python
Executable File
#!/usr/bin/env python2
|
|
import os
|
|
import re
|
|
import csv
|
|
import sys
|
|
import glob
|
|
import time
|
|
import shutil
|
|
import logging
|
|
import zipfile
|
|
import tempfile
|
|
import subprocess
|
|
from docopt import docopt
|
|
from werkzeug.utils import secure_filename
|
|
|
|
from lattice import lattice
|
|
from stream import stream
|
|
|
|
|
|
# Top-level usage text. It is handed to docopt, which parses the command line
# from it, so the layout is significant: usage and option lines are indented
# to stay inside their sections, and every option is separated from its
# description by at least two spaces (a single space would make docopt read
# the first word of the description as the option's argument).
doc = """
camelot parses tables from PDFs!

usage:
    camelot.py [options] <method> [<args>...]

options:
    -h, --help                Show this screen.
    -v, --version             Show version.
    -p, --pages <pageno>      Comma-separated list of page numbers.
                              Example: -p 1,3-6,10 [default: 1]
    -f, --format <format>     Output format. (csv,xlsx) [default: csv]
    -l, --log                 Print log to file.
    -o, --output <directory>  Output directory.

camelot methods:
    lattice  Looks for lines between data.
    stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""
|
|
|
|
# Usage text for the `lattice` sub-command, parsed by docopt. As with every
# docopt usage string, each option must be separated from its description by
# at least two spaces, and continuation lines must not start with a dash.
lattice_doc = """
Lattice method looks for lines between data to form a table.

usage:
    camelot.py lattice [options] [--] <file>

options:
    -F, --fill <fill>    Fill data in horizontal and/or vertical spanning
                         cells. Example: -F h, -F v, -F hv
    -s, --scale <scale>  Scaling factor. Large scaling factor leads to
                         smaller lines being detected. [default: 15]
    -j, --jtol <jtol>    Tolerance to account for when comparing joint
                         and line coordinates. [default: 2]
    -m, --mtol <mtol>    Tolerance to account for when merging lines
                         which are very close. [default: 2]
    -i, --invert         Invert pdf image to make sure that lines are
                         in foreground.
    -d, --debug <debug>  Debug by visualizing pdf geometry.
                         (contour,line,joint,table) Example: -d table
"""
|
|
|
|
# Usage text for the `stream` sub-command, parsed by docopt. As with every
# docopt usage string, each option must be separated from its description by
# at least two spaces, and continuation lines must not start with a dash.
stream_doc = """
Stream method looks for spaces between data to form a table.

usage:
    camelot.py stream [options] [--] <file>

options:
    -n, --ncols <ncols>      Number of columns. [default: 0]
    -c, --columns <columns>  Comma-separated list of column x-coordinates.
                             Example: -c 10.1,20.2,30.3
    -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
                             grouped together to form a word. [default: 2.0]
    -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
                             grouped together to form a textbox. [default: 0.5]
    -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
                             if distance between words is greater than word
                             margin. [default: 0.1]
    -d, --debug              Debug by visualizing textboxes.
"""
|
|
|
|
# Matches every run of decimal digits in a file name.
pno = re.compile(r'\d+')


def filesort(filepath):
    """Return a numeric sort key for a generated per-page file.

    Only the base name of *filepath* is inspected, so digits in the
    directory part never influence the ordering.  The first two numbers
    found in the name are returned as a tuple of ints; a missing number
    counts as 0, so names compare numerically (page 10 sorts after
    page 2) instead of lexicographically.

    Examples of the mapping:
        'pg-3.csv'    -> (3, 0)
        'pg-3_1.csv'  -> (3, 1)
        'notes.csv'   -> (0, 0)
    """
    filename = os.path.basename(filepath)
    numbers = [int(n) for n in pno.findall(filename)]
    # Pad so that names with fewer than two numbers (including none at
    # all, which used to raise IndexError) still produce a 2-tuple.
    numbers.extend([0, 0])
    return (numbers[0], numbers[1])
|
|
|
|
|
|
def parse_pages(spec):
    """Expand a ``--pages`` value into a list of page-number strings.

    *spec* is a comma-separated list of page numbers and inclusive
    ranges, e.g. ``'1,3-6,10'``, or the literal ``'all'``.

    Returns ``['all']`` for ``'all'``, ``['1']`` when *spec* is empty or
    None, and otherwise the de-duplicated pages in ascending numeric
    order, each as a string (the form ``pdfseparate`` is called with).

    Raises ValueError when an entry is not a number or a ``first-last``
    range.
    """
    if not spec:
        return ['1']
    if spec == 'all':
        return ['all']
    pages = set()
    for part in spec.split(','):
        part = part.strip()
        if '-' in part:
            first, last = part.split('-')
            pages.update(range(int(first), int(last) + 1))
        else:
            pages.add(int(part))
    # Sort numerically; sorting the strings would put '10' before '2'.
    return [str(number) for number in sorted(pages)]


def _write_csv(path, rows):
    """Write *rows* (iterables of unicode cells) to *path* as UTF-8 CSV."""
    with open(path, 'w') as outfile:
        writer = csv.writer(outfile)
        for row in rows:
            writer.writerow([cell.encode('utf-8') for cell in row])
    print("saved as %s" % os.path.basename(path))
    print("")


def _deliver(artifact, logfname, destdir, with_log):
    """Copy the finished *artifact*, and the log if requested, to *destdir*."""
    shutil.copy(artifact, destdir)
    if with_log:
        shutil.copy(logfname, destdir)


def main():
    """Command-line entry point: split a PDF into pages and extract tables.

    Parses the command line with docopt, separates the requested pages
    with ``pdfseparate`` into a temporary directory, runs the chosen
    method (``lattice`` or ``stream``) on every page, and copies the
    result (a single CSV, a zip of CSVs, or an xlsx workbook) to the
    ``--output`` directory, or next to the input file when no output
    directory is given.  The temporary directory is removed on every
    exit path.
    """
    start_time = time.time()

    args = docopt(doc, version='0.1', options_first=True)
    method = args['<method>']
    argv = [method] + args['<args>']
    if method == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif method == 'stream':
        args.update(docopt(stream_doc, argv=argv))
    else:
        # Without a sub-command parse there is no '<file>' key to work with.
        print("unknown method '%s', see 'camelot -h'" % method)
        sys.exit(1)

    if args['--format'] not in ('csv', 'xlsx'):
        print("unsupported output format '%s'" % args['--format'])
        sys.exit(1)

    try:
        pages = parse_pages(args['--pages'])
    except ValueError:
        print("invalid page list '%s'" % args['--pages'])
        sys.exit(1)

    fname = secure_filename(os.path.basename(args['<file>']))
    # dirname() is '' for a bare file name; copying to '' would fail.
    fdir = os.path.dirname(args['<file>']) or os.curdir
    froot, fext = os.path.splitext(fname)
    if fext.lower() != '.pdf':
        print("camelot can parse only pdfs right now")
        sys.exit()
    destdir = args['--output'] or fdir

    # Created only after validation so that early exits leave nothing behind.
    tmpdir = tempfile.mkdtemp()
    try:
        logfname = os.path.join(tmpdir, froot + '.log')
        logging.basicConfig(filename=logfname, filemode='w',
                            level=logging.DEBUG)

        pdfpath = os.path.join(tmpdir, fname)
        shutil.copy(args['<file>'], pdfpath)
        print("separating pdf into pages")
        print("")
        if pages == ['all']:
            subprocess.call(['pdfseparate', pdfpath,
                             os.path.join(tmpdir, 'pg-%d.pdf')])
        else:
            for page in pages:
                subprocess.call(['pdfseparate', '-f', page, '-l', page,
                                 pdfpath,
                                 os.path.join(tmpdir, 'pg-%s.pdf' % page)])

        glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')),
                          key=filesort)
        print("using the %s method" % method)
        for g in glob_pdf:
            g_fname = os.path.basename(g)
            g_froot, __ = os.path.splitext(g)
            try:
                if method == 'lattice':
                    print("converting %s to image" % g_fname)
                    # Argument list instead of a shell string: paths with
                    # spaces or shell metacharacters are passed verbatim.
                    subprocess.call(['convert', '-density', '300', g,
                                     '-depth', '8', g_froot + '.png'])
                    data = lattice(g, f=args['--fill'],
                                   s=int(args['--scale']),
                                   jtol=int(args['--jtol']),
                                   mtol=int(args['--mtol']),
                                   invert=args['--invert'],
                                   debug=args['--debug'])
                else:
                    data = stream(g, ncolumns=int(args['--ncols']),
                                  columns=args['--columns'],
                                  char_margin=float(args['--cmargin']),
                                  line_margin=float(args['--lmargin']),
                                  word_margin=float(args['--wmargin']),
                                  debug=args['--debug'])
                if data is None:
                    print("")
                    print("See 'camelot %s -h' for various parameters "
                          "you can tweak." % method)
                    # SystemExit is not an Exception subclass, so it is not
                    # swallowed by the handler below.
                    sys.exit()
                if method == 'lattice':
                    # One CSV per key of the mapping returned by lattice().
                    for k in sorted(data.keys()):
                        _write_csv(g_froot + '_%s.csv' % k, data[k])
                else:
                    _write_csv(g_froot + '.csv', data)
            except Exception:
                # A failing page must not abort the remaining pages.
                logging.exception("")
                print("couldn't parse %s see log for more info" % g_fname)
                print("")

        glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')),
                          key=filesort)

        # Logged before the log file is copied out and its directory is
        # removed; otherwise the entry could never reach the user.
        logging.info("Time taken for %s: %s seconds", fname,
                     time.time() - start_time)

        if args['--format'] == 'csv':
            if len(glob_csv) == 1:
                _deliver(glob_csv[0], logfname, destdir, args['--log'])
            else:
                zipname = froot + '.zip'
                zippath = os.path.join(tmpdir, zipname)
                print("zipping 'em up")
                with zipfile.ZipFile(zippath, 'w',
                                     zipfile.ZIP_DEFLATED) as myzip:
                    for g in glob_csv:
                        myzip.write(g, os.path.join(froot,
                                                    os.path.basename(g)))
                _deliver(zippath, logfname, destdir, args['--log'])
            print("")
        else:
            # Imported lazily so csv output works without these packages.
            from pyexcel_xlsx import save_data
            from collections import OrderedDict
            data = OrderedDict()
            for c in glob_csv:
                c_fname = os.path.basename(c)
                print("adding %s to excel file" % c_fname)
                with open(c, 'r') as csvfile:
                    reader = csv.reader(csvfile)
                    c_froot, __ = os.path.splitext(c_fname)
                    data[c_froot] = list(reader)
            xlsxname = froot + '.xlsx'
            xlsxpath = os.path.join(tmpdir, xlsxname)
            save_data(xlsxpath, data)
            _deliver(xlsxpath, logfname, destdir, args['--log'])
            print("")
            print("saved as %s" % xlsxname)

        print("cleaning up...")
    finally:
        # Close the log file handle before deleting its directory.
        logging.shutdown()
        shutil.rmtree(tmpdir, ignore_errors=True)

    print("finished in %s seconds" % (time.time() - start_time))


if __name__ == '__main__':
    main()
|