#!/usr/bin/env python2

from __future__ import print_function

import os
import sys
import time
import logging
import warnings

import numpy as np
from docopt import docopt
from collections import Counter
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileReader

from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream
|
|
|
# docopt usage string for the top-level command; parsed in __main__ with
# options_first=True so method-specific flags pass through into <args>.
doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -V, --verbose             Verbose.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10 [default: 1]
 -P, --parallel            Parallelize the parsing process.
 -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
 -l, --log                 Log to file.
 -o, --output <directory>  Output directory.
 -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
                           grouped together to form a word. [default: 1.0]
 -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
                           grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                           if distance between words is greater than word
                           margin. [default: 0.1]
 -S, --print-stats         List stats on the parsing process.
 -T, --save-stats          Save stats to a file.
 -X, --plot <dist>         Plot distributions. (page,all,rc)

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""

# docopt usage string for 'camelot lattice'.
lattice_doc = """
Lattice method looks for lines between text to form a table.

usage:
 camelot lattice [-t <tarea>...] [-F <fill>...]
                 [-m <mtol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>  Specific table areas to analyze.
 -F, --fill <fill>    Fill data in horizontal and/or vertical spanning
                      cells. Example: -F h, -F v, -F hv
 -m, --mtol <mtol>    Tolerance to account for when merging lines
                      which are very close. [default: 2]
 -s, --scale <scale>  Scaling factor. Large scaling factor leads to
                      smaller lines being detected. [default: 15]
 -i, --invert         Invert pdf image to make sure that lines are
                      in foreground.
 -d, --debug <debug>  Debug by visualizing pdf geometry.
                      (contour,line,joint,table) Example: -d table
"""

# docopt usage string for 'camelot stream'.
stream_doc = """
Stream method looks for whitespaces between text to form a table.

usage:
 camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...]
                [-m <mtol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>      Specific table areas to analyze.
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -n, --ncols <ncols>      Number of columns. [default: -1]
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -m, --mtol <mtol>        Tolerance to account for when merging columns
                          together. [default: 0]
 -d, --debug              Debug by visualizing textboxes.
"""
|
|
|
|
|
|
def plot_table_barchart(r, c, p, pno, tno):
    """Save a two-panel bar chart for one table as '<pno>_<tno>.png'.

    Top panel: non-empty cell count per row; bottom panel: non-empty
    cell count per column. `p` (percentage of empty cells) goes in the
    title; `pno`/`tno` are the page/table key strings used in the
    output filename.
    """
    bar_width = 0.7
    # 1-based tick labels for rows/columns
    row_labels = list(range(1, len(r) + 1))
    col_labels = list(range(1, len(c) + 1))
    row_pos = np.arange(len(r))
    col_pos = np.arange(len(c))

    plt.figure(figsize=(8, 6))

    # top panel: rows
    plt.subplot(2, 1, 1)
    plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
    plt.xlabel('row index')
    plt.ylabel('number of non-empty cells in row')
    plt.bar(row_pos, r)
    plt.xticks(row_pos + bar_width * 0.5, row_labels)
    plt.ylim(0, len(c))

    # bottom panel: columns
    plt.subplot(2, 1, 2)
    plt.xlabel('column index')
    plt.ylabel('number of non-empty cells in column')
    plt.bar(col_pos, c)
    plt.xticks(col_pos + bar_width * 0.5, col_labels)
    plt.ylim(0, len(r))

    plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
|
|
|
|
|
|
def plot_all_barchart(data, output):
    """Save a bar chart of the row-fill distribution as '<output>_all.png'.

    For every row of every parsed table, compute the fraction of
    non-empty cells in that row; then plot, for each distinct fraction,
    the share of rows that had it.

    Parameters
    ----------
    data : dict
        Mapping of page key -> {table key -> table dict}; each table
        dict must provide 'ncols' and 'r_nempty_cells' (per-row
        non-empty cell counts).
    output : string
        Path root for the saved png.
    """
    r_empty_cells = []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            # fraction of non-empty cells in each row of this table
            r_empty_cells.extend([r / float(table['ncols'])
                                  for r in table['r_nempty_cells']])
    c = Counter(r_empty_cells)
    # anchor the x-axis at both extremes so 0% and 100% always appear
    if 0.0 not in c:
        c.update({0.0: 0})
    if 1.0 not in c:
        c.update({1.0: 0})

    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    # BUGFIX: hoist the total out of the comprehension — the original
    # recomputed sum(c.values()) once per bucket (quadratic in buckets).
    total_rows = float(sum(c.values()))
    row_p = [count / total_rows for count in c.values()]
    plt.bar(c.keys(), row_p, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)
|
|
|
|
|
|
def plot_rc_piechart(data, output):
    """Save pie charts of row/column-count distributions as '<output>_rc.png'.

    Top pie: how many tables have each row count; bottom pie: how many
    tables have each column count.
    """
    from matplotlib import cm

    tables = 0
    rows, cols = [], []
    for page in data.values():
        for table in page.values():
            tables += 1
            rows.append(table['nrows'])
            cols.append(table['ncols'])

    row_counts = Counter(rows)
    col_counts = Counter(cols)

    plt.figure(figsize=(8, 6))

    # one Set1 color per distinct row count
    row_colors = cm.Set1(np.arange(len(row_counts)) / float(len(row_counts)))
    top = plt.subplot(211, aspect='equal')
    top.pie(row_counts.values(), colors=row_colors,
            labels=row_counts.keys(), startangle=90)
    top.set_title('row distribution across tables')

    # one Set1 color per distinct column count
    col_colors = cm.Set1(np.arange(len(col_counts)) / float(len(col_counts)))
    bottom = plt.subplot(212, aspect='equal')
    bottom.pie(col_counts.values(), colors=col_colors,
               labels=col_counts.keys(), startangle=90)
    bottom.set_title('column distribution across tables')

    plt.savefig(''.join([output, '_rc.png']), dpi=300)
|
|
|
|
|
|
def print_stats(data, p_time):
    """Print a summary of the parsing run: page/table counts, timing,
    average table score, and best-guess headers for tables that continue
    across consecutive pages (detected by equal column counts).

    data keys look like 'page-<n>' -> {'table-<m>' -> table dict}; the
    int slicing below (x[0][5:], x[1][6:]) depends on exactly that
    format. p_time is the total processing time in seconds.
    """
    from operator import itemgetter
    from itertools import groupby

    scores = []
    continuous_tables = []
    total_tables = 0
    # collect (page_key, table_key, ncols) per table plus all scores
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number, table['ncols']))
            scores.append(table['score'])
    # NOTE(review): np.mean([]) warns and yields nan when no tables found
    avg_score = np.mean(scores)

    ct_pages = []
    header_string = ""
    if len(continuous_tables) > 1:
        # sort by (page number, table number) parsed out of the key strings
        tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
        # group tables by column count; tables with the same ncols on
        # consecutive pages are treated as one multi-page table
        for k, g in groupby(tables, key=itemgetter(2)):
            g = list(g)
            tables_same_ncols = set([int(t[0][5:]) for t in g])
            tables_same_ncols = sorted(list(tables_same_ncols))
            # classic consecutive-run grouping: i - x is constant within a run
            # NOTE: tuple-parameter lambda is Python 2-only syntax
            for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
                G = list(G)
                # (first page, last page) of this run, as strings
                ct_pages.append((str(G[0][1]), str(G[-1][1])))

        result_headers = []
        for ct in ct_pages:
            # header_idx: row tuple -> list of row indices where it occurs
            header_idx = {}
            possible_headers = []
            ncols = 0
            for page_number in range(int(ct[0]), int(ct[1]) + 1):
                page = data['page-{0}'.format(page_number)]
                for table_number in page.keys():
                    table = page[table_number]
                    ncols = table['ncols']
                    for i, row in enumerate(table['data']):
                        try:
                            header_idx[tuple(row)].append(i)
                        except KeyError:
                            header_idx[tuple(row)] = [i]
            # top 10 most frequently repeated rows are header candidates
            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
            # keep only candidates with no empty cells (inner filter drops
            # '' cells; outer filter keeps full-width rows). Python 2
            # semantics: filter on a tuple returns a tuple.
            possible_headers = filter(lambda z: len(z) == ncols,
                                      [filter(lambda x: x != '', p_h) for p_h in possible_headers])
            modes = []
            for p_h in possible_headers:
                try:
                    # most common row index at which this candidate appears
                    modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
                except KeyError:
                    # candidate was altered by the '' filter above; skip it
                    pass
            # header = candidate whose typical position is closest to the top
            # NOTE(review): min() raises ValueError when modes is empty
            header = modes[modes.index(min(modes, key=lambda x: x[1]))][0]
            result_headers.append(header)

        header_string = "Multi-page table headers*:\n"
        header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
            '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
            ct_pages, result_headers)])])

    # per-page timing / multi-page listing only make sense for 2+ pages
    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
        p_time / float(len(data))) if len(data) not in [0, 1] else ""
    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) not in [0, 1] else ""
    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))

    print(''.join([stat_string, header_string]))
|
|
|
|
|
|
def convert_to_html(table):
    """Render a table (list of rows, each a list of cell strings) as a
    bordered HTML <table> string.

    Cell text is inserted verbatim — no HTML escaping is performed, so
    callers must not pass untrusted markup.

    Returns the HTML as a single string ending in a newline.
    """
    # PERF: accumulate fragments in a list and join once — the original
    # rebuilt the whole string with ''.join on every append (quadratic).
    parts = ['<table border="1">\n']
    for row in table:
        parts.append(' <tr>\n')
        for data in row:
            parts.extend([' <td>', data, '</td>\n'])
        parts.append(' </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)
|
|
|
|
|
|
def write_to_disk(data, f='csv', output=None, filename=None):
    """Write extracted tables to disk in the requested format.

    Parameters
    ----------
    data : dict
        Mapping 'page-<n>' -> {'table-<m>' -> {'data': list of rows, ...}}.
    f : string
        Output format: 'csv', 'tsv', 'html', 'json' or 'xlsx'.
    output : string
        Directory the output file(s) are written into.
    filename : string
        Path of the source pdf; its basename roots the output filenames.

    Raises
    ------
    ValueError
        If output or filename is None (resolves the original TODO).
    """
    if output is None or filename is None:
        raise ValueError("both 'output' and 'filename' must be provided")

    # BUGFIX: sort keys numerically like the xlsx branch already did —
    # lexicographic sort puts 'page-10' before 'page-2', which scrambled
    # the table order in the appended html output.
    def _page_key(k):
        return int(k[5:])   # keys look like 'page-<n>'

    def _table_key(k):
        return int(k[6:])   # keys look like 'table-<m>'

    fname = os.path.basename(filename)
    froot, __ = os.path.splitext(fname)
    if f in ['csv', 'tsv']:
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        # one delimited file per table: '<page>_<table>.<ext>'
        for page_number in sorted(data.keys(), key=_page_key):
            for table_number in sorted(data[page_number].keys(), key=_table_key):
                dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    for row in data[page_number][table_number]['data']:
                        writer.writerow(row)
    elif f == 'html':
        # all tables appended into a single '<froot>.html', in page order
        htmlname = '{0}.html'.format(froot)
        for page_number in sorted(data.keys(), key=_page_key):
            for table_number in sorted(data[page_number].keys(), key=_table_key):
                with open(os.path.join(output, htmlname), 'a') as htmlfile:
                    htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
    elif f == 'json':
        import json
        # whole result dict serialized as-is
        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
        try:
            from pyexcel_xlsx import save_data
            from collections import OrderedDict
            # one sheet per table, named '<page>_<table>'
            xlsx_data = OrderedDict()
            for page_number in sorted(data.keys(), key=_page_key):
                for table_number in sorted(data[page_number].keys(), key=_table_key):
                    sheet_name = ''.join([page_number, '_', table_number])
                    xlsx_data.update({sheet_name:
                        [row for row in data[page_number][table_number]['data']]})
            save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
        except ImportError:
            # pyexcel_xlsx is an optional dependency
            print("link to install docs")
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse the global options, dispatch to the
    # chosen camelot method (lattice/stream), run extraction, then emit
    # optional plots/stats and the chosen output format.
    start_time = time.time()

    # options_first=True lets method-specific flags fall through into <args>
    args = docopt(doc, version='0.1', options_first=True)
    argv = [args['<method>']] + args['<args>']
    if args['<method>'] == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif args['<method>'] == 'stream':
        args.update(docopt(stream_doc, argv=argv))

    # vprint is a no-op unless --verbose was given
    vprint = print if args['--verbose'] else lambda *a, **k: None
    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
    # output names derived from the input path:
    # <file>.log, <file>_info.csv, and <file> as the png name root
    logname, __ = os.path.splitext(filename)
    logname = ''.join([logname, '.log'])
    scorename, __ = os.path.splitext(filename)
    scorename = ''.join([scorename, '_info.csv'])
    pngname, __ = os.path.splitext(filename)

    if args['--log']:
        FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
        if args['--output']:
            # redirect the log file into the output directory
            logname = os.path.join(args['--output'], os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)

    # build the list of page ranges as [{'start': n, 'end': m}, ...]
    p = []
    if args['--pages'] == '1':
        p.append({'start': 1, 'end': 1})
    else:
        if args['--pages'] == 'all':
            # NOTE(review): file handle opened here is never closed
            infile = PdfFileReader(open(filename, 'rb'), strict=False)
            p.append({'start': 1, 'end': infile.getNumPages()})
        else:
            # e.g. '-p 1,3-6,10' -> three range dicts
            for r in args['--pages'].split(','):
                if '-' in r:
                    a, b = r.split('-')
                    p.append({'start': int(a), 'end': int(b)})
                else:
                    p.append({'start': int(r), 'end': int(r)})

    # (char, line, word) margins forwarded to the layout engine
    margins = (float(args['--cmargin']), float(args['--lmargin']),
               float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            # empty docopt lists/None become None so the defaults apply
            tarea = args['--tarea'] if args['--tarea'] else None
            fill = args['--fill'] if args['--fill'] else None
            mtol = [int(m) for m in args['--mtol']]
            manager = Pdf(Lattice(table_area=tarea, fill=fill,
                                  mtol=mtol, scale=int(args['--scale']),
                                  invert=args['--invert'], margins=margins,
                                  debug=args['--debug']),
                          filename,
                          pagenos=p,
                          parallel=args['--parallel'],
                          clean=True)
            data = manager.extract()

            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                # --plot takes a comma-separated subset of page,all,rc
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    # per-table bar charts, pages/tables in numeric order
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                # one csv row per table with its quality metrics
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            # NOTE: Exception.message is Python 2-only (removed in py3)
            logging.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            tarea = args['--tarea'] if args['--tarea'] else None
            columns = args['--columns'] if args['--columns'] else None
            # -1 is the 'unspecified' sentinel for --ncols
            if args['--ncols'] and args['--ncols'] != ['-1']:
                ncolumns = [int(nc) for nc in args['--ncols']]
            else:
                ncolumns = None
            ytol = [int(y) for y in args['--ytol']]
            mtol = [int(m) for m in args['--mtol']]
            manager = Pdf(Stream(table_area=tarea, columns=columns,
                                 ncolumns=ncolumns, ytol=ytol, mtol=mtol,
                                 margins=margins, debug=args['--debug']),
                          filename,
                          pagenos=p,
                          parallel=args['--parallel'],
                          clean=True)
            data = manager.extract()

            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")

            # same plotting/stats pipeline as the lattice branch above
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    # NOTE(review): header has an empty column name
                    # ('empty_p,,score') yet rows below write 5 fields —
                    # looks like a leftover from the lattice header; confirm
                    score_file.write('table,nrows,ncols,empty_p,,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['score']))

            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            # NOTE: Exception.message is Python 2-only (removed in py3)
            logging.exception(e.message, exc_info=True)
            sys.exit()

    if args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        # NOTE(review): 'data' is unbound here if <method> was neither
        # 'lattice' nor 'stream' — docopt does not restrict its value
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)
|