Remove old CLI code
parent
329d5d4dc6
commit
9d2708171b
508
tools/camelot
508
tools/camelot
|
|
@ -1,508 +0,0 @@
|
|||
#!/usr/bin/env python2
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import csv
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import zipfile
|
||||
import warnings
|
||||
import cStringIO
|
||||
|
||||
import numpy as np
|
||||
from docopt import docopt
|
||||
from collections import Counter
|
||||
import matplotlib.pyplot as plt
|
||||
from PyPDF2 import PdfFileReader
|
||||
|
||||
from camelot.pdf import Pdf
|
||||
from camelot.lattice import Lattice
|
||||
from camelot.stream import Stream
|
||||
from camelot import utils
|
||||
|
||||
|
||||
# docopt usage string for the top-level `camelot` command.  Column alignment
# matters: docopt needs two or more spaces between an option and its help
# text, and reads defaults from the `[default: ...]` markers.
doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10 [default: 1]
 -P, --parallel            Parallelize the parsing process.
 -f, --format <format>     Output format. (csv,tsv,zip,html,json,xlsx) [default: csv]
 -l, --log <logfile>       Log to file.
 -o, --output <directory>  Output directory.
 -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
                           grouped together to form a word. [default: 1.0]
 -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
                           grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                           if distance between words is greater than word
                           margin. [default: 0.1]
 -J, --split_text          Split text lines if they span across multiple cells.
 -K, --flag_size           Flag substring if its size differs from the whole string.
                           Useful for super and subscripts.
 -X, --print-stats         List stats on the parsing process.
 -Y, --save-stats          Save stats to a file.
 -Z, --plot <dist>         Plot distributions. (page,all,rc)

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""
|
||||
|
||||
# docopt usage string for the `lattice` sub-command.  Two-or-more spaces
# between option and description are required for docopt to parse it.
lattice_doc = """
Lattice method looks for lines between text to form a table.

usage:
 camelot lattice [-t <tarea>...] [-F <fill>...] [-m <mtol>...]
                 [-j <jtol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>            Specific table areas to analyze.
 -F, --fill <fill>              Fill data in horizontal and/or vertical spanning
                                cells. Example: -F h, -F v, -F hv
 -m, --mtol <mtol>              Tolerance to account for when merging lines
                                which are very close. [default: 2]
 -j, --jtol <jtol>              Tolerance to account for when matching line endings
                                with intersections. [default: 2]
 -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
 -C, --constant <constant>      See adaptive threshold doc. [default: -2]
 -s, --scale <scale>            Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
 -I, --iterations <iterations>  Number of iterations for dilation. [default: 0]
 -i, --invert                   Invert pdf image to make sure that lines are
                                in foreground.
 -T, --shift_text <shift_text>  Specify where the text in a spanning cell
                                should flow, order-sensitive. [default: lt]
 -d, --debug <debug>            Debug by visualizing pdf geometry.
                                (contour,line,joint,table) Example: -d table
"""
|
||||
|
||||
# docopt usage string for the `stream` sub-command.  Two-or-more spaces
# between option and description are required for docopt to parse it.
stream_doc = """
Stream method looks for whitespaces between text to form a table.

usage:
 camelot stream [-t <tarea>...] [-c <columns>...] [-m <mtol>...]
                [-y <ytol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>      Specific table areas to analyze.
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -m, --mtol <mtol>        Tolerance to account for when merging columns
                          together. [default: 0]
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -d, --debug              Debug by visualizing textboxes.
"""
|
||||
|
||||
|
||||
def plot_table_barchart(r, c, p, pno, tno):
    """Save a two-panel bar chart of cell occupancy for one table.

    Top panel: non-empty cell count per row; bottom panel: per column.
    The figure is written to '<pno>_<tno>.png' in the current directory.

    Parameters
    ----------
    r : sequence of int -- non-empty cell counts per row.
    c : sequence of int -- non-empty cell counts per column.
    p : float -- percentage of empty cells in the table (chart title).
    pno, tno : str -- page and table identifiers used in the file name.
    """
    row_labels = [idx + 1 for idx in range(len(r))]
    col_labels = [idx + 1 for idx in range(len(c))]
    row_pos = np.arange(len(r))
    col_pos = np.arange(len(c))
    bar_width = 0.7

    plt.figure(figsize=(8, 6))

    # Row occupancy on top; y-axis capped at the column count, which is the
    # maximum possible number of non-empty cells in a row.
    plt.subplot(2, 1, 1)
    plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
    plt.xlabel('row index')
    plt.ylabel('number of non-empty cells in row')
    plt.bar(row_pos, r)
    plt.xticks(row_pos + bar_width * 0.5, row_labels)
    plt.ylim(0, len(c))

    # Column occupancy below; y-axis capped at the row count.
    plt.subplot(2, 1, 2)
    plt.xlabel('column index')
    plt.ylabel('number of non-empty cells in column')
    plt.bar(col_pos, c)
    plt.xticks(col_pos + bar_width * 0.5, col_labels)
    plt.ylim(0, len(r))
    plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
|
||||
|
||||
|
||||
def plot_all_barchart(data, output):
    """Save a histogram of row occupancy across every parsed table.

    For each row of each table the fraction of non-empty cells is computed
    (row count / ncols); the distribution of those fractions is plotted and
    written to '<output>_all.png'.
    """
    occupancy_fractions = []
    for page in data.values():
        for table in page.values():
            ncols = float(table['ncols'])
            occupancy_fractions.extend(
                nempty / ncols for nempty in table['r_nempty_cells'])

    counts = Counter(occupancy_fractions)
    # Pin both endpoints so the x-axis always spans [0, 1].
    for endpoint in (0.0, 1.0):
        if endpoint not in counts:
            counts.update({endpoint: 0})

    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    total_rows = float(sum(counts.values()))
    proportions = [n / total_rows for n in counts.values()]
    plt.bar(counts.keys(), proportions, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)
|
||||
|
||||
|
||||
def plot_rc_piechart(data, output):
    """Save pie charts of the row-count and column-count distributions
    across all parsed tables; the figure goes to '<output>_rc.png'.

    Fix: dropped the old `tables` counter, which was incremented but never
    read (dead code).
    """
    from matplotlib import cm

    rows, cols = [], []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            rows.append(table['nrows'])
            cols.append(table['ncols'])

    r = Counter(rows)
    c = Counter(cols)

    plt.figure(figsize=(8, 6))
    # One Set1 color per distinct row count, evenly spaced over the colormap.
    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
    ax1 = plt.subplot(211, aspect='equal')
    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
    ax1.set_title('row distribution across tables')

    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
    ax2 = plt.subplot(212, aspect='equal')
    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
    ax2.set_title('column distribution across tables')
    plt.savefig(''.join([output, '_rc.png']), dpi=300)
|
||||
|
||||
|
||||
def print_stats(data, p_time):
    """Print a parsing summary: page/table counts, timing, average score,
    and a best-effort guess at headers of tables spanning multiple pages.

    data: {'page-N': {'table-M': {'ncols', 'score', 'data', ...}}}
    p_time: total processing time in seconds.

    NOTE(review): uses Python-2-only syntax (tuple-unpacking lambda) and
    relies on Py2 `filter` returning a list.
    """
    from operator import itemgetter
    from itertools import groupby

    # Collect (page, table, ncols) triples and per-table scores.
    scores = []
    continuous_tables = []
    total_tables = 0
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number, table['ncols']))
            scores.append(table['score'])
    avg_score = np.mean(scores)

    ct_pages = []
    header_string = ""
    if len(continuous_tables) > 1:
        # Sort by numeric (page, table) index; keys look like 'page-N' /
        # 'table-M', so slicing [5:] / [6:] strips the prefixes.
        tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
        # Group tables that share a column count, then find runs of
        # consecutive page numbers within each group (classic
        # "index minus value is constant" trick for consecutive ranges).
        for k, g in groupby(tables, key=itemgetter(2)):
            g = list(g)
            tables_same_ncols = set([int(t[0][5:]) for t in g])
            tables_same_ncols = sorted(list(tables_same_ncols))
            for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
                G = list(G)
                # (first page, last page) of one contiguous run.
                ct_pages.append((str(G[0][1]), str(G[-1][1])))

        # For each contiguous page range, guess the shared header row:
        # count how often each exact row repeats across the range, keep the
        # ten most frequent, drop rows that don't fill every column after
        # removing blanks, then pick the candidate whose most common row
        # index is smallest (i.e. appears nearest the top of the tables).
        result_headers = []
        for ct in ct_pages:
            header_idx = {}
            possible_headers = []
            ncols = 0
            for page_number in range(int(ct[0]), int(ct[1]) + 1):
                page = data['page-{0}'.format(page_number)]
                for table_number in page.keys():
                    table = page[table_number]
                    ncols = table['ncols']
                    for i, row in enumerate(table['data']):
                        try:
                            header_idx[tuple(row)].append(i)
                        except KeyError:
                            header_idx[tuple(row)] = [i]
            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
            possible_headers = filter(lambda z: len(z) == ncols,
                                      [filter(lambda x: x != '', p_h) for p_h in possible_headers])
            modes = []
            for p_h in possible_headers:
                try:
                    # mode of the row indices where this candidate appeared
                    modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
                except KeyError:
                    # candidate was altered by the blank-stripping filter
                    # above, so it no longer keys into header_idx
                    pass
            header = modes[modes.index(min(modes, key=lambda x: x[1]))][0]
            result_headers.append(header)

        header_string = "Multi-page table headers*:\n"
        header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
            '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
            ct_pages, result_headers)])])

    # Per-page timing and multi-page summary only make sense for >1 page.
    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
        p_time / float(len(data))) if len(data) not in [0, 1] else ""
    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) not in [0, 1] else ""
    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))

    print(''.join([stat_string, header_string]))
|
||||
|
||||
|
||||
def convert_to_html(table):
    """Render a table (iterable of rows, each an iterable of cell strings)
    as an HTML <table> fragment and return it as a string.

    Fix: the original rebuilt the whole string with ``''.join`` on every
    append, which is quadratic; accumulate parts in a list and join once.

    NOTE(review): cell text is interpolated verbatim -- there is no HTML
    escaping.  Acceptable for trusted PDF text, but confirm before feeding
    untrusted input through this path.
    """
    parts = ['<table border="1">\n']
    for row in table:
        parts.append(' <tr>\n')
        for data in row:
            parts.extend([' <td>', data, '</td>\n'])
        parts.append(' </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)
|
||||
|
||||
|
||||
def write_to_disk(data, f='csv', output=None, filename=None):
    """Write parsed table data to disk in the requested format.

    Parameters
    ----------
    data : dict
        {'page-N': {'table-M': {'data': rows, ...}} or None}; a None page
        means no tables were found on it.
    f : str
        One of 'csv', 'tsv', 'zip', 'html', 'json', 'xlsx'.
    output : str
        Directory to write into.
    filename : str
        Source pdf path; its basename (sans extension) seeds output names.

    Fix: the html branch now skips pages whose value is None, consistent
    with the csv/tsv and zip branches (previously it raised AttributeError
    on such pages).
    """
    # raise something if filename and/or output are None
    fname = os.path.basename(filename)
    froot, __ = os.path.splitext(fname)
    if f in ['csv', 'tsv']:
        delimiter = ',' if f == 'csv' else '\t'
        # One delimited file per table, named '<page>_<table>.<ext>'.
        for page_number in sorted(data.keys()):
            if data[page_number] is not None:
                for table_number in sorted(data[page_number].keys()):
                    dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
                    with open(os.path.join(output, dsvname), 'w') as outfile:
                        writer = csv.writer(
                            outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                        for row in data[page_number][table_number]['data']:
                            writer.writerow(row)
    elif f == 'zip':
        # All tables as in-memory CSVs bundled into one zip archive.
        csv_zip = os.path.join(output, '{0}.zip'.format(froot))
        with zipfile.ZipFile(csv_zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) \
                as zfile:
            for page_number in sorted(data.keys()):
                if data[page_number] is not None:
                    for table_number in sorted(data[page_number].keys()):
                        csvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), 'csv')
                        outfile = cStringIO.StringIO()
                        writer = csv.writer(
                            outfile, delimiter=',', quoting=csv.QUOTE_ALL)
                        for row in data[page_number][table_number]['data']:
                            writer.writerow(row)
                        zfile.writestr(csvname, outfile.getvalue())
                        outfile.close()
    elif f == 'html':
        # All tables appended into a single html file.
        htmlname = '{0}.html'.format(froot)
        for page_number in sorted(data.keys()):
            if data[page_number] is not None:
                for table_number in sorted(data[page_number].keys()):
                    with open(os.path.join(output, htmlname), 'a') as htmlfile:
                        htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
        try:
            from pyexcel_xlsx import save_data
            from collections import OrderedDict
            # One sheet per table, ordered numerically by page then table.
            xlsx_data = OrderedDict()
            for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
                    sheet_name = ''.join([page_number, '_', table_number])
                    xlsx_data.update({sheet_name:
                        [row for row in data[page_number][table_number]['data']]})
            save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
        except ImportError:
            # pyexcel_xlsx is an optional dependency; best-effort message.
            print("link to install docs")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: parse arguments, run the chosen extraction method
    # (lattice or stream), then optionally plot, print/save stats, and
    # write results to disk.
    start_time = time.time()

    # Top-level parse first (options_first so the sub-command's own flags
    # pass through), then merge in the sub-command's parsed options.
    args = docopt(doc, version='0.1', options_first=True)
    argv = [args['<method>']] + args['<args>']
    if args['<method>'] == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif args['<method>'] == 'stream':
        args.update(docopt(stream_doc, argv=argv))

    # Derive output file names from the input pdf path.
    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
    logname, __ = os.path.splitext(filename)
    logname = ''.join([logname, '.log'])
    scorename, __ = os.path.splitext(filename)
    scorename = ''.join([scorename, '_info.csv'])
    pngname, __ = os.path.splitext(filename)

    # NOTE(review): FORMAT is defined but never passed to setup_logging --
    # it appears to be unused here; confirm whether utils.setup_logging
    # applies its own format.
    FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
    if args['--log'] is not None:
        logger = utils.setup_logging(args['--log'])
    else:
        logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log'))

    # Build the list of page ranges to process.  The pdf is only opened
    # when something other than the default single page is requested.
    p = []
    if args['--pages'] == '1':
        p.append({'start': 1, 'end': 1})
    else:
        infile = PdfFileReader(open(filename, 'rb'), strict=False)
        if args['--pages'] == 'all':
            p.append({'start': 1, 'end': infile.getNumPages()})
        else:
            # e.g. "1,3-6,10" or "2-end"
            for r in args['--pages'].split(','):
                if '-' in r:
                    a, b = r.split('-')
                    if b == 'end':
                        b = infile.getNumPages()
                    p.append({'start': int(a), 'end': int(b)})
                else:
                    p.append({'start': int(r), 'end': int(r)})

    logger.info('Applying {0} method on {1}'.format(args['<method>'],
                os.path.basename(filename)))
    # (char, line, word) margins shared by both methods.
    margins = (float(args['--cmargin']), float(args['--lmargin']),
               float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            # Translate CLI strings into Lattice constructor kwargs.
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'fill': args['--fill'] if args['--fill'] else None,
                'mtol': [int(m) for m in args['--mtol']],
                'jtol': [int(j) for j in args['--jtol']],
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'scale': int(args['--scale']),
                'iterations': int(args['--iterations']),
                'invert': args['--invert'],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'],
                'debug': args['--debug']
            }
            manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) + " seconds")

            # Optional plots: per-table barcharts, aggregate histogram,
            # and/or row/column pie charts.
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    # keys are 'page-N' / 'table-M'; sort numerically
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            # Save per-table quality metrics to '<file>_info.csv'.
            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            # e.message is Python-2-only; exits without writing output.
            logger.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            # Translate CLI strings into Stream constructor kwargs.
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'columns': args['--columns'] if args['--columns'] else None,
                'ytol': [int(y) for y in args['--ytol']],
                'mtol': [int(m) for m in args['--mtol']],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'debug': args['--debug']
            }
            manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    # NOTE(review): header has a stray empty column
                    # ('empty_p,,score') but only five values are written
                    # per line below -- confirm whether a column was
                    # intentionally dropped for the stream method.
                    score_file.write('table,nrows,ncols,empty_p,,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['score']))

            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()

    # In debug mode nothing is written; otherwise results go to the output
    # directory (defaulting to the pdf's own directory).
    if args.get('--debug') is not None and args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)
|
||||
Loading…
Reference in New Issue