diff --git a/tools/camelot b/tools/camelot deleted file mode 100755 index 61a687f..0000000 --- a/tools/camelot +++ /dev/null @@ -1,508 +0,0 @@ -#!/usr/bin/env python2 -from __future__ import print_function -import os -import csv -import sys -import glob -import time -import zipfile -import warnings -import cStringIO - -import numpy as np -from docopt import docopt -from collections import Counter -import matplotlib.pyplot as plt -from PyPDF2 import PdfFileReader - -from camelot.pdf import Pdf -from camelot.lattice import Lattice -from camelot.stream import Stream -from camelot import utils - - -doc = """ -Camelot: PDF parsing made simpler! - -usage: - camelot [options] [...] - -options: - -h, --help Show this screen. - -v, --version Show version. - -p, --pages Comma-separated list of page numbers. - Example: -p 1,3-6,10 [default: 1] - -P, --parallel Parallelize the parsing process. - -f, --format Output format. (csv,tsv,zip,html,json,xlsx) [default: csv] - -l, --log Log to file. - -o, --output Output directory. - -M, --cmargin Char margin. Chars closer than cmargin are - grouped together to form a word. [default: 1.0] - -L, --lmargin Line margin. Lines closer than lmargin are - grouped together to form a textbox. [default: 0.5] - -W, --wmargin Word margin. Insert blank spaces between chars - if distance between words is greater than word - margin. [default: 0.1] - -J, --split_text Split text lines if they span across multiple cells. - -K, --flag_size Flag substring if its size differs from the whole string. - Useful for super and subscripts. - -X, --print-stats List stats on the parsing process. - -Y, --save-stats Save stats to a file. - -Z, --plot Plot distributions. (page,all,rc) - -camelot methods: - lattice Looks for lines between data. - stream Looks for spaces between data. - -See 'camelot -h' for more information on a specific method. -""" - -lattice_doc = """ -Lattice method looks for lines between text to form a table. - -usage: - camelot lattice [-t ...] [-F ...] [-m ...] - [-j ...] [options] [--] - -options: - -t, --tarea Specific table areas to analyze. - -F, --fill Fill data in horizontal and/or vertical spanning - cells. Example: -F h, -F v, -F hv - -m, --mtol Tolerance to account for when merging lines - which are very close. [default: 2] - -j, --jtol Tolerance to account for when matching line endings - with intersections. [default: 2] - -b, --blocksize See adaptive threshold doc. [default: 15] - -C, --constant See adaptive threshold doc. [default: -2] - -s, --scale Scaling factor. Large scaling factor leads to - smaller lines being detected. [default: 15] - -I, --iterations Number of iterations for dilation. [default: 0] - -i, --invert Invert pdf image to make sure that lines are - in foreground. - -T, --shift_text Specify where the text in a spanning cell - should flow, order-sensitive. [default: lt] - -d, --debug Debug by visualizing pdf geometry. - (contour,line,joint,table) Example: -d table -""" - -stream_doc = """ -Stream method looks for whitespaces between text to form a table. - -usage: - camelot stream [-t ...] [-c ...] [-m ...] - [-y ...] [options] [--] - -options: - -t, --tarea Specific table areas to analyze. - -c, --columns Comma-separated list of column x-coordinates. - Example: -c 10.1,20.2,30.3 - -m, --mtol Tolerance to account for when merging columns - together. [default: 0] - -y, --ytol Tolerance to account for when grouping rows - together. [default: 2] - -d, --debug Debug by visualizing textboxes. -""" - - -def plot_table_barchart(r, c, p, pno, tno): - row_idx = [i + 1 for i, row in enumerate(r)] - col_idx = [i + 1 for i, col in enumerate(c)] - r_index = np.arange(len(r)) - c_index = np.arange(len(c)) - width = 0.7 - - plt.figure(figsize=(8, 6)) - plt.subplot(2, 1, 1) - plt.title('Percentage of empty cells in table: {0:.2f}'.format(p)) - plt.xlabel('row index') - plt.ylabel('number of non-empty cells in row') - plt.bar(r_index, r) - plt.xticks(r_index + width * 0.5, row_idx) - plt.ylim(0, len(c)) - - plt.subplot(2, 1, 2) - plt.xlabel('column index') - plt.ylabel('number of non-empty cells in column') - plt.bar(c_index, c) - plt.xticks(c_index + width * 0.5, col_idx) - plt.ylim(0, len(r)) - plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300) - - -def plot_all_barchart(data, output): - r_empty_cells = [] - for page_number in data.keys(): - page = data[page_number] - for table_number in page.keys(): - table = page[table_number] - r_empty_cells.extend([r / float(table['ncols']) for r in table['r_nempty_cells']]) - c = Counter(r_empty_cells) - if 0.0 not in c: - c.update({0.0: 0}) - if 1.0 not in c: - c.update({1.0: 0}) - - plt.figure(figsize=(8, 6)) - plt.xlabel('percentage of non-empty cells in a row') - plt.ylabel('percentage of rows processed') - row_p = [count / float(sum(c.values())) for count in c.values()] - plt.bar(c.keys(), row_p, align='center', width=0.05) - plt.ylim(0, 1.0) - plt.savefig(''.join([output, '_all.png']), dpi=300) - - -def plot_rc_piechart(data, output): - from matplotlib import cm - - tables = 0 - rows, cols = [], [] - for page_number in data.keys(): - page = data[page_number] - for table_number in page.keys(): - table = page[table_number] - tables += 1 - rows.append(table['nrows']) - cols.append(table['ncols']) - - r = Counter(rows) - c = Counter(cols) - - plt.figure(figsize=(8, 6)) - cs1 = cm.Set1(np.arange(len(r)) / float(len(r))) - ax1 = plt.subplot(211, aspect='equal') - ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90) - ax1.set_title('row distribution across tables') - - cs2 = cm.Set1(np.arange(len(c)) / float(len(c))) - ax2 = plt.subplot(212, aspect='equal') - ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90) - ax2.set_title('column distribution across tables') - plt.savefig(''.join([output, '_rc.png']), dpi=300) - - -def print_stats(data, p_time): - from operator import itemgetter - from itertools import groupby - - scores = [] - continuous_tables = [] - total_tables = 0 - for page_number in data.keys(): - page = data[page_number] - total_tables += len(page.keys()) - for table_number in page.keys(): - table = page[table_number] - continuous_tables.append((page_number, table_number, table['ncols'])) - scores.append(table['score']) - avg_score = np.mean(scores) - - ct_pages = [] - header_string = "" - if len(continuous_tables) > 1: - tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:]))) - for k, g in groupby(tables, key=itemgetter(2)): - g = list(g) - tables_same_ncols = set([int(t[0][5:]) for t in g]) - tables_same_ncols = sorted(list(tables_same_ncols)) - for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x): - G = list(G) - ct_pages.append((str(G[0][1]), str(G[-1][1]))) - - result_headers = [] - for ct in ct_pages: - header_idx = {} - possible_headers = [] - ncols = 0 - for page_number in range(int(ct[0]), int(ct[1]) + 1): - page = data['page-{0}'.format(page_number)] - for table_number in page.keys(): - table = page[table_number] - ncols = table['ncols'] - for i, row in enumerate(table['data']): - try: - header_idx[tuple(row)].append(i) - except KeyError: - header_idx[tuple(row)] = [i] - possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10] - possible_headers = filter(lambda z: len(z) == ncols, - [filter(lambda x: x != '', p_h) for p_h in possible_headers]) - modes = [] - for p_h in possible_headers: - try: - modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count))) - except KeyError: - pass - header = modes[modes.index(min(modes, key=lambda x: x[1]))][0] - result_headers.append(header) - - header_string = "Multi-page table headers*:\n" - header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format( - '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip( - ct_pages, result_headers)])]) - - avg_time = "Time taken per page: {0:.2f} seconds\n".format( - p_time / float(len(data))) if len(data) not in [0, 1] else "" - equal_ncols = "\nMulti-page tables on*: {0}\n".format( - ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) not in [0, 1] else "" - stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols] - stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n" - "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats)) - - print(''.join([stat_string, header_string])) - - -def convert_to_html(table): - html = '' - html = ''.join([html, '\n']) - for row in table: - html = ''.join([html, ' \n']) - for data in row: - html = ''.join([html, ' \n']) - html = ''.join([html, ' \n']) - html = ''.join([html, '
', data, '
\n']) - return html - - -def write_to_disk(data, f='csv', output=None, filename=None): - # raise something if filename and/or output are None - fname = os.path.basename(filename) - froot, __ = os.path.splitext(fname) - if f in ['csv', 'tsv']: - delimiter = ',' if f == 'csv' else '\t' - for page_number in sorted(data.keys()): - if data[page_number] is not None: - for table_number in sorted(data[page_number].keys()): - dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f) - with open(os.path.join(output, dsvname), 'w') as outfile: - writer = csv.writer( - outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL) - for row in data[page_number][table_number]['data']: - writer.writerow(row) - elif f == 'zip': - csv_zip = os.path.join(output, '{0}.zip'.format(froot)) - with zipfile.ZipFile(csv_zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) \ - as zfile: - for page_number in sorted(data.keys()): - if data[page_number] is not None: - for table_number in sorted(data[page_number].keys()): - csvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), 'csv') - outfile = cStringIO.StringIO() - writer = csv.writer( - outfile, delimiter=',', quoting=csv.QUOTE_ALL) - for row in data[page_number][table_number]['data']: - writer.writerow(row) - zfile.writestr(csvname, outfile.getvalue()) - outfile.close() - elif f == 'html': - htmlname = '{0}.html'.format(froot) - for page_number in sorted(data.keys()): - for table_number in sorted(data[page_number].keys()): - with open(os.path.join(output, htmlname), 'a') as htmlfile: - htmlfile.write(convert_to_html(data[page_number][table_number]['data'])) - elif f == 'json': - import json - with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \ - as jsonfile: - json.dump(data, jsonfile) - elif f == 'xlsx': - try: - from pyexcel_xlsx import save_data - from collections import OrderedDict - xlsx_data = OrderedDict() - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])): - sheet_name = ''.join([page_number, '_', table_number]) - xlsx_data.update({sheet_name: - [row for row in data[page_number][table_number]['data']]}) - save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data) - except ImportError: - print("link to install docs") - - -if __name__ == '__main__': - start_time = time.time() - - args = docopt(doc, version='0.1', options_first=True) - argv = [args['']] + args[''] - if args[''] == 'lattice': - args.update(docopt(lattice_doc, argv=argv)) - elif args[''] == 'stream': - args.update(docopt(stream_doc, argv=argv)) - - filename = args[''] - filedir = os.path.dirname(args['']) - logname, __ = os.path.splitext(filename) - logname = ''.join([logname, '.log']) - scorename, __ = os.path.splitext(filename) - scorename = ''.join([scorename, '_info.csv']) - pngname, __ = os.path.splitext(filename) - - FORMAT = '%(asctime)s - %(levelname)s - %(message)s' - if args['--log'] is not None: - logger = utils.setup_logging(args['--log']) - else: - logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log')) - - p = [] - if args['--pages'] == '1': - p.append({'start': 1, 'end': 1}) - else: - infile = PdfFileReader(open(filename, 'rb'), strict=False) - if args['--pages'] == 'all': - p.append({'start': 1, 'end': infile.getNumPages()}) - else: - for r in args['--pages'].split(','): - if '-' in r: - a, b = r.split('-') - if b == 'end': - b = infile.getNumPages() - p.append({'start': int(a), 'end': int(b)}) - else: - p.append({'start': int(r), 'end': int(r)}) - - logger.info('Applying {0} method on {1}'.format(args[''], - os.path.basename(filename))) - margins = (float(args['--cmargin']), float(args['--lmargin']), - float(args['--wmargin'])) - if args[''] == 'lattice': - try: - kwargs = { - 'table_area': args['--tarea'] if args['--tarea'] else None, - 'fill': args['--fill'] if args['--fill'] else None, - 'mtol': [int(m) for m in args['--mtol']], - 'jtol': [int(j) for j in args['--jtol']], - 'blocksize': int(args['--blocksize']), - 'threshold_constant': float(args['--constant']), - 'scale': int(args['--scale']), - 'iterations': int(args['--iterations']), - 'invert': args['--invert'], - 'margins': margins, - 'split_text': args['--split_text'], - 'flag_size': args['--flag_size'], - 'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'], - 'debug': args['--debug'] - } - manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True, - parallel=args['--parallel']) - data = manager.extract() - - processing_time = time.time() - start_time - logger.info("Finished processing in " + str(processing_time) + " seconds") - - if args['--plot']: - if args['--output']: - pngname = os.path.join(args['--output'], os.path.basename(pngname)) - plot_type = args['--plot'].split(',') - if 'page' in plot_type: - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - plot_table_barchart(table['r_nempty_cells'], - table['c_nempty_cells'], - table['empty_p'], - page_number, - table_number) - - if 'all' in plot_type: - plot_all_barchart(data, pngname) - - if 'rc' in plot_type: - plot_rc_piechart(data, pngname) - - if args['--print-stats']: - print_stats(data, processing_time) - - if args['--save-stats']: - if args['--output']: - scorename = os.path.join(args['--output'], os.path.basename(scorename)) - with open(scorename, 'w') as score_file: - score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n') - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( - ''.join([page_number, '_', table_number]), - table['nrows'], - table['ncols'], - table['empty_p'], - table['line_p'], - table['text_p'], - table['score'])) - if args['--debug']: - manager.debug_plot() - except Exception as e: - logger.exception(e.message, exc_info=True) - sys.exit() - elif args[''] == 'stream': - try: - kwargs = { - 'table_area': args['--tarea'] if args['--tarea'] else None, - 'columns': args['--columns'] if args['--columns'] else None, - 'ytol': [int(y) for y in args['--ytol']], - 'mtol': [int(m) for m in args['--mtol']], - 'margins': margins, - 'split_text': args['--split_text'], - 'flag_size': args['--flag_size'], - 'debug': args['--debug'] - } - manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True, - parallel=args['--parallel']) - data = manager.extract() - - processing_time = time.time() - start_time - logger.info("Finished processing in " + str(processing_time) + " seconds") - - if args['--plot']: - if args['--output']: - pngname = os.path.join(args['--output'], os.path.basename(pngname)) - plot_type = args['--plot'].split(',') - if 'page' in plot_type: - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - plot_table_barchart(table['r_nempty_cells'], - table['c_nempty_cells'], - table['empty_p'], - page_number, - table_number) - - if 'all' in plot_type: - plot_all_barchart(data, pngname) - - if 'rc' in plot_type: - plot_rc_piechart(data, pngname) - - if args['--print-stats']: - print_stats(data, processing_time) - - if args['--save-stats']: - if args['--output']: - scorename = os.path.join(args['--output'], os.path.basename(scorename)) - with open(scorename, 'w') as score_file: - score_file.write('table,nrows,ncols,empty_p,,score\n') - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - score_file.write('{0},{1},{2},{3},{4}\n'.format( - ''.join([page_number, '_', table_number]), - table['nrows'], - table['ncols'], - table['empty_p'], - table['score'])) - - if args['--debug']: - manager.debug_plot() - except Exception as e: - logger.exception(e.message, exc_info=True) - sys.exit() - - if args.get('--debug') is not None and args['--debug']: - print("See 'camelot -h' for various parameters you can tweak.") - else: - output = filedir if args['--output'] is None else args['--output'] - write_to_disk(data, f=args['--format'], - output=output, filename=filename)