#!/usr/bin/env python2

from __future__ import print_function

import os
import csv
import sys
import glob
import time
import zipfile
import warnings
import cStringIO

import numpy as np
from docopt import docopt
from collections import Counter
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileReader

from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream
from camelot.ocr import OCR
from camelot import utils


doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10 [default: 1]
 -P, --parallel            Parallelize the parsing process.
 -f, --format <format>     Output format. (csv,tsv,zip,html,json,xlsx)
                           [default: csv]
 -l, --log <logfile>       Log to file.
 -o, --output <directory>  Output directory.
 -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are grouped
                           together to form a word. [default: 1.0]
 -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are grouped
                           together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars if
                           distance between words is greater than word margin.
                           [default: 0.1]
 -J, --split_text          Split text lines if they span across multiple cells.
 -K, --flag_size           Flag substring if its size differs from the whole
                           string. Useful for super and subscripts.
 -X, --print-stats         List stats on the parsing process.
 -Y, --save-stats          Save stats to a file.
 -Z, --plot <plot_type>    Plot distributions. (page,all,rc)

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.
 ocr      Looks for lines in image based pdfs.

See 'camelot <method> -h' for more information on a specific method.
"""

lattice_doc = """
Lattice method looks for lines between text to form a table.

usage:
 camelot lattice [-t <table_area>...] [-F <fill>...] [-H <header>...]
                 [-m <mtol>...] [options] [--] <file>

options:
 -t, --tarea <table_area>   Specific table areas to analyze.
 -F, --fill <fill>          Fill data in horizontal and/or vertical spanning
                            cells. Example: -F h, -F v, -F hv
 -H, --header <header>      Specify header for each table.
 -m, --mtol <mtol>          Tolerance to account for when merging lines which
                            are very close. [default: 2]
 -b, --blocksize <size>     See adaptive threshold doc. [default: 15]
 -c, --constant <constant>  See adaptive threshold doc. [default: -2]
 -s, --scale <scale>        Scaling factor. Large scaling factor leads to
                            smaller lines being detected. [default: 15]
 -i, --invert               Invert pdf image to make sure that lines are in
                            foreground.
 -T, --shift_text <dir>     Specify where the text in a spanning cell should
                            flow, order-sensitive. [default: lt]
 -d, --debug <plot_type>    Debug by visualizing pdf geometry.
                            (contour,line,joint,table) Example: -d table
"""

stream_doc = """
Stream method looks for whitespaces between text to form a table.

usage:
 camelot stream [-t <table_area>...] [-c <columns>...] [-H <header>...]
                [-y <ytol>...] [-m <mtol>...] [options] [--] <file>

options:
 -t, --tarea <table_area>  Specific table areas to analyze.
 -c, --columns <columns>   Comma-separated list of column x-coordinates.
                           Example: -c 10.1,20.2,30.3
 -H, --header <header>     Specify header for each table.
 -y, --ytol <ytol>         Tolerance to account for when grouping rows
                           together. [default: 2]
 -m, --mtol <mtol>         Tolerance to account for when merging columns
                           together. [default: 0]
 -d, --debug <plot_type>   Debug by visualizing textboxes.
"""

ocr_doc = """
OCR method looks for lines in image based pdfs.

usage:
 camelot ocr [-t <table_area>] [-m <mtol>] [options] [--] <file>

options:
 -t, --tarea <table_area>   Specific table areas to analyze.
 -m, --mtol <mtol>          Tolerance to account for when merging lines which
                            are very close. [default: 2]
 -b, --blocksize <size>     See adaptive threshold doc. [default: 15]
 -c, --constant <constant>  See adaptive threshold doc. [default: -2]
 -D, --dpi <dpi>            Dots per inch, specify image quality to be used
                            for OCR. [default: 300]
 -l, --lang <lang>          Specify language to be used for OCR.
                            [default: eng]
 -s, --scale <scale>        Scaling factor. Large scaling factor leads to
                            smaller lines being detected. [default: 15]
 -d, --debug <plot_type>    Debug by visualizing pdf geometry.
                            (contour,line,joint,table) Example: -d table
"""


def plot_table_barchart(r, c, p, pno, tno):
    row_idx = [i + 1 for i, row in enumerate(r)]
    col_idx = [i + 1 for i, col in enumerate(c)]
    r_index = np.arange(len(r))
    c_index = np.arange(len(c))
    width = 0.7
    plt.figure(figsize=(8, 6))
    plt.subplot(2, 1, 1)
    plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
    plt.xlabel('row index')
    plt.ylabel('number of non-empty cells in row')
    plt.bar(r_index, r)
    plt.xticks(r_index + width * 0.5, row_idx)
    plt.ylim(0, len(c))
    plt.subplot(2, 1, 2)
    plt.xlabel('column index')
    plt.ylabel('number of non-empty cells in column')
    plt.bar(c_index, c)
    plt.xticks(c_index + width * 0.5, col_idx)
    plt.ylim(0, len(r))
    plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)


def plot_all_barchart(data, output):
    r_empty_cells = []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            r_empty_cells.extend([r / float(table['ncols'])
                                  for r in table['r_nempty_cells']])
    c = Counter(r_empty_cells)
    if 0.0 not in c:
        c.update({0.0: 0})
    if 1.0 not in c:
        c.update({1.0: 0})
    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    row_p = [count / float(sum(c.values())) for count in c.values()]
    plt.bar(c.keys(), row_p, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)


def plot_rc_piechart(data, output):
    from matplotlib import cm
    tables = 0
    rows, cols = [], []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            tables += 1
            rows.append(table['nrows'])
            cols.append(table['ncols'])
    r = Counter(rows)
    c = Counter(cols)
    plt.figure(figsize=(8, 6))
    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
    ax1 = plt.subplot(211, aspect='equal')
    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
    ax1.set_title('row distribution across tables')
    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
    ax2 = plt.subplot(212, aspect='equal')
    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
    ax2.set_title('column distribution across tables')
    plt.savefig(''.join([output, '_rc.png']), dpi=300)


def print_stats(data, p_time):
    from operator import itemgetter
    from itertools import groupby
    scores = []
    continuous_tables = []
    total_tables = 0
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number,
                                      table['ncols']))
            scores.append(table['score'])
    avg_score = np.mean(scores)
    ct_pages = []
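    # The block below looks for multi-page tables: tables with the same number
    # of columns on consecutive pages are grouped into runs (ct_pages stores
    # the first and last page of each run). For every run, the most frequent
    # fully populated rows are collected and the one that usually appears
    # closest to the top of a table is reported as the shared header.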
header_string = "" if len(continuous_tables) > 1: tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:]))) for k, g in groupby(tables, key=itemgetter(2)): g = list(g) tables_same_ncols = set([int(t[0][5:]) for t in g]) tables_same_ncols = sorted(list(tables_same_ncols)) for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x): G = list(G) ct_pages.append((str(G[0][1]), str(G[-1][1]))) result_headers = [] for ct in ct_pages: header_idx = {} possible_headers = [] ncols = 0 for page_number in range(int(ct[0]), int(ct[1]) + 1): page = data['page-{0}'.format(page_number)] for table_number in page.keys(): table = page[table_number] ncols = table['ncols'] for i, row in enumerate(table['data']): try: header_idx[tuple(row)].append(i) except KeyError: header_idx[tuple(row)] = [i] possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10] possible_headers = filter(lambda z: len(z) == ncols, [filter(lambda x: x != '', p_h) for p_h in possible_headers]) modes = [] for p_h in possible_headers: try: modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count))) except KeyError: pass header = modes[modes.index(min(modes, key=lambda x: x[1]))][0] result_headers.append(header) header_string = "Multi-page table headers*:\n" header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format( '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip( ct_pages, result_headers)])]) avg_time = "Time taken per page: {0:.2f} seconds\n".format( p_time / float(len(data))) if len(data) not in [0, 1] else "" equal_ncols = "\nMulti-page tables on*: {0}\n".format( ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) not in [0, 1] else "" stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols] stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n" "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats)) print(''.join([stat_string, header_string])) def convert_to_html(table): html = '' html = ''.join([html, '\n']) for row in table: html = ''.join([html, ' \n']) for data in row: html = ''.join([html, ' \n']) html = ''.join([html, ' \n']) html = ''.join([html, '
', data, '
\n']) return html def write_to_disk(data, f='csv', output=None, filename=None): # raise something if filename and/or output are None fname = os.path.basename(filename) froot, __ = os.path.splitext(fname) if f in ['csv', 'tsv']: delimiter = ',' if f == 'csv' else '\t' for page_number in sorted(data.keys()): if data[page_number] is not None: for table_number in sorted(data[page_number].keys()): dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f) with open(os.path.join(output, dsvname), 'w') as outfile: writer = csv.writer( outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL) for row in data[page_number][table_number]['data']: writer.writerow(row) elif f == 'zip': csv_zip = os.path.join(output, '{0}.zip'.format(froot)) with zipfile.ZipFile(csv_zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) \ as zfile: for page_number in sorted(data.keys()): if data[page_number] is not None: for table_number in sorted(data[page_number].keys()): csvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), 'csv') outfile = cStringIO.StringIO() writer = csv.writer( outfile, delimiter=',', quoting=csv.QUOTE_ALL) for row in data[page_number][table_number]['data']: writer.writerow(row) zfile.writestr(csvname, outfile.getvalue()) outfile.close() elif f == 'html': htmlname = '{0}.html'.format(froot) for page_number in sorted(data.keys()): for table_number in sorted(data[page_number].keys()): with open(os.path.join(output, htmlname), 'a') as htmlfile: htmlfile.write(convert_to_html(data[page_number][table_number]['data'])) elif f == 'json': import json with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \ as jsonfile: json.dump(data, jsonfile) elif f == 'xlsx': try: from pyexcel_xlsx import save_data from collections import OrderedDict xlsx_data = OrderedDict() for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])): sheet_name = ''.join([page_number, '_', table_number]) xlsx_data.update({sheet_name: [row for row in data[page_number][table_number]['data']]}) save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data) except ImportError: print("link to install docs") if __name__ == '__main__': start_time = time.time() args = docopt(doc, version='0.1', options_first=True) argv = [args['']] + args[''] if args[''] == 'lattice': args.update(docopt(lattice_doc, argv=argv)) elif args[''] == 'stream': args.update(docopt(stream_doc, argv=argv)) elif args[''] == 'ocr': args.update(docopt(ocr_doc, argv=argv)) filename = args[''] filedir = os.path.dirname(args['']) logname, __ = os.path.splitext(filename) logname = ''.join([logname, '.log']) scorename, __ = os.path.splitext(filename) scorename = ''.join([scorename, '_info.csv']) pngname, __ = os.path.splitext(filename) FORMAT = '%(asctime)s - %(levelname)s - %(message)s' if args['--log'] is not None: logger = utils.setup_logging(args['--log']) else: logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log')) p = [] if args['--pages'] == '1': p.append({'start': 1, 'end': 1}) else: if args['--pages'] == 'all': infile = PdfFileReader(open(filename, 'rb'), strict=False) p.append({'start': 1, 'end': infile.getNumPages()}) else: for r in args['--pages'].split(','): if '-' in r: a, b = r.split('-') p.append({'start': int(a), 'end': int(b)}) else: p.append({'start': int(r), 'end': int(r)}) logger.info('Applying {0} method on {1}'.format(args[''], os.path.basename(filename))) margins = (float(args['--cmargin']), float(args['--lmargin']), 
    if args['<method>'] == 'lattice':
        try:
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'fill': args['--fill'] if args['--fill'] else None,
                'headers': args['--header'] if args['--header'] else None,
                'mtol': [int(m) for m in args['--mtol']],
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'scale': int(args['--scale']),
                'invert': args['--invert'],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'shift_text': list(args['--shift_text'])
                              if args['--shift_text'] else ['l', 't'],
                'debug': args['--debug']
            }
            manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()
            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) +
                        " seconds")
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'],
                                           os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(),
                                              key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(),
                                                   key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number, table_number)
                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            if args['--print-stats']:
                print_stats(data, processing_time)
            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'],
                                             os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write(
                        'table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(),
                                              key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(),
                                                   key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write(
                                '{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                    ''.join([page_number, '_', table_number]),
                                    table['nrows'], table['ncols'],
                                    table['empty_p'], table['line_p'],
                                    table['text_p'], table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'columns': args['--columns'] if args['--columns'] else None,
                'headers': args['--header'] if args['--header'] else None,
                'ytol': [int(y) for y in args['--ytol']],
                'mtol': [int(m) for m in args['--mtol']],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'debug': args['--debug']
            }
            manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()
            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) +
                        " seconds")
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'],
                                           os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(),
                                              key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(),
                                                   key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number, table_number)
                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            if args['--print-stats']:
                print_stats(data, processing_time)
            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'],
                                             os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,score\n')
                    for page_number in sorted(data.keys(),
                                              key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(),
                                                   key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'], table['ncols'],
                                table['empty_p'], table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'ocr':
        try:
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'mtol': [int(m) for m in args['--mtol']],
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'dpi': int(args['--dpi']),
                'lang': args['--lang'],
                'scale': int(args['--scale']),
                'debug': args['--debug']
            }
            manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()
            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) +
                        " seconds")
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'],
                                           os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(),
                                              key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(),
                                                   key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number, table_number)
                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            if args['--print-stats']:
                print_stats(data, processing_time)
            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'],
                                             os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write(
                        'table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(),
                                              key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(),
                                                   key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write(
                                '{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                    ''.join([page_number, '_', table_number]),
                                    table['nrows'], table['ncols'],
                                    table['empty_p'], table['line_p'],
                                    table['text_p'], table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()

    if args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'], output=output,
                      filename=filename)
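
# Usage sketch (comments only, not executed): assuming this script is exposed
# as the `camelot` console entry point and that `sample.pdf` is a local file
# (both are assumptions, not part of the code above), the docopt patterns
# defined above would be driven like:
#
#   camelot -p 1,3-6 -f csv lattice sample.pdf
#   camelot -p all stream -c 10.1,20.2,30.3 sample.pdf
#   camelot ocr -l eng sample.pdf
#
# Global options (-p, -f, -o, ...) go before the method name because the
# top-level docopt call uses options_first=True; method-specific options are
# re-parsed against the per-method docstrings.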