Remove old CLI code
parent
329d5d4dc6
commit
9d2708171b
508
tools/camelot
508
tools/camelot
|
|
@ -1,508 +0,0 @@
|
|||
#!/usr/bin/env python2
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import csv
|
||||
import sys
|
||||
import glob
|
||||
import time
|
||||
import zipfile
|
||||
import warnings
|
||||
import cStringIO
|
||||
|
||||
import numpy as np
|
||||
from docopt import docopt
|
||||
from collections import Counter
|
||||
import matplotlib.pyplot as plt
|
||||
from PyPDF2 import PdfFileReader
|
||||
|
||||
from camelot.pdf import Pdf
|
||||
from camelot.lattice import Lattice
|
||||
from camelot.stream import Stream
|
||||
from camelot import utils
|
||||
|
||||
|
||||
# docopt usage string for the top-level `camelot` command.  Column alignment
# matters: docopt needs two or more spaces between an option and its help
# text, and reads defaults from the `[default: ...]` markers.
doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10 [default: 1]
 -P, --parallel            Parallelize the parsing process.
 -f, --format <format>     Output format. (csv,tsv,zip,html,json,xlsx) [default: csv]
 -l, --log <logfile>       Log to file.
 -o, --output <directory>  Output directory.
 -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
                           grouped together to form a word. [default: 1.0]
 -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
                           grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                           if distance between words is greater than word
                           margin. [default: 0.1]
 -J, --split_text          Split text lines if they span across multiple cells.
 -K, --flag_size           Flag substring if its size differs from the whole string.
                           Useful for super and subscripts.
 -X, --print-stats         List stats on the parsing process.
 -Y, --save-stats          Save stats to a file.
 -Z, --plot <dist>         Plot distributions. (page,all,rc)

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.

See 'camelot <method> -h' for more information on a specific method.
"""
|
||||
|
||||
# docopt usage string for the `lattice` sub-command.  Two-or-more spaces
# between option and description are required for docopt to parse it.
lattice_doc = """
Lattice method looks for lines between text to form a table.

usage:
 camelot lattice [-t <tarea>...] [-F <fill>...] [-m <mtol>...]
                 [-j <jtol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>            Specific table areas to analyze.
 -F, --fill <fill>              Fill data in horizontal and/or vertical spanning
                                cells. Example: -F h, -F v, -F hv
 -m, --mtol <mtol>              Tolerance to account for when merging lines
                                which are very close. [default: 2]
 -j, --jtol <jtol>              Tolerance to account for when matching line endings
                                with intersections. [default: 2]
 -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
 -C, --constant <constant>      See adaptive threshold doc. [default: -2]
 -s, --scale <scale>            Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
 -I, --iterations <iterations>  Number of iterations for dilation. [default: 0]
 -i, --invert                   Invert pdf image to make sure that lines are
                                in foreground.
 -T, --shift_text <shift_text>  Specify where the text in a spanning cell
                                should flow, order-sensitive. [default: lt]
 -d, --debug <debug>            Debug by visualizing pdf geometry.
                                (contour,line,joint,table) Example: -d table
"""
|
||||
|
||||
# docopt usage string for the `stream` sub-command.  Two-or-more spaces
# between option and description are required for docopt to parse it.
stream_doc = """
Stream method looks for whitespaces between text to form a table.

usage:
 camelot stream [-t <tarea>...] [-c <columns>...] [-m <mtol>...]
                [-y <ytol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>      Specific table areas to analyze.
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -m, --mtol <mtol>        Tolerance to account for when merging columns
                          together. [default: 0]
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -d, --debug              Debug by visualizing textboxes.
"""
|
||||
|
||||
|
||||
def plot_table_barchart(r, c, p, pno, tno):
    """Save a two-panel bar chart of cell occupancy for one table.

    Top panel: non-empty cell count per row; bottom panel: per column.
    The figure is written to '<pno>_<tno>.png' in the current directory.

    Parameters
    ----------
    r : sequence of int -- non-empty cell counts per row.
    c : sequence of int -- non-empty cell counts per column.
    p : float -- percentage of empty cells in the table (chart title).
    pno, tno : str -- page and table identifiers used in the file name.
    """
    row_labels = [idx + 1 for idx in range(len(r))]
    col_labels = [idx + 1 for idx in range(len(c))]
    row_pos = np.arange(len(r))
    col_pos = np.arange(len(c))
    bar_width = 0.7

    plt.figure(figsize=(8, 6))

    # Row occupancy on top; y-axis capped at the column count, which is the
    # maximum possible number of non-empty cells in a row.
    plt.subplot(2, 1, 1)
    plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
    plt.xlabel('row index')
    plt.ylabel('number of non-empty cells in row')
    plt.bar(row_pos, r)
    plt.xticks(row_pos + bar_width * 0.5, row_labels)
    plt.ylim(0, len(c))

    # Column occupancy below; y-axis capped at the row count.
    plt.subplot(2, 1, 2)
    plt.xlabel('column index')
    plt.ylabel('number of non-empty cells in column')
    plt.bar(col_pos, c)
    plt.xticks(col_pos + bar_width * 0.5, col_labels)
    plt.ylim(0, len(r))
    plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
|
||||
|
||||
|
||||
def plot_all_barchart(data, output):
    """Save a histogram of row occupancy across every parsed table.

    For each row of each table the fraction of non-empty cells is computed
    (row count / ncols); the distribution of those fractions is plotted and
    written to '<output>_all.png'.
    """
    occupancy_fractions = []
    for page in data.values():
        for table in page.values():
            ncols = float(table['ncols'])
            occupancy_fractions.extend(
                nempty / ncols for nempty in table['r_nempty_cells'])

    counts = Counter(occupancy_fractions)
    # Pin both endpoints so the x-axis always spans [0, 1].
    for endpoint in (0.0, 1.0):
        if endpoint not in counts:
            counts.update({endpoint: 0})

    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    total_rows = float(sum(counts.values()))
    proportions = [n / total_rows for n in counts.values()]
    plt.bar(counts.keys(), proportions, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)
|
||||
|
||||
|
||||
def plot_rc_piechart(data, output):
    """Save pie charts of the row-count and column-count distributions
    across all parsed tables; the figure goes to '<output>_rc.png'.

    Fix: dropped the old `tables` counter, which was incremented but never
    read (dead code).
    """
    from matplotlib import cm

    rows, cols = [], []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            rows.append(table['nrows'])
            cols.append(table['ncols'])

    r = Counter(rows)
    c = Counter(cols)

    plt.figure(figsize=(8, 6))
    # One Set1 color per distinct row count, evenly spaced over the colormap.
    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
    ax1 = plt.subplot(211, aspect='equal')
    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
    ax1.set_title('row distribution across tables')

    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
    ax2 = plt.subplot(212, aspect='equal')
    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
    ax2.set_title('column distribution across tables')
    plt.savefig(''.join([output, '_rc.png']), dpi=300)
|
||||
|
||||
|
||||
def print_stats(data, p_time):
    """Print a parsing summary: page/table counts, timing, average score,
    and a best-effort guess at headers of tables spanning multiple pages.

    data: {'page-N': {'table-M': {'ncols', 'score', 'data', ...}}}
    p_time: total processing time in seconds.

    NOTE(review): uses Python-2-only syntax (tuple-unpacking lambda) and
    relies on Py2 `filter` returning a list.
    """
    from operator import itemgetter
    from itertools import groupby

    # Collect (page, table, ncols) triples and per-table scores.
    scores = []
    continuous_tables = []
    total_tables = 0
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number, table['ncols']))
            scores.append(table['score'])
    avg_score = np.mean(scores)

    ct_pages = []
    header_string = ""
    if len(continuous_tables) > 1:
        # Sort by numeric (page, table) index; keys look like 'page-N' /
        # 'table-M', so slicing [5:] / [6:] strips the prefixes.
        tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
        # Group tables that share a column count, then find runs of
        # consecutive page numbers within each group (classic
        # "index minus value is constant" trick for consecutive ranges).
        for k, g in groupby(tables, key=itemgetter(2)):
            g = list(g)
            tables_same_ncols = set([int(t[0][5:]) for t in g])
            tables_same_ncols = sorted(list(tables_same_ncols))
            for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
                G = list(G)
                # (first page, last page) of one contiguous run.
                ct_pages.append((str(G[0][1]), str(G[-1][1])))

        # For each contiguous page range, guess the shared header row:
        # count how often each exact row repeats across the range, keep the
        # ten most frequent, drop rows that don't fill every column after
        # removing blanks, then pick the candidate whose most common row
        # index is smallest (i.e. appears nearest the top of the tables).
        result_headers = []
        for ct in ct_pages:
            header_idx = {}
            possible_headers = []
            ncols = 0
            for page_number in range(int(ct[0]), int(ct[1]) + 1):
                page = data['page-{0}'.format(page_number)]
                for table_number in page.keys():
                    table = page[table_number]
                    ncols = table['ncols']
                    for i, row in enumerate(table['data']):
                        try:
                            header_idx[tuple(row)].append(i)
                        except KeyError:
                            header_idx[tuple(row)] = [i]
            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
            possible_headers = filter(lambda z: len(z) == ncols,
                                      [filter(lambda x: x != '', p_h) for p_h in possible_headers])
            modes = []
            for p_h in possible_headers:
                try:
                    # mode of the row indices where this candidate appeared
                    modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
                except KeyError:
                    # candidate was altered by the blank-stripping filter
                    # above, so it no longer keys into header_idx
                    pass
            header = modes[modes.index(min(modes, key=lambda x: x[1]))][0]
            result_headers.append(header)

        header_string = "Multi-page table headers*:\n"
        header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
            '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
            ct_pages, result_headers)])])

    # Per-page timing and multi-page summary only make sense for >1 page.
    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
        p_time / float(len(data))) if len(data) not in [0, 1] else ""
    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) not in [0, 1] else ""
    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))

    print(''.join([stat_string, header_string]))
|
||||
|
||||
|
||||
def convert_to_html(table):
    """Render a table (iterable of rows, each an iterable of cell strings)
    as an HTML <table> fragment and return it as a string.

    Fix: the original rebuilt the whole string with ``''.join`` on every
    append, which is quadratic; accumulate parts in a list and join once.

    NOTE(review): cell text is interpolated verbatim -- there is no HTML
    escaping.  Acceptable for trusted PDF text, but confirm before feeding
    untrusted input through this path.
    """
    parts = ['<table border="1">\n']
    for row in table:
        parts.append(' <tr>\n')
        for data in row:
            parts.extend([' <td>', data, '</td>\n'])
        parts.append(' </tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)
|
||||
|
||||
|
||||
def write_to_disk(data, f='csv', output=None, filename=None):
    """Write parsed table data to disk in the requested format.

    Parameters
    ----------
    data : dict
        {'page-N': {'table-M': {'data': rows, ...}} or None}; a None page
        means no tables were found on it.
    f : str
        One of 'csv', 'tsv', 'zip', 'html', 'json', 'xlsx'.
    output : str
        Directory to write into.
    filename : str
        Source pdf path; its basename (sans extension) seeds output names.

    Fix: the html branch now skips pages whose value is None, consistent
    with the csv/tsv and zip branches (previously it raised AttributeError
    on such pages).
    """
    # raise something if filename and/or output are None
    fname = os.path.basename(filename)
    froot, __ = os.path.splitext(fname)
    if f in ['csv', 'tsv']:
        delimiter = ',' if f == 'csv' else '\t'
        # One delimited file per table, named '<page>_<table>.<ext>'.
        for page_number in sorted(data.keys()):
            if data[page_number] is not None:
                for table_number in sorted(data[page_number].keys()):
                    dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
                    with open(os.path.join(output, dsvname), 'w') as outfile:
                        writer = csv.writer(
                            outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                        for row in data[page_number][table_number]['data']:
                            writer.writerow(row)
    elif f == 'zip':
        # All tables as in-memory CSVs bundled into one zip archive.
        csv_zip = os.path.join(output, '{0}.zip'.format(froot))
        with zipfile.ZipFile(csv_zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) \
                as zfile:
            for page_number in sorted(data.keys()):
                if data[page_number] is not None:
                    for table_number in sorted(data[page_number].keys()):
                        csvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), 'csv')
                        outfile = cStringIO.StringIO()
                        writer = csv.writer(
                            outfile, delimiter=',', quoting=csv.QUOTE_ALL)
                        for row in data[page_number][table_number]['data']:
                            writer.writerow(row)
                        zfile.writestr(csvname, outfile.getvalue())
                        outfile.close()
    elif f == 'html':
        # All tables appended into a single html file.
        htmlname = '{0}.html'.format(froot)
        for page_number in sorted(data.keys()):
            if data[page_number] is not None:
                for table_number in sorted(data[page_number].keys()):
                    with open(os.path.join(output, htmlname), 'a') as htmlfile:
                        htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
        try:
            from pyexcel_xlsx import save_data
            from collections import OrderedDict
            # One sheet per table, ordered numerically by page then table.
            xlsx_data = OrderedDict()
            for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
                    sheet_name = ''.join([page_number, '_', table_number])
                    xlsx_data.update({sheet_name:
                        [row for row in data[page_number][table_number]['data']]})
            save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
        except ImportError:
            # pyexcel_xlsx is an optional dependency; best-effort message.
            print("link to install docs")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: parse arguments, run the chosen extraction method
    # (lattice or stream), then optionally plot, print/save stats, and
    # write results to disk.
    start_time = time.time()

    # Top-level parse first (options_first so the sub-command's own flags
    # pass through), then merge in the sub-command's parsed options.
    args = docopt(doc, version='0.1', options_first=True)
    argv = [args['<method>']] + args['<args>']
    if args['<method>'] == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif args['<method>'] == 'stream':
        args.update(docopt(stream_doc, argv=argv))

    # Derive output file names from the input pdf path.
    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
    logname, __ = os.path.splitext(filename)
    logname = ''.join([logname, '.log'])
    scorename, __ = os.path.splitext(filename)
    scorename = ''.join([scorename, '_info.csv'])
    pngname, __ = os.path.splitext(filename)

    # NOTE(review): FORMAT is defined but never passed to setup_logging --
    # it appears to be unused here; confirm whether utils.setup_logging
    # applies its own format.
    FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
    if args['--log'] is not None:
        logger = utils.setup_logging(args['--log'])
    else:
        logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log'))

    # Build the list of page ranges to process.  The pdf is only opened
    # when something other than the default single page is requested.
    p = []
    if args['--pages'] == '1':
        p.append({'start': 1, 'end': 1})
    else:
        infile = PdfFileReader(open(filename, 'rb'), strict=False)
        if args['--pages'] == 'all':
            p.append({'start': 1, 'end': infile.getNumPages()})
        else:
            # e.g. "1,3-6,10" or "2-end"
            for r in args['--pages'].split(','):
                if '-' in r:
                    a, b = r.split('-')
                    if b == 'end':
                        b = infile.getNumPages()
                    p.append({'start': int(a), 'end': int(b)})
                else:
                    p.append({'start': int(r), 'end': int(r)})

    logger.info('Applying {0} method on {1}'.format(args['<method>'],
                os.path.basename(filename)))
    # (char, line, word) margins shared by both methods.
    margins = (float(args['--cmargin']), float(args['--lmargin']),
               float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            # Translate CLI strings into Lattice constructor kwargs.
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'fill': args['--fill'] if args['--fill'] else None,
                'mtol': [int(m) for m in args['--mtol']],
                'jtol': [int(j) for j in args['--jtol']],
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'scale': int(args['--scale']),
                'iterations': int(args['--iterations']),
                'invert': args['--invert'],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'],
                'debug': args['--debug']
            }
            manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) + " seconds")

            # Optional plots: per-table barcharts, aggregate histogram,
            # and/or row/column pie charts.
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    # keys are 'page-N' / 'table-M'; sort numerically
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            # Save per-table quality metrics to '<file>_info.csv'.
            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            # e.message is Python-2-only; exits without writing output.
            logger.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            # Translate CLI strings into Stream constructor kwargs.
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'columns': args['--columns'] if args['--columns'] else None,
                'ytol': [int(y) for y in args['--ytol']],
                'mtol': [int(m) for m in args['--mtol']],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'debug': args['--debug']
            }
            manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    # NOTE(review): header has a stray empty column
                    # ('empty_p,,score') but only five values are written
                    # per line below -- confirm whether a column was
                    # intentionally dropped for the stream method.
                    score_file.write('table,nrows,ncols,empty_p,,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['score']))

            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()

    # In debug mode nothing is written; otherwise results go to the output
    # directory (defaulting to the pdf's own directory).
    if args.get('--debug') is not None and args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)
|
||||
Loading…
Reference in New Issue