#!/usr/bin/env python2
from __future__ import print_function
import os
import csv
import sys
import glob
import time
import zipfile
import warnings
import cStringIO

import numpy as np
from docopt import docopt
from collections import Counter
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileReader

from camelot.pdf import Pdf
from camelot.lattice import Lattice
from camelot.stream import Stream
from camelot.ocr import OCR
from camelot import utils


doc = """
Camelot: PDF parsing made simpler!

usage:
 camelot [options] <method> [<args>...]

options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -p, --pages <pageno>      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10  [default: 1]
 -P, --parallel            Parallelize the parsing process.
 -f, --format <format>     Output format. (csv,tsv,zip,html,json,xlsx) [default: csv]
 -l, --log <logfile>       Log to file.
 -o, --output <directory>  Output directory.
 -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
                           grouped together to form a word. [default: 1.0]
 -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
                           grouped together to form a textbox. [default: 0.5]
 -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                           if distance between words is greater than word
                           margin. [default: 0.1]
 -J, --split_text          Split text lines if they span across multiple cells.
 -K, --flag_size           Flag substring if its size differs from the whole string.
                           Useful for super and subscripts.
 -X, --print-stats         List stats on the parsing process.
 -Y, --save-stats          Save stats to a file.
 -Z, --plot <dist>         Plot distributions. (page,all,rc)

camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.
 ocr      Looks for lines in image based pdfs.

See 'camelot <method> -h' for more information on a specific method.
"""

lattice_doc = """
Lattice method looks for lines between text to form a table.

usage:
 camelot lattice [-t <tarea>...] [-F <fill>...] [-H <header>...]
 [-m <mtol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>            Specific table areas to analyze.
 -F, --fill <fill>              Fill data in horizontal and/or vertical spanning
                                cells. Example: -F h, -F v, -F hv
 -H, --header <header>          Specify header for each table.
 -m, --mtol <mtol>              Tolerance to account for when merging lines
                                which are very close. [default: 2]
 -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
 -c, --constant <constant>      See adaptive threshold doc. [default: -2]
 -s, --scale <scale>            Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
 -i, --invert                   Invert pdf image to make sure that lines are
                                in foreground.
 -T, --shift_text <shift_text>  Specify where the text in a spanning cell
                                should flow, order-sensitive. [default: lt]
 -d, --debug <debug>            Debug by visualizing pdf geometry.
                                (contour,line,joint,table) Example: -d table
"""

stream_doc = """
Stream method looks for whitespaces between text to form a table.

usage:
 camelot stream [-t <tarea>...] [-c <columns>...] [-H <header>...]
 [-y <ytol>...] [-m <mtol>...] [options] [--] <file>

options:
 -t, --tarea <tarea>      Specific table areas to analyze.
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
 -H, --header <header>    Specify header for each table.
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -m, --mtol <mtol>        Tolerance to account for when merging columns
                          together. [default: 0]
 -d, --debug              Debug by visualizing textboxes.
"""


ocr_doc = """
OCR method looks for lines in image based pdfs.

usage:
 camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>

options:
 -t, --tarea <tarea>          Specific table areas to analyze.
 -m, --mtol <mtol>            Tolerance to account for when merging lines
                              which are very close. [default: 2]
 -b, --blocksize <blocksize>  See adaptive threshold doc. [default: 15]
 -c, --constant <constant>    See adaptive threshold doc. [default: -2]
 -D, --dpi <dpi>              Dots per inch, specify image quality to be used for OCR.
                              [default: 300]
 -l, --lang <lang>            Specify language to be used for OCR. [default: eng]
 -s, --scale <scale>          Scaling factor. Large scaling factor leads to
                              smaller lines being detected. [default: 15]
 -d, --debug <debug>          Debug by visualizing pdf geometry.
                              (contour,line,joint,table) Example: -d table
"""


def plot_table_barchart(r, c, p, pno, tno):
    row_idx = [i + 1 for i, row in enumerate(r)]
    col_idx = [i + 1 for i, col in enumerate(c)]
    r_index = np.arange(len(r))
    c_index = np.arange(len(c))
    width = 0.7

    plt.figure(figsize=(8, 6))
    plt.subplot(2, 1, 1)
    plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
    plt.xlabel('row index')
    plt.ylabel('number of non-empty cells in row')
    plt.bar(r_index, r)
    plt.xticks(r_index + width * 0.5, row_idx)
    plt.ylim(0, len(c))

    plt.subplot(2, 1, 2)
    plt.xlabel('column index')
    plt.ylabel('number of non-empty cells in column')
    plt.bar(c_index, c)
    plt.xticks(c_index + width * 0.5, col_idx)
    plt.ylim(0, len(r))
    plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)


def plot_all_barchart(data, output):
    r_empty_cells = []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            r_empty_cells.extend([r / float(table['ncols']) for r in table['r_nempty_cells']])
    c = Counter(r_empty_cells)
    if 0.0 not in c:
        c.update({0.0: 0})
    if 1.0 not in c:
        c.update({1.0: 0})

    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    row_p = [count / float(sum(c.values())) for count in c.values()]
    plt.bar(c.keys(), row_p, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)


def plot_rc_piechart(data, output):
    from matplotlib import cm

    tables = 0
    rows, cols = [], []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            tables += 1
            rows.append(table['nrows'])
            cols.append(table['ncols'])

    r = Counter(rows)
    c = Counter(cols)

    plt.figure(figsize=(8, 6))
    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
    ax1 = plt.subplot(211, aspect='equal')
    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
    ax1.set_title('row distribution across tables')

    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
    ax2 = plt.subplot(212, aspect='equal')
    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
    ax2.set_title('column distribution across tables')
    plt.savefig(''.join([output, '_rc.png']), dpi=300)


def print_stats(data, p_time):
    from operator import itemgetter
    from itertools import groupby

    scores = []
    continuous_tables = []
    total_tables = 0
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number, table['ncols']))
            scores.append(table['score'])
    avg_score = np.mean(scores)

    ct_pages = []
    header_string = ""
    if len(continuous_tables) > 1:
        tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
        for k, g in groupby(tables, key=itemgetter(2)):
            g = list(g)
            tables_same_ncols = set([int(t[0][5:]) for t in g])
            tables_same_ncols = sorted(list(tables_same_ncols))
            for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
                G = list(G)
                ct_pages.append((str(G[0][1]), str(G[-1][1])))

        result_headers = []
        for ct in ct_pages:
            header_idx = {}
            possible_headers = []
            ncols = 0
            for page_number in range(int(ct[0]), int(ct[1]) + 1):
                page = data['page-{0}'.format(page_number)]
                for table_number in page.keys():
                    table = page[table_number]
                    ncols = table['ncols']
                    for i, row in enumerate(table['data']):
                        try:
                            header_idx[tuple(row)].append(i)
                        except KeyError:
                            header_idx[tuple(row)] = [i]
            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
            possible_headers = filter(lambda z: len(z) == ncols,
                [filter(lambda x: x != '', p_h) for p_h in possible_headers])
            modes = []
            for p_h in possible_headers:
                try:
                    modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
                except KeyError:
                    pass
            header = modes[modes.index(min(modes, key=lambda x: x[1]))][0]
            result_headers.append(header)

        header_string = "Multi-page table headers*:\n"
        header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
            '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
            ct_pages, result_headers)])])

    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
        p_time / float(len(data))) if len(data) not in [0, 1] else ""
    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) not in [0, 1] else ""
    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
        "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))

    print(''.join([stat_string, header_string]))


def convert_to_html(table):
    html = ''
    html = ''.join([html, '<table border="1">\n'])
    for row in table:
        html = ''.join([html, ' <tr>\n'])
        for data in row:
            html = ''.join([html, '  <td>', data, '</td>\n'])
        html = ''.join([html, ' </tr>\n'])
    html = ''.join([html, '</table>\n'])
    return html


def write_to_disk(data, f='csv', output=None, filename=None):
    # raise something if filename and/or output are None
    fname = os.path.basename(filename)
    froot, __ = os.path.splitext(fname)
    if f in ['csv', 'tsv']:
        delimiter = ',' if f == 'csv' else '\t'
        for page_number in sorted(data.keys()):
            if data[page_number] is not None:
                for table_number in sorted(data[page_number].keys()):
                    dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
                    with open(os.path.join(output, dsvname), 'w') as outfile:
                        writer = csv.writer(
                            outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                        for row in data[page_number][table_number]['data']:
                            writer.writerow(row)
    elif f == 'zip':
        csv_zip = os.path.join(output, '{0}.zip'.format(froot))
        with zipfile.ZipFile(csv_zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) \
                as zfile:
            for page_number in sorted(data.keys()):
                if data[page_number] is not None:
                    for table_number in sorted(data[page_number].keys()):
                        csvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), 'csv')
                        outfile = cStringIO.StringIO()
                        writer = csv.writer(
                            outfile, delimiter=',', quoting=csv.QUOTE_ALL)
                        for row in data[page_number][table_number]['data']:
                            writer.writerow(row)
                        zfile.writestr(csvname, outfile.getvalue())
                        outfile.close()
    elif f == 'html':
        htmlname = '{0}.html'.format(froot)
        for page_number in sorted(data.keys()):
            for table_number in sorted(data[page_number].keys()):
                with open(os.path.join(output, htmlname), 'a') as htmlfile:
                    htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
        try:
            from pyexcel_xlsx import save_data
            from collections import OrderedDict
            xlsx_data = OrderedDict()
            for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
                    sheet_name = ''.join([page_number, '_', table_number])
                    xlsx_data.update({sheet_name:
                                      [row for row in data[page_number][table_number]['data']]})
            save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
        except ImportError:
            print("link to install docs")


if __name__ == '__main__':
    start_time = time.time()

    args = docopt(doc, version='0.1', options_first=True)
    argv = [args['<method>']] + args['<args>']
    if args['<method>'] == 'lattice':
        args.update(docopt(lattice_doc, argv=argv))
    elif args['<method>'] == 'stream':
        args.update(docopt(stream_doc, argv=argv))
    elif args['<method>'] == 'ocr':
        args.update(docopt(ocr_doc, argv=argv))

    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
    logname, __ = os.path.splitext(filename)
    logname = ''.join([logname, '.log'])
    scorename, __ = os.path.splitext(filename)
    scorename = ''.join([scorename, '_info.csv'])
    pngname, __ = os.path.splitext(filename)

    FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
    if args['--log'] is not None:
        logger = utils.setup_logging(args['--log'])
    else:
        logger = utils.setup_logging(os.path.join(os.getcwd(), 'camelot.log'))

    p = []
    if args['--pages'] == '1':
        p.append({'start': 1, 'end': 1})
    else:
        if args['--pages'] == 'all':
            infile = PdfFileReader(open(filename, 'rb'), strict=False)
            p.append({'start': 1, 'end': infile.getNumPages()})
        else:
            for r in args['--pages'].split(','):
                if '-' in r:
                    a, b = r.split('-')
                    p.append({'start': int(a), 'end': int(b)})
                else:
                    p.append({'start': int(r), 'end': int(r)})

    logger.info('Applying {0} method on {1}'.format(args['<method>'],
        os.path.basename(filename)))
    margins = (float(args['--cmargin']), float(args['--lmargin']),
        float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'fill': args['--fill'] if args['--fill'] else None,
                'headers': args['--header'] if args['--header'] else None,
                'mtol': [int(m) for m in args['--mtol']],
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'scale': int(args['--scale']),
                'invert': args['--invert'],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'],
                'debug': args['--debug']
            }
            manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                table['c_nempty_cells'],
                                table['empty_p'],
                                page_number,
                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'columns': args['--columns'] if args['--columns'] else None,
                'headers': args['--header'] if args['--header'] else None,
                'ytol': [int(y) for y in args['--ytol']],
                'mtol': [int(m) for m in args['--mtol']],
                'margins': margins,
                'split_text': args['--split_text'],
                'flag_size': args['--flag_size'],
                'debug': args['--debug']
            }
            manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                table['c_nempty_cells'],
                                table['empty_p'],
                                page_number,
                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['score']))

            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'ocr':
        try:
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'mtol': [int(m) for m in args['--mtol']],
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'dpi': int(args['--dpi']),
                'lang': args['--lang'],
                'scale': int(args['--scale']),
                'debug': args['--debug']
            }
            manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
            logger.info("Finished processing in " + str(processing_time) + " seconds")

            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                table['c_nempty_cells'],
                                table['empty_p'],
                                page_number,
                                table_number)

                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)

                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

            if args['--print-stats']:
                print_stats(data, processing_time)

            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()

    if args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)