Add various metrics to score the quality of a parse

pull/2/head
Vinayak Mehta 2016-08-30 14:52:49 +05:30 committed by GitHub
parent 43a009dab4
commit 552f9cf422
11 changed files with 1027 additions and 472 deletions

View File

@@ -1,18 +1,31 @@
from __future__ import print_function
from __future__ import division
import os
import types
import copy_reg
import logging
import cv2
import numpy as np
from wand.image import Image
from .table import Table
from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
get_row_index, get_column_index, reduce_index, outline,
fill_spanning, remove_empty, encode_list)
get_row_index, get_column_index, get_score, reduce_index,
outline, fill_spanning, count_empty, encode_list, pdf_to_text)
__all__ = ['Lattice']
def _reduce_method(m):
if m.im_self is None:
return getattr, (m.im_class, m.im_func.func_name)
else:
return getattr, (m.im_self, m.im_func.func_name)
copy_reg.pickle(types.MethodType, _reduce_method)
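
Aside: the copy_reg shim above is needed because multiprocessing.Pool pickles the callable it is handed, and Python 2's pickle cannot serialize bound methods such as Lattice.get_tables by default. A minimal, self-contained sketch of the effect (Doubler is a hypothetical example class, not part of this commit):

    import types
    import copy_reg
    import pickle

    def _reduce_method(m):
        # same shim as above: reduce a method to (getattr, args)
        if m.im_self is None:
            return getattr, (m.im_class, m.im_func.func_name)
        else:
            return getattr, (m.im_self, m.im_func.func_name)

    copy_reg.pickle(types.MethodType, _reduce_method)

    class Doubler(object):  # hypothetical example class
        def work(self, x):
            return x * 2

    # without the registration, pickle.dumps raises PicklingError on
    # Python 2; with it, the bound method round-trips and can be
    # mapped over a multiprocessing.Pool
    method = pickle.loads(pickle.dumps(Doubler().work))
    print(method(21))  # 42
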
def _morph_transform(imagename, scale=15, invert=False):
"""Morphological Transformation
@@ -65,8 +78,8 @@ def _morph_transform(imagename, scale=15, invert=False):
vertical = threshold
horizontal = threshold
verticalsize = vertical.shape[0] / scale
horizontalsize = horizontal.shape[1] / scale
verticalsize = vertical.shape[0] // scale
horizontalsize = horizontal.shape[1] // scale
ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
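
The switch to // matters because this module now imports true division from __future__: shape[0] / scale would yield a float, and cv2.getStructuringElement rejects non-integer kernel sizes. For context, a sketch of the erode/dilate pass that rectangular kernels like these typically feed for line detection (that part of _morph_transform is elided by this hunk; the thresholding step and filename are illustrative assumptions, not the verbatim code):

    import cv2
    import numpy as np

    img = cv2.imread('page.png', cv2.IMREAD_GRAYSCALE)  # illustrative input
    # binarize with the ruling lines as white on black
    threshold = cv2.adaptiveThreshold(np.invert(img), 255,
                                      cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                      cv2.THRESH_BINARY, 15, -2)
    vertical, horizontal, scale = threshold.copy(), threshold.copy(), 15
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical.shape[0] // scale))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal.shape[1] // scale, 1))
    # erosion keeps only white runs at least as long as the kernel;
    # dilation restores their extent, isolating vertical/horizontal lines
    vertical = cv2.dilate(cv2.erode(vertical, ver), ver)
    horizontal = cv2.dilate(cv2.erode(horizontal, hor), hor)
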
@@ -79,8 +92,12 @@ def _morph_transform(imagename, scale=15, invert=False):
mask = vertical + horizontal
joints = np.bitwise_and(vertical, horizontal)
__, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
try:
__, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
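
The try/except above papers over a cv2.findContours signature change: OpenCV 3 returns (image, contours, hierarchy) while OpenCV 2 returned (contours, hierarchy), so unpacking three values raises ValueError on the older API (OpenCV 4 later reverted to two values, which the same fallback handles). The commit inlines this at each call site; a helper like the following (hypothetical, not part of the diff) would centralize it:

    def _find_contours(img, mode, method):
        # try OpenCV 3's three-value signature first, then fall back
        # to the two-value signature of OpenCV 2 (and later OpenCV 4)
        try:
            __, contours, __ = cv2.findContours(img, mode, method)
        except ValueError:
            contours, __ = cv2.findContours(img, mode, method)
        return contours

    # e.g. contours = _find_contours(mask, cv2.RETR_EXTERNAL,
    #                                cv2.CHAIN_APPROX_SIMPLE)
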
tables = {}
@@ -88,8 +105,12 @@ def _morph_transform(imagename, scale=15, invert=False):
c_poly = cv2.approxPolyDP(c, 3, True)
x, y, w, h = cv2.boundingRect(c_poly)
roi = joints[y : y + h, x : x + w]
__, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
try:
__, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4:  # skip contours with four or fewer joints
continue
joint_coords = []
@@ -100,16 +121,24 @@ def _morph_transform(imagename, scale=15, invert=False):
tables[(x, y + h, x + w, y)] = joint_coords
v_segments, h_segments = [], []
_, vcontours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
try:
_, vcontours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
vcontours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for vc in vcontours:
x, y, w, h = cv2.boundingRect(vc)
x1, x2 = x, x + w
y1, y2 = y, y + h
v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
_, hcontours, _ = cv2.findContours(
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
try:
_, hcontours, _ = cv2.findContours(
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
hcontours, _ = cv2.findContours(
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for hc in hcontours:
x, y, w, h = cv2.boundingRect(hc)
x1, x2 = x, x + w
@@ -160,24 +189,19 @@ class Lattice:
page as value.
"""
def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
invert=False, debug=None, verbose=False):
def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
self.pdfobject = pdfobject
self.method = 'lattice'
self.fill = fill
self.scale = scale
self.jtol = jtol
self.mtol = mtol
self.invert = invert
self.char_margin, self.line_margin, self.word_margin = pdf_margin
self.debug = debug
self.verbose = verbose
self.tables = {}
if self.debug is not None:
self.debug_images = {}
self.debug_segments = {}
self.debug_tables = {}
def get_tables(self):
def get_tables(self, pdfname):
"""Returns all tables found in given pdf.
Returns
@@ -186,169 +210,124 @@ class Lattice:
Dictionary with page number as key and list of tables on that
page as value.
"""
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split()
self.pdfobject.convert()
for page in self.pdfobject.extract():
p, text, __, width, height = page
pkey = 'pg-{0}'.format(p)
imagename = os.path.join(
self.pdfobject.temp, '{}.png'.format(pkey))
pdf_x = width
pdf_y = height
img, table_bbox, v_segments, h_segments = _morph_transform(
imagename, scale=self.scale, invert=self.invert)
img_x = img.shape[1]
img_y = img.shape[0]
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
text, __, width, height = pdf_to_text(pdfname, self.char_margin,
self.line_margin, self.word_margin)
bname, __ = os.path.splitext(pdfname)
if not text:
logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return None
imagename = ''.join([bname, '.png'])
with Image(filename=pdfname, depth=8, resolution=300) as png:
png.save(filename=imagename)
pdf_x = width
pdf_y = height
img, table_bbox, v_segments, h_segments = _morph_transform(
imagename, scale=self.scale, invert=self.invert)
img_x = img.shape[1]
img_y = img.shape[0]
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
if self.debug is not None:
self.debug_images[pkey] = (img, table_bbox)
if self.debug:
self.debug_images = (img, table_bbox)
factors = (scaling_factor_x, scaling_factor_y, img_y)
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
h_segments, factors)
factors = (scaling_factor_x, scaling_factor_y, img_y)
table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
h_segments, factors)
if self.debug is not None:
self.debug_segments[pkey] = (v_segments, h_segments)
if self.debug:
self.debug_segments = (v_segments, h_segments)
self.debug_tables = []
if self.debug is not None:
debug_page_tables = []
page_tables = []
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select edges which lie within table_bbox
text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
h_segments)
rotated = detect_vertical(text_bbox)
cols, rows = zip(*table_bbox[k])
cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]])
# sort horizontal and vertical segments
cols = merge_close_values(sorted(cols), mtol=self.mtol)
rows = merge_close_values(
sorted(rows, reverse=True), mtol=self.mtol)
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, jtol=self.jtol)
# set spanning cells to True
table = table.set_spanning()
# set table border edges to True
table = outline(table)
pdf_page = {}
page_tables = {}
table_no = 1
# sort tables based on y-coord
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
# select edges which lie within table_bbox
table_info = {}
text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
h_segments)
table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text)))
rotated = detect_vertical(text_bbox)
cols, rows = zip(*table_bbox[k])
cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]])
# sort horizontal and vertical segments
cols = merge_close_values(sorted(cols), mtol=self.mtol)
rows = merge_close_values(
sorted(rows, reverse=True), mtol=self.mtol)
# make grid using x and y coord of shortlisted rows and cols
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
table = Table(cols, rows)
# set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, jtol=self.jtol)
nouse = table.nocont_ / (len(v_s) + len(h_s))
table_info['line_p'] = 100 * (1 - nouse)
# set spanning cells to True
table = table.set_spanning()
# set table border edges to True
table = outline(table)
if self.debug is not None:
debug_page_tables.append(table)
if self.debug:
self.debug_tables.append(table)
# fill text after sorting it
if rotated == '':
text_bbox.sort(key=lambda x: (-x.y0, x.x0))
elif rotated == 'left':
text_bbox.sort(key=lambda x: (x.x0, x.y0))
elif rotated == 'right':
text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
for t in text_bbox:
r_idx = get_row_index(t, rows)
c_idx = get_column_index(t, cols)
if None in [r_idx, c_idx]:
# couldn't assign LTChar to any cell
pass
else:
r_idx, c_idx = reduce_index(
table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
# fill text after sorting it
if rotated == '':
text_bbox.sort(key=lambda x: (-x.y0, x.x0))
elif rotated == 'left':
text_bbox.sort(key=lambda x: (x.x0, x.y0))
elif rotated == 'right':
text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
if self.fill is not None:
table = fill_spanning(table, fill=self.fill)
ar = table.get_list()
if rotated == 'left':
ar = zip(*ar[::-1])
elif rotated == 'right':
ar = zip(*ar[::1])
ar.reverse()
ar = remove_empty(ar)
ar = [list(o) for o in ar]
page_tables.append(encode_list(ar))
vprint(pkey)
self.tables[pkey] = page_tables
rerror = []
cerror = []
for t in text_bbox:
try:
r_idx, rass_error = get_row_index(t, rows)
except TypeError:
# couldn't assign LTChar to any cell
continue
try:
c_idx, cass_error = get_column_index(t, cols)
except TypeError:
# couldn't assign LTChar to any cell
continue
rerror.append(rass_error)
cerror.append(cass_error)
r_idx, c_idx = reduce_index(
table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
score = get_score([[50, rerror], [50, cerror]])
table_info['score'] = score
if self.debug is not None:
self.debug_tables[pkey] = debug_page_tables
if self.fill is not None:
table = fill_spanning(table, fill=self.fill)
ar = table.get_list()
if rotated == 'left':
ar = zip(*ar[::-1])
elif rotated == 'right':
ar = zip(*ar[::1])
ar.reverse()
ar = encode_list(ar)
table_info['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_info['empty_p'] = empty_p
table_info['r_nempty_cells'] = r_nempty_cells
table_info['c_nempty_cells'] = c_nempty_cells
table_info['nrows'] = len(ar)
table_info['ncols'] = len(ar[0])
page_tables['table_{0}'.format(table_no)] = table_info
table_no += 1
pdf_page[os.path.basename(bname)] = page_tables
if self.pdfobject.clean:
self.pdfobject.remove_tempdir()
if self.debug is not None:
if self.debug:
return None
return self.tables
def plot_geometry(self, geometry):
"""Plots various pdf geometries that are detected so user can choose
tweak scale, jtol, mtol parameters.
"""
import matplotlib.pyplot as plt
if geometry == 'contour':
for pkey in self.debug_images.keys():
img, table_bbox = self.debug_images[pkey]
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
(t[2], t[3]), (255, 0, 0), 3)
plt.imshow(img)
plt.show()
elif geometry == 'joint':
x_coord = []
y_coord = []
for pkey in self.debug_images.keys():
img, table_bbox = self.debug_images[pkey]
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
plt.show()
elif geometry == 'line':
for pkey in self.debug_segments.keys():
v_s, h_s = self.debug_segments[pkey]
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show()
elif geometry == 'table':
for pkey in self.debug_tables.keys():
for table in self.debug_tables[pkey]:
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].lt[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].lt[1]])
if table.cells[i][j].right:
plt.plot([table.cells[i][j].rb[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].rb[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].top:
plt.plot([table.cells[i][j].lt[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].lt[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].rb[1]])
plt.show()
return pdf_page

View File

@@ -1,18 +1,11 @@
import os
import shutil
import tempfile
import itertools
import multiprocessing as mp
import cv2
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
from wand.image import Image
__all__ = ['Pdf']
@@ -38,38 +31,6 @@ def _parse_page_numbers(pagenos):
return page_numbers
def _extract_text_objects(layout, LTObject, t=None):
"""Recursively parses pdf layout to get a list of
text objects.
Parameters
----------
layout : object
Layout object.
LTObject : object
Text object, either LTChar or LTTextLineHorizontal.
t : list (optional, default: None)
Returns
-------
t : list
List of text objects.
"""
if t is None:
t = []
try:
for obj in layout._objs:
if isinstance(obj, LTObject):
t.append(obj)
else:
t += _extract_text_objects(obj, LTObject)
except AttributeError:
pass
return t
class Pdf:
"""Handles all pdf operations which include:
@@ -99,66 +60,163 @@ class Pdf:
is greater than word_margin. (optional, default: 0.1)
"""
def __init__(self, pdfname, pagenos=[{'start': 1, 'end': 1}],
char_margin=2.0, line_margin=0.5, word_margin=0.1,
clean=False):
def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}],
parallel=False, clean=False):
self.extractor = extractor
self.pdfname = pdfname
if not self.pdfname.endswith('.pdf'):
raise TypeError("Only PDF format is supported right now.")
self.pagenos = _parse_page_numbers(pagenos)
self.char_margin = char_margin
self.line_margin = line_margin
self.word_margin = word_margin
self.parallel = parallel
self.cpu_count = mp.cpu_count()
self.pool = mp.Pool(processes=self.cpu_count)
self.clean = clean
self.temp = tempfile.mkdtemp()
def split(self):
"""Splits pdf into single page pdfs.
"""
if not self.pdfname.endswith('.pdf'):
raise TypeError("Only PDF format is supported.")
infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
for p in self.pagenos:
page = infile.getPage(p - 1)
outfile = PdfFileWriter()
outfile.addPage(page)
with open(os.path.join(self.temp, 'pg-{0}.pdf'.format(p)), 'wb') as f:
with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
outfile.write(f)
def remove_tempdir(self):
shutil.rmtree(self.temp)
def extract(self):
"""Extracts text objects, width, height from a pdf.
"""
for p in self.pagenos:
pkey = 'pg-{0}'.format(p)
pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
with open(pname, 'r') as f:
parser = PDFParser(f)
document = PDFDocument(parser)
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
laparams = LAParams(char_margin=self.char_margin,
line_margin=self.line_margin,
word_margin=self.word_margin)
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
lattice_objects = _extract_text_objects(layout, LTChar)
stream_objects = _extract_text_objects(
layout, LTTextLineHorizontal)
width = layout.bbox[2]
height = layout.bbox[3]
yield p, lattice_objects, stream_objects, width, height
self.split()
pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
for p in self.pagenos]
if self.parallel:
tables = self.pool.map(self.extractor.get_tables, pages)
tables = {k: v for d in tables if d is not None for k, v in d.items()}
else:
tables = {}
if self.extractor.debug:
if self.extractor.method == 'stream':
self.debug = self.extractor.debug
self.debug_text = []
elif self.extractor.method == 'lattice':
self.debug = self.extractor.debug
self.debug_images = []
self.debug_segments = []
self.debug_tables = []
for p in pages:
table = self.extractor.get_tables(p)
if table is not None:
tables.update(table)
if self.extractor.debug:
if self.extractor.method == 'stream':
self.debug_text.append(self.extractor.debug_text)
elif self.extractor.method == 'lattice':
self.debug_images.append(self.extractor.debug_images)
self.debug_segments.append(self.extractor.debug_segments)
self.debug_tables.append(self.extractor.debug_tables)
if self.clean:
self.remove_tempdir()
return tables
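
Usage of the inverted control flow this commit introduces: the extractor is configured first, then handed to Pdf, which splits the pages and maps get_tables over them (through a Pool when parallel=True). A sketch; 'report.pdf' is a stand-in filename:

    from __future__ import print_function
    from camelot.pdf import Pdf
    from camelot.lattice import Lattice

    manager = Pdf(Lattice(fill=None, scale=15, jtol=2, mtol=2),
                  'report.pdf',  # illustrative path
                  pagenos=[{'start': 1, 'end': 2}],
                  parallel=False, clean=True)
    tables = manager.extract()
    # keys follow the 'page-N' / 'table_N' scheme used by the extractors
    for page in sorted(tables):
        for name, info in sorted(tables[page].items()):
            print(page, name, info['score'], info['nrows'], info['ncols'])
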
def convert(self):
"""Converts single page pdfs to images.
def debug_plot(self):
"""Plots all text objects and various pdf geometries so that
user can choose number of columns, columns x-coordinates for
Stream or tweak Lattice parameters (scale, jtol, mtol).
"""
for p in self.pagenos:
pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p))
with Image(filename=pdfname, depth=8, resolution=300) as png:
png.save(filename=imagename)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def remove_tempdir(self):
shutil.rmtree(self.temp)
if self.debug is True:
try:
for text in self.debug_text:
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
xs, ys = [], []
for t in text:
xs.extend([t[0], t[1]])
ys.extend([t[2], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1]
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
except AttributeError:
raise ValueError("This option only be used with Stream.")
elif self.debug == 'contour':
try:
for img, table_bbox in self.debug_images:
for t in table_bbox.keys():
cv2.rectangle(img, (t[0], t[1]),
(t[2], t[3]), (255, 0, 0), 3)
plt.imshow(img)
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
elif self.debug == 'joint':
try:
for img, table_bbox in self.debug_images:
x_coord = []
y_coord = []
for k in table_bbox.keys():
for coord in table_bbox[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
elif self.debug == 'line':
try:
for v_s, h_s in self.debug_segments:
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
elif self.debug == 'table':
try:
for tables in self.debug_tables:
for table in tables:
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].lt[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].lt[1]])
if table.cells[i][j].right:
plt.plot([table.cells[i][j].rb[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].rb[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].top:
plt.plot([table.cells[i][j].lt[0],
table.cells[i][j].rt[0]],
[table.cells[i][j].lt[1],
table.cells[i][j].rt[1]])
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0],
table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1],
table.cells[i][j].rb[1]])
plt.show()
except AttributeError:
raise ValueError("This option only be used with Lattice.")
else:
raise UserWarning("This method can only be called after"
" debug has been specified.")

View File

@@ -1,14 +1,26 @@
from __future__ import print_function
from __future__ import division
import os
import types
import copy_reg
import logging
import numpy as np
from .utils import get_column_index, encode_list
from .table import Table
from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
__all__ = ['Stream']
def _reduce_method(m):
if m.im_self is None:
return getattr, (m.im_class, m.im_func.func_name)
else:
return getattr, (m.im_self, m.im_func.func_name)
copy_reg.pickle(types.MethodType, _reduce_method)
def _group_rows(text, ytol=2):
"""Groups text objects into rows using ytol.
@@ -35,14 +47,16 @@ def _group_rows(text, ytol=2):
# type(obj) is LTChar]):
if t.get_text().strip():
if not np.isclose(row_y, t.y0, atol=ytol):
row_y = t.y0
rows.append(temp)
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # hacky
return rows
def _merge_columns(l):
def _merge_columns(l, mtol=2):
"""Merges overlapping columns and returns list with updated
columns boundaries.
@@ -62,7 +76,8 @@ def _merge_columns(l):
merged.append(higher)
else:
lower = merged[-1]
if higher[0] <= lower[1]:
if (higher[0] <= lower[1] or
np.isclose(higher[0], lower[1], atol=mtol)):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
@@ -71,6 +86,62 @@ def _merge_columns(l):
return merged
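
A worked example of the mtol behaviour added here (numbers illustrative): (9, 15) overlaps (0, 10) and merges into (0, 15); (16, 17) starts within mtol of 15, so np.isclose pulls it in too; (30, 40) stands alone.

    print(_merge_columns([(0, 10), (9, 15), (16, 17), (30, 40)], mtol=2))
    # [(0, 17), (30, 40)]
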
def _get_column_index(t, columns):
"""Gets index of the column in which the given object falls by
comparing their co-ordinates.
Parameters
----------
t : object
columns : list
Returns
-------
c : int
"""
offset1, offset2 = 0, 0
lt_col_overlap = []
for c in columns:
if c[0] <= t.x1 and c[1] >= t.x0:
left = t.x0 if c[0] <= t.x0 else c[0]
right = t.x1 if c[1] >= t.x1 else c[1]
lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
else:
lt_col_overlap.append(-1)
if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
logging.warning("Text doesn't fit any column.")
c_idx = lt_col_overlap.index(max(lt_col_overlap))
if t.x0 < columns[c_idx][0]:
offset1 = abs(t.x0 - columns[c_idx][0])
if t.x1 > columns[c_idx][1]:
offset2 = abs(t.x1 - columns[c_idx][1])
Y = abs(t.y0 - t.y1)
charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
error = (Y * (offset1 + offset2)) / charea
return c_idx, error
def _add_columns(cols, text, ytolerance):
if text:
text = _group_rows(text, ytol=ytolerance)
elements = [len(r) for r in text]
new_cols = [(t.x0, t.x1)
for r in text if len(r) == max(elements) for t in r]
cols.extend(_merge_columns(sorted(new_cols)))
return cols
def _join_columns(cols, width):
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, 0)
cols.append(width) # or some tolerance
cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)]
return cols
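
And a worked example of _join_columns (numbers illustrative): two merged text extents on a page 100 points wide are split at their midpoint, with the page edges closing off the outer columns.

    print(_join_columns([(10, 20), (40, 50)], 100))
    # [(0, 30.0), (30.0, 100)]
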
class Stream:
"""Stream algorithm
@@ -105,20 +176,18 @@ class Stream:
page as value.
"""
def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
debug=False, verbose=False):
def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
pdf_margin=(2.0, 0.5, 0.1), debug=False):
self.pdfobject = pdfobject
self.method = 'stream'
self.ncolumns = ncolumns
self.columns = columns
self.ytol = ytol
self.mtol = mtol
self.char_margin, self.line_margin, self.word_margin = pdf_margin
self.debug = debug
self.verbose = verbose
self.tables = {}
if self.debug:
self.debug_text = {}
def get_tables(self):
def get_tables(self, pdfname):
"""Returns all tables found in given pdf.
Returns
@@ -127,86 +196,112 @@ class Stream:
Dictionary with page number as key and list of tables on that
page as value.
"""
vprint = print if self.verbose else lambda *a, **k: None
self.pdfobject.split()
for page in self.pdfobject.extract():
p, __, text, __, __ = page
pkey = 'pg-{0}'.format(p)
text.sort(key=lambda x: (-x.y0, x.x0))
if self.debug:
self.debug_text[pkey] = text
rows = _group_rows(text, ytol=self.ytol)
elements = [len(r) for r in rows]
# a table can't have just 1 column, can it?
elements = filter(lambda x: x != 1, elements)
guess = False
if self.columns:
cols = self.columns.split(',')
cols = [(float(cols[i]), float(cols[i + 1]))
for i in range(0, len(cols) - 1)]
else:
guess = True
ncols = self.ncolumns if self.ncolumns else max(
set(elements), key=elements.count)
if ncols == 0:
# no tables detected
continue
cols = [(t.x0, t.x1)
for r in rows for t in r if len(r) == ncols]
cols = _merge_columns(sorted(cols))
cols = [(c[0] + c[1]) / 2.0 for c in cols]
ar = [['' for c in cols] for r in rows]
for r_idx, r in enumerate(rows):
for t in r:
if guess:
cog = (t.x0 + t.x1) / 2.0
diff = [abs(cog - c) for c in cols]
c_idx = diff.index(min(diff))
else:
c_idx = get_column_index(t, cols)
if None in [r_idx, c_idx]: # couldn't assign LTTextLH to any cell
continue
if ar[r_idx][c_idx]:
ar[r_idx][c_idx] = ' '.join(
[ar[r_idx][c_idx], t.get_text().strip()])
else:
ar[r_idx][c_idx] = t.get_text().strip()
vprint(pkey)
self.tables[pkey] = [encode_list(ar)]
if self.pdfobject.clean:
self.pdfobject.remove_tempdir()
__, text, width, height = pdf_to_text(pdfname, self.char_margin,
self.line_margin, self.word_margin)
bname, __ = os.path.splitext(pdfname)
if not text:
logging.warning("{0}: PDF has no text. It may be an image.".format(
os.path.basename(bname)))
return None
text.sort(key=lambda x: (-x.y0, x.x0))
if self.debug:
self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
return None
return self.tables
rows_grouped = _group_rows(text, ytol=self.ytol)
elements = [len(r) for r in rows_grouped]
row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
if len(r) > 0 else 0 for r in rows_grouped]
rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
rows.insert(0, height) # or some tolerance
rows.append(0)
rows = [(rows[i], rows[i + 1])
for i in range(0, len(rows) - 1)]
def plot_text(self):
"""Plots all text objects so user can choose number of columns
or columns x-coordinates using the matplotlib interface.
"""
import matplotlib.pyplot as plt
import matplotlib.patches as patches
guess = False
if self.columns:
# the user-supplied columns should include the table's boundary
# columns too; (0, width) is taken by default, as in the else
# branch below, and a single x-coordinate is not a valid input
cols = self.columns.split(',')
cols = [(float(cols[i]), float(cols[i + 1]))
for i in range(0, len(cols) - 1)]
else:
if self.ncolumns:
ncols = self.ncolumns
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol)
if len(cols) != self.ncolumns:
logging.warning("{}: The number of columns after merge"
" isn't the same as what you specified."
" Change the value of mtol.".format(
os.path.basename(bname)))
cols = _join_columns(cols, width)
else:
guess = True
ncols = max(set(elements), key=elements.count)
len_non_mode = len(filter(lambda x: x != ncols, elements))
if ncols == 1 and not self.debug:
# no tables detected
logging.warning("{}: Only one column was detected, the PDF"
" may have no tables. Specify ncols if"
" the PDF has tables.".format(
os.path.basename(bname)))
cols = [(t.x0, t.x1)
for r in rows_grouped if len(r) == ncols for t in r]
cols = _merge_columns(sorted(cols), mtol=self.mtol)
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
inner_text.extend(outer_text)
cols = _add_columns(cols, inner_text, self.ytol)
cols = _join_columns(cols, width)
for pkey in sorted(self.debug_text.keys()):
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
xs, ys = [], []
for t in self.debug_text[pkey]:
xs.extend([t.x0, t.x1])
ys.extend([t.y0, t.y1])
ax.add_patch(
patches.Rectangle(
(t.x0, t.y0),
t.x1 - t.x0,
t.y1 - t.y0
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
plt.show()
pdf_page = {}
page_tables = {}
table_info = {}
table = Table(cols, rows)
rerror = []
cerror = []
for row in rows_grouped:
for t in row:
try:
r_idx, rass_error = get_row_index(t, rows)
except ValueError as e:
# couldn't assign LTTextLH to any cell
logging.warning(e.message)
continue
try:
c_idx, cass_error = _get_column_index(t, cols)
except ValueError as e:
# couldn't assign LTTextLH to any cell
logging.warning(e.message)
continue
rerror.append(rass_error)
cerror.append(cass_error)
table.cells[r_idx][c_idx].add_text(
t.get_text().strip('\n'))
if guess:
score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
else:
score = get_score([[50, rerror], [50, cerror]])
table_info['score'] = score
ar = table.get_list()
ar = encode_list(ar)
table_info['data'] = ar
empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
table_info['empty_p'] = empty_p
table_info['r_nempty_cells'] = r_nempty_cells
table_info['c_nempty_cells'] = c_nempty_cells
table_info['nrows'] = len(ar)
table_info['ncols'] = len(ar[0])
page_tables['table_1'] = table_info
pdf_page[os.path.basename(bname)] = page_tables
return pdf_page
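
Each table in the dict returned above now carries its quality metrics next to its data, which makes downstream filtering straightforward. A small consumer sketch (best_table is hypothetical, not part of the commit):

    from __future__ import print_function

    def best_table(pdf_page):
        # pdf_page maps a page basename to a {'table_N': table_info} dict
        page_tables = list(pdf_page.values())[0]
        name = max(page_tables, key=lambda k: page_tables[k]['score'])
        info = page_tables[name]
        print('{0}: score={1:.2f}, empty_p={2:.2f}'.format(
            name, info['score'], info['empty_p']))
        return info['data']
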

View File

@@ -26,6 +26,7 @@ class Table:
self.rows = rows
self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in cols] for r in rows]
self.nocont_ = 0
def set_edges(self, vertical, horizontal, jtol=2):
"""Sets cell edges to True if corresponding line segments
@@ -53,6 +54,7 @@ def set_edges(self, vertical, horizontal, jtol=2):
k = [k for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=jtol)]
if not j:
self.nocont_ += 1
continue
J = j[0]
if i == [0]: # only left edge
@@ -104,6 +106,7 @@ def set_edges(self, vertical, horizontal, jtol=2):
k = [k for k, t in enumerate(self.cols)
if np.isclose(h[2], t[0], atol=jtol)]
if not j:
self.nocont_ += 1
continue
J = j[0]
if i == [0]: # only top edge

View File

@@ -1,5 +1,18 @@
from __future__ import division
import os
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
def translate(x1, x2):
"""Translates x2 by x1.
@@ -243,15 +256,24 @@ def get_row_index(t, rows):
----------
t : object
rows : list
rows : list, sorted in decreasing order
Returns
-------
r : int
"""
offset1, offset2 = 0, 0
for r in range(len(rows)):
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
return r
if t.y0 > rows[r][0]:
offset1 = abs(t.y0 - rows[r][0])
if t.y1 < rows[r][1]:
offset2 = abs(t.y1 - rows[r][1])
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y
error = (X * (offset1 + offset2)) / charea
return r, error
def get_column_index(t, columns):
@@ -268,9 +290,45 @@ def get_column_index(t, columns):
-------
c : int
"""
offset1, offset2 = 0, 0
for c in range(len(columns)):
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
return c
if t.x0 < columns[c][0]:
offset1 = abs(t.x0 - columns[c][0])
if t.x1 > columns[c][1]:
offset2 = abs(t.x1 - columns[c][1])
X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
charea = X * Y
error = (Y * (offset1 + offset2)) / charea
return c, error
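
A worked example of the error term returned above (numbers illustrative; Box stands in for a pdfminer layout object). Since charea = X * Y, the Y factors cancel and the error is simply the fraction of the object's width that overhangs its column; get_row_index computes the mirror image for heights.

    from collections import namedtuple

    Box = namedtuple('Box', 'x0 x1 y0 y1')  # stand-in for an LTChar bbox
    t = Box(x0=8, x1=20, y0=0, y1=10)       # midpoint 14 falls in (10, 25)
    c_idx, error = get_column_index(t, [(10, 25), (25, 40)])
    print('{0} {1:.3f}'.format(c_idx, error))  # 0 0.167 -> 2 of 12 points overhang
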
def get_score(error_weights):
"""Calculates score based on weights assigned to various parameters,
and their error percentages.
Parameters
----------
error_weights : list
List of [weight, error_list] pairs, where weight is the weightage
assigned to a parameter and error_list holds its error fractions
(each between 0 and 1). The weights should sum to 100.
Returns
-------
score : float
"""
SCORE_VAL = 100
score = 0
if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
raise ValueError("Please assign a valid weightage to each parameter"
" such that their sum is equal to 100")
for ew in error_weights:
weight = ew[0] / len(ew[1])
for error_percentage in ew[1]:
score += weight * (1 - error_percentage)
return score
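
A quick check of the arithmetic (numbers illustrative): two parameters weighted 50/50, each carrying per-object error fractions. Each weight is spread evenly across its list, so the call below scores 25*(1-0.0) + 25*(1-0.2) + 50*(1-0.1) = 90.0.

    print(get_score([[50, [0.0, 0.2]], [50, [0.1]]]))  # 90.0
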
def reduce_index(t, rotated, r_idx, c_idx):
@@ -394,6 +452,110 @@ def remove_empty(d):
return d
def count_empty(d):
"""Counts empty rows and columns from list of lists.
Parameters
----------
d : list
Returns
-------
empty_p : percentage of empty cells
r_nempty_cells : list with the number of non-empty cells in each row
c_nempty_cells : list with the number of non-empty cells in each column
"""
empty_p = 0
r_nempty_cells, c_nempty_cells = [], []
for i in d:
for j in i:
if j.strip() == '':
empty_p += 1
empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
for row in d:
r_nempty_c = 0
for r in row:
if r.strip() != '':
r_nempty_c += 1
r_nempty_cells.append(r_nempty_c)
d = zip(*d)
d = [list(col) for col in d]
for col in d:
c_nempty_c = 0
for c in col:
if c.strip() != '':
c_nempty_c += 1
c_nempty_cells.append(c_nempty_c)
return empty_p, r_nempty_cells, c_nempty_cells
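
A quick check with a 2x3 grid (values illustrative): one of six cells is empty, the rows have 3 and 2 non-empty cells, and the columns have 2, 2 and 1.

    print(count_empty([['a', 'b', 'c'], ['d', 'e', '']]))
    # (16.666666666666664, [3, 2], [2, 2, 1])
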
def encode_list(ar):
"""Encodes list of text.
Parameters
----------
ar : list
Returns
-------
ar : list
"""
ar = [[r.encode('utf-8') for r in row] for row in ar]
return ar
def extract_text_objects(layout, LTObject, t=None):
"""Recursively parses pdf layout to get a list of
text objects.
Parameters
----------
layout : object
Layout object.
LTObject : object
Text object, either LTChar or LTTextLineHorizontal.
t : list (optional, default: None)
Returns
-------
t : list
List of text objects.
"""
if t is None:
t = []
try:
for obj in layout._objs:
if isinstance(obj, LTObject):
t.append(obj)
else:
t += extract_text_objects(obj, LTObject)
except AttributeError:
pass
return t
def pdf_to_text(pname, char_margin, line_margin, word_margin):
# pkey = 'page-{0}'.format(p)
# pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
with open(pname, 'rb') as f:  # PDFs are binary; open in 'rb' mode
parser = PDFParser(f)
document = PDFDocument(parser)
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
laparams = LAParams(char_margin=char_margin,
line_margin=line_margin,
word_margin=word_margin)
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
layout = device.get_result()
lattice_objects = extract_text_objects(layout, LTChar)
stream_objects = extract_text_objects(
layout, LTTextLineHorizontal)
width = layout.bbox[2]
height = layout.bbox[3]
return lattice_objects, stream_objects, width, height
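
Usage sketch for the consolidated helper above (the filename is illustrative): both extractors now call it once per split page, Lattice consuming the LTChar list and Stream the LTTextLineHorizontal list.

    chars, lines, width, height = pdf_to_text('page-1.pdf', 2.0, 0.5, 0.1)
    print('{0} chars, {1} lines, {2}x{3}'.format(
        len(chars), len(lines), width, height))
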

View File

@@ -39,7 +39,7 @@ Usage
>>> extractor = Lattice(Pdf('us-030.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""

View File

@@ -65,7 +65,7 @@ Finally, the characters found on the page are assigned to cells based on their x
>>> extractor = Lattice(Pdf('us-030.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
@@ -114,7 +114,7 @@ In the PDF used above, you can see that some cells spanned a lot of rows, `fill`
>>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40)
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "Plan Type","County","Plan Name","Totals"
@@ -173,7 +173,7 @@ To find line segments, Lattice needs the lines of the PDF to be in foreground. S
>>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True)
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
:header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"

View File

@@ -17,7 +17,7 @@ Let's run it on this PDF.
>>> extractor = Stream(Pdf('eu-027.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. .. _this: insert link for eu-027.pdf
@@ -68,7 +68,7 @@ But sometimes its guess could be incorrect, like in this case.
>>> extractor = Stream(Pdf('missing_values.pdf'))
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. .. _this: insert link for missing_values.pdf
@@ -127,7 +127,7 @@ It guessed that the PDF has 3 columns, because there wasn't any data in the last
>>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5)
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::
@@ -200,7 +200,7 @@ After getting the x-coordinates, we just need to pass them to Stream, like this.
>>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700')
>>> tables = extractor.get_tables()
>>> print tables['pg-1']
>>> print tables['page-1'][0]
.. csv-table::

View File

@@ -26,7 +26,7 @@ def test_lattice_basic():
extractor = Lattice(Pdf(pdfname,
pagenos=[{'start': 2, 'end': 2}], clean=True))
tables = extractor.get_tables()
assert_equal(tables['pg-2'][0], data)
assert_equal(tables['page-2'][0], data)
def test_lattice_fill():
@@ -76,7 +76,7 @@ def test_lattice_fill():
pdfname = os.path.join(testdir, 'row_span_1.pdf')
extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
tables = extractor.get_tables()
assert_equal(tables['pg-1'][0], data)
assert_equal(tables['page-1'][0], data)
def test_lattice_invert():
@@ -94,4 +94,4 @@ def test_lattice_invert():
pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
tables = extractor.get_tables()
assert_equal(tables['pg-1'][1], data)
assert_equal(tables['page-1'][1], data)

View File

@@ -13,57 +13,62 @@ testdir = os.path.dirname(os.path.abspath(__file__))
def test_stream_basic():
data = [
["","","","",""],
["C Appendix C: Summary Statistics","","","",""],
["","Table C1: Summary Statistics","","",""],
["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""],
["Variable","Mean","Std. Dev. Min","","Max"],
["Age","50.8","15.9","21","90"],
["Men","0.47","0.50","0","1"],
["East","0.28","0.45","0","1"],
["Rural","0.15","0.36","0","1"],
["Married","0.57","0.50","0","1"],
["Single","0.21","0.40","0","1"],
["Divorced","0.13","0.33","0","1"],
["Widowed","0.08","0.26","0","1"],
["Separated","0.03","0.16","0","1"],
["Partner","0.65","0.48","0","1"],
["Employed","0.55","0.50","0","1"],
["Fulltime","0.34","0.47","0","1"],
["Parttime","0.20","0.40","0","1"],
["Unemployed","0.08","0.28","0","1"],
["Homemaker","0.19","0.40","0","1"],
["Retired","0.28","0.45","0","1"],
["Household size","2.43","1.22","1","9"],
["Households with children","0.37","0.48","0","1"],
["Number of children","1.67","1.38","0","8"],
["Lower secondary education","0.08","0.27","0","1"],
["Upper secondary education","0.60","0.49","0","1"],
["Post secondary, non tert. education","0.12","0.33","0","1"],
["First stage tertiary education","0.17","0.38","0","1"],
["Other education","0.03","0.17","0","1"],
["Household income (Euro/month)","2,127","1,389","22","22,500"],
["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"],
["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"],
["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""],
["","","","","ECB"],
["","","","","Working Paper Series No 1299"],
["","","","","Febuary 2011"]
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
["Entidad","","Municipio","","Localidad",""],
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
]
pdfname = os.path.join(testdir,
"tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf")
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}],
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}],
clean=True))
tables = extractor.get_tables()
assert_equal(tables['pg-3'][0], data)
assert_equal(tables['page-1'][0], data)
def test_stream_ncolumns():
data = [
["","","","",""],
["","Bhandara - Key Indicators","","",""],
["Bhandara - Key Indicators","","","",""],
["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
["Reported Prevalence of Morbidity","","","",""],
@@ -105,21 +110,20 @@ def test_stream_ncolumns():
["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""]
["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
["","4","","",""]
]
pdfname = os.path.join(testdir, 'missing_values.pdf')
extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
ncolumns=5)
tables = extractor.get_tables()
assert_equal(tables['pg-1'][0], data)
assert_equal(tables['page-1'][0], data)
def test_stream_columns():
data = [
["","","","","",""],
["Clave","","Clave","","Clave",""],
["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"],
["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
["Entidad","","Municipio","","Localidad",""],
["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
@@ -160,10 +164,11 @@ def test_stream_columns():
["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"]
["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
]
pdfname = os.path.join(testdir, 'mexican_towns.pdf')
extractor = Stream(Pdf(pdfname, clean=True),
columns='28,67,180,230,425,475,700')
tables = extractor.get_tables()
assert_equal(tables['pg-1'][0], data)
assert_equal(tables['page-1'][0], data)

View File

@@ -4,8 +4,12 @@ import os
import sys
import time
import logging
import warnings
import numpy as np
from docopt import docopt
from collections import Counter
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf
@@ -22,12 +26,23 @@ usage:
options:
-h, --help Show this screen.
-v, --version Show version.
-V, --verbose Verbose.
-p, --pages <pageno> Comma-separated list of page numbers.
Example: -p 1,3-6,10 [default: 1]
-P, --parallel Parallelize the parsing process.
-f, --format <format> Output format. (csv,tsv,html,json,xlsx) [default: csv]
-l, --log Print log to file.
-V, --verbose Verbose.
-l, --log Log to file.
-o, --output <directory> Output directory.
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 2.0]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-S, --save-info Save parsing info for each page to a file.
-X, --plot <dist> Plot distributions. (page,all,rc)
-Z, --summary Summarize metrics.
camelot methods:
lattice Looks for lines between data.
@@ -47,12 +62,12 @@ options:
cells. Example: -F h, -F v, -F hv
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-j, --jtol <jtol> Tolerance to account for when comparing joint
and line coordinates. [default: 2]
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-i, --invert Invert pdf image to make sure that lines are
in foreground.
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
"""
@@ -69,17 +84,159 @@ options:
Example: -c 10.1,20.2,30.3
-y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2]
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are
grouped together to form a word. [default: 2.0]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-m, --mtol <mtol> Tolerance to account for when merging columns
together. [default: 2]
-d, --debug Debug by visualizing textboxes.
"""
def plot_table_barchart(r, c, p, pno, tno):
row_idx = [i + 1 for i, row in enumerate(r)]
col_idx = [i + 1 for i, col in enumerate(c)]
r_index = np.arange(len(r))
c_index = np.arange(len(c))
width = 0.7
plt.figure(figsize=(8, 6))
plt.subplot(2, 1, 1)
plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
plt.xlabel('row index')
plt.ylabel('number of non-empty cells in row')
plt.bar(r_index, r)
plt.xticks(r_index + width * 0.5, row_idx)
plt.ylim(0, len(c))
plt.subplot(2, 1, 2)
plt.xlabel('column index')
plt.ylabel('number of non-empty cells in column')
plt.bar(c_index, c)
plt.xticks(c_index + width * 0.5, col_idx)
plt.ylim(0, len(r))
plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
def plot_all_barchart(data, output):
r_empty_cells = []
for page_number in data.keys():
page = data[page_number]
for table_number in page.keys():
table = page[table_number]
r_empty_cells.extend([r / float(table['ncols']) for r in table['r_nempty_cells']])
c = Counter(r_empty_cells)
if 0.0 not in c:
c.update({0.0: 0})
if 1.0 not in c:
c.update({1.0: 0})
plt.figure(figsize=(8, 6))
plt.xlabel('percentage of non-empty cells in a row')
plt.ylabel('percentage of rows processed')
row_p = [count / float(sum(c.values())) for count in c.values()]
plt.bar(c.keys(), row_p, align='center', width=0.05)
plt.ylim(0, 1.0)
plt.savefig(''.join([output, '_all.png']), dpi=300)
def plot_rc_piechart(data, output):
from matplotlib import cm
tables = 0
rows, cols = [], []
for page_number in data.keys():
page = data[page_number]
for table_number in page.keys():
table = page[table_number]
tables += 1
rows.append(table['nrows'])
cols.append(table['ncols'])
r = Counter(rows)
c = Counter(cols)
plt.figure(figsize=(8, 6))
cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
ax1 = plt.subplot(211, aspect='equal')
ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
ax1.set_title('row distribution across tables')
cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
ax2 = plt.subplot(212, aspect='equal')
ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
ax2.set_title('column distribution across tables')
plt.savefig(''.join([output, '_rc.png']), dpi=300)
def summary(data, p_time):
from operator import itemgetter
from itertools import groupby
scores = []
continuous_tables = []
total_tables = 0
for page_number in data.keys():
page = data[page_number]
total_tables += len(page.keys())
for table_number in page.keys():
table = page[table_number]
continuous_tables.append((page_number, table_number, table['ncols']))
scores.append(table['score'])
avg_score = np.mean(scores)
ct_pages = []
header_string = ""
if len(continuous_tables) > 1:
tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:])))
for k, g in groupby(tables, key=itemgetter(2)):
g = list(g)
tables_same_ncols = set([int(t[0][5:]) for t in g])
tables_same_ncols = sorted(list(tables_same_ncols))
for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x):
G = list(G)
ct_pages.append((str(G[0][1]), str(G[-1][1])))
result_headers = []
for ct in ct_pages:
header_idx = {}
possible_headers = []
ncols = 0
for page_number in range(int(ct[0]), int(ct[1]) + 1):
page = data['page-{0}'.format(page_number)]
for table_number in page.keys():
table = page[table_number]
ncols = table['ncols']
for i, row in enumerate(table['data']):
try:
header_idx[tuple(row)].append(i)
except KeyError:
header_idx[tuple(row)] = [i]
possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10]
possible_headers = filter(lambda z: len(z) == ncols,
[filter(lambda x: x != '', p_h) for p_h in possible_headers])
modes = []
for p_h in possible_headers:
try:
modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count)))
except KeyError:
pass
header = modes[modes.index(min(modes, key=lambda x: x[1]))][0]
result_headers.append(header)
header_string = "Multi-page table headers*:\n"
header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format(
'-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip(
ct_pages, result_headers)])])
avg_time = "Time taken per page: {0:.2f} seconds\n".format(
p_time / float(len(data))) if len(data) != 1 else ""
equal_ncols = "\nMulti-page tables on*: {0}\n".format(
', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) != 1 else ""
stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
"{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))
print(''.join([stat_string, header_string]))
def convert_to_html(table):
html = ''
html = ''.join([html, '<table border="1">\n'])
@@ -99,23 +256,23 @@ def write_to_disk(data, f='csv', output=None, filename=None):
if f in ['csv', 'tsv']:
import csv
delimiter = ',' if f == 'csv' else '\t'
for page in sorted(data):
for table in range(len(data[page])):
dsvname = '{0}_table_{1}.{2}'.format(page, table + 1, f)
for page_number in sorted(data.keys()):
for table_number in sorted(data[page_number].keys()):
dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f)
with open(os.path.join(output, dsvname), 'w') as outfile:
writer = csv.writer(
outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
for row in data[page][table]:
for row in data[page_number][table_number]['data']:
writer.writerow(row)
elif f == 'html':
htmlname = '{}.html'.format(froot)
for page in sorted(data):
for table in range(len(data[page])):
htmlname = '{0}.html'.format(froot)
for page_number in sorted(data.keys()):
for table_number in sorted(data[page_number].keys()):
with open(os.path.join(output, htmlname), 'a') as htmlfile:
htmlfile.write(convert_to_html(data[page][table]))
htmlfile.write(convert_to_html(data[page_number][table_number]['data']))
elif f == 'json':
import json
with open(os.path.join(output, '{}.json'.format(froot)), 'w') \
with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
as jsonfile:
json.dump(data, jsonfile)
elif f == 'xlsx':
@@ -123,12 +280,12 @@ def write_to_disk(data, f='csv', output=None, filename=None):
from pyexcel_xlsx import save_data
from collections import OrderedDict
xlsx_data = OrderedDict()
for page in sorted(data):
for table in range(len(data[page])):
sheet_name = '{0}_table_{1}'.format(page, table + 1)
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])):
sheet_name = ''.join([page_number, '_', table_number])
xlsx_data.update({sheet_name:
[row for row in data[page][table]]})
save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data)
[row for row in data[page_number][table_number]['data']]})
save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
except ImportError:
print("link to install docs")
@@ -147,16 +304,17 @@ if __name__ == '__main__':
filename = args['<file>']
filedir = os.path.dirname(args['<file>'])
logname, __ = os.path.splitext(filename)
logname += '.log'
logname = ''.join([logname, '.log'])
scorename, __ = os.path.splitext(filename)
scorename = ''.join([scorename, '_info.csv'])
pngname, __ = os.path.splitext(filename)
if args['--log']:
FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
if args['--output']:
logname = os.path.join(args['--output'], os.path.basename(logname))
logging.basicConfig(
filename=logname, filemode='w', level=logging.DEBUG)
else:
logging.basicConfig(
filename=logname, filemode='w', level=logging.DEBUG)
logging.basicConfig(
filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
p = []
if args['--pages'] == '1':
@@ -173,47 +331,142 @@ if __name__ == '__main__':
else:
p.append({'start': int(r), 'end': int(r)})
margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
float(args['--wmargin']))
if args['<method>'] == 'lattice':
try:
extractor = Lattice(Pdf(filename, pagenos=p, clean=True),
fill=args['--fill'],
scale=int(args['--scale']),
jtol=int(args['--jtol']),
mtol=int(args['--mtol']),
invert=args['--invert'],
debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables()
manager = Pdf(Lattice(
fill=args['--fill'],
scale=int(args['--scale']),
invert=args['--invert'],
jtol=int(args['--jtol']),
mtol=int(args['--mtol']),
pdf_margin=margin_tuple,
debug=args['--debug']),
filename,
pagenos=p,
parallel=args['--parallel'],
clean=True)
data = manager.extract()
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--summary']:
summary(data, processing_time)
if args['--save-info']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['line_p'],
table['text_p'],
table['score']))
if args['--debug']:
extractor.plot_geometry(args['--debug'])
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'stream':
try:
extractor = Stream(Pdf(filename, pagenos=p,
char_margin=float(args['--cmargin']),
line_margin=float(args['--lmargin']),
word_margin=float(args['--wmargin']),
clean=True),
ncolumns=int(args['--ncols']),
columns=args['--columns'],
ytol=int(args['--ytol']),
debug=args['--debug'],
verbose=args['--verbose'])
data = extractor.get_tables()
manager = Pdf(Stream(
ncolumns=int(args['--ncols']),
columns=args['--columns'],
ytol=int(args['--ytol']),
mtol=int(args['--mtol']),
pdf_margin=margin_tuple,
debug=args['--debug']),
filename,
pagenos=p,
parallel=args['--parallel'],
clean=True)
data = manager.extract()
processing_time = time.time() - start_time
vprint("Finished processing in", processing_time, "seconds")
logging.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--summary']:
summary(data, processing_time)
if args['--save-info']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['score']))
if args['--debug']:
extractor.plot_text()
manager.debug_plot()
except Exception as e:
logging.exception(e.message, exc_info=True)
sys.exit()
if data is None:
if args['--debug']:
print("See 'camelot <method> -h' for various parameters you can tweak.")
else:
output = filedir if args['--output'] is None else args['--output']
write_to_disk(data, f=args['--format'],
output=output, filename=filename)
vprint("finished in", time.time() - start_time, "seconds")
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")