Add various metrics to score the quality of a parse
pull/2/head
Vinayak Mehta 2016-08-30 14:52:49 +05:30 committed by GitHub
parent 43a009dab4
commit 552f9cf422
11 changed files with 1027 additions and 472 deletions


@@ -1,18 +1,31 @@
-from __future__ import print_function
+from __future__ import division
 import os
+import types
+import copy_reg
+import logging
 import cv2
 import numpy as np
+from wand.image import Image
 from .table import Table
 from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
-                    get_row_index, get_column_index, reduce_index, outline,
-                    fill_spanning, remove_empty, encode_list)
+                    get_row_index, get_column_index, get_score, reduce_index,
+                    outline, fill_spanning, count_empty, encode_list, pdf_to_text)

 __all__ = ['Lattice']


+def _reduce_method(m):
+    if m.im_self is None:
+        return getattr, (m.im_class, m.im_func.func_name)
+    else:
+        return getattr, (m.im_self, m.im_func.func_name)
+copy_reg.pickle(types.MethodType, _reduce_method)


 def _morph_transform(imagename, scale=15, invert=False):
     """Morphological Transformation
@@ -65,8 +78,8 @@ def _morph_transform(imagename, scale=15, invert=False):
     vertical = threshold
     horizontal = threshold
-    verticalsize = vertical.shape[0] / scale
-    horizontalsize = horizontal.shape[1] / scale
+    verticalsize = vertical.shape[0] // scale
+    horizontalsize = horizontal.shape[1] // scale
     ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
     hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
@@ -79,8 +92,12 @@ def _morph_transform(imagename, scale=15, invert=False):
     mask = vertical + horizontal
     joints = np.bitwise_and(vertical, horizontal)
-    __, contours, __ = cv2.findContours(
-        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        __, contours, __ = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        contours, __ = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

     tables = {}
@@ -88,8 +105,12 @@ def _morph_transform(imagename, scale=15, invert=False):
         c_poly = cv2.approxPolyDP(c, 3, True)
         x, y, w, h = cv2.boundingRect(c_poly)
         roi = joints[y : y + h, x : x + w]
-        __, jc, __ = cv2.findContours(
-            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        try:
+            __, jc, __ = cv2.findContours(
+                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        except ValueError:
+            jc, __ = cv2.findContours(
+                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
         if len(jc) <= 4:  # remove contours with less than <=4 joints
             continue
         joint_coords = []
@@ -100,16 +121,24 @@ def _morph_transform(imagename, scale=15, invert=False):
         tables[(x, y + h, x + w, y)] = joint_coords

     v_segments, h_segments = [], []
-    _, vcontours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        _, vcontours, _ = cv2.findContours(
+            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        vcontours, _ = cv2.findContours(
+            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     for vc in vcontours:
         x, y, w, h = cv2.boundingRect(vc)
         x1, x2 = x, x + w
         y1, y2 = y, y + h
         v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))

-    _, hcontours, _ = cv2.findContours(
-        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    try:
+        _, hcontours, _ = cv2.findContours(
+            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        hcontours, _ = cv2.findContours(
+            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     for hc in hcontours:
         x, y, w, h = cv2.boundingRect(hc)
         x1, x2 = x, x + w
@@ -160,24 +189,19 @@ class Lattice:
         page as value.
     """

-    def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2,
-                 invert=False, debug=None, verbose=False):
-        self.pdfobject = pdfobject
+    def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
+                 invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
+        self.method = 'lattice'
         self.fill = fill
         self.scale = scale
         self.jtol = jtol
         self.mtol = mtol
         self.invert = invert
+        self.char_margin, self.line_margin, self.word_margin = pdf_margin
         self.debug = debug
-        self.verbose = verbose
-        self.tables = {}
-        if self.debug is not None:
-            self.debug_images = {}
-            self.debug_segments = {}
-            self.debug_tables = {}

-    def get_tables(self):
+    def get_tables(self, pdfname):
         """Returns all tables found in given pdf.

         Returns
@@ -186,169 +210,124 @@ class Lattice:
         Dictionary with page number as key and list of tables on that
         page as value.
         """
-        vprint = print if self.verbose else lambda *a, **k: None
-        self.pdfobject.split()
-        self.pdfobject.convert()
-        for page in self.pdfobject.extract():
-            p, text, __, width, height = page
-            pkey = 'pg-{0}'.format(p)
-            imagename = os.path.join(
-                self.pdfobject.temp, '{}.png'.format(pkey))
-            pdf_x = width
-            pdf_y = height
-            img, table_bbox, v_segments, h_segments = _morph_transform(
-                imagename, scale=self.scale, invert=self.invert)
-            img_x = img.shape[1]
-            img_y = img.shape[0]
-            scaling_factor_x = pdf_x / float(img_x)
-            scaling_factor_y = pdf_y / float(img_y)
-
-            if self.debug is not None:
-                self.debug_images[pkey] = (img, table_bbox)
-
-            factors = (scaling_factor_x, scaling_factor_y, img_y)
-            table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
-                                                           h_segments, factors)
-
-            if self.debug is not None:
-                self.debug_segments[pkey] = (v_segments, h_segments)
-
-            if self.debug is not None:
-                debug_page_tables = []
-            page_tables = []
-            # sort tables based on y-coord
-            for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
-                # select edges which lie within table_bbox
-                text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
-                                                    h_segments)
-                rotated = detect_vertical(text_bbox)
-                cols, rows = zip(*table_bbox[k])
-                cols, rows = list(cols), list(rows)
-                cols.extend([k[0], k[2]])
-                rows.extend([k[1], k[3]])
-                # sort horizontal and vertical segments
-                cols = merge_close_values(sorted(cols), mtol=self.mtol)
-                rows = merge_close_values(
-                    sorted(rows, reverse=True), mtol=self.mtol)
-                # make grid using x and y coord of shortlisted rows and cols
-                cols = [(cols[i], cols[i + 1])
-                        for i in range(0, len(cols) - 1)]
-                rows = [(rows[i], rows[i + 1])
-                        for i in range(0, len(rows) - 1)]
-                table = Table(cols, rows)
-                # set table edges to True using ver+hor lines
-                table = table.set_edges(v_s, h_s, jtol=self.jtol)
-                # set spanning cells to True
-                table = table.set_spanning()
-                # set table border edges to True
-                table = outline(table)
-
-                if self.debug is not None:
-                    debug_page_tables.append(table)
-
-                # fill text after sorting it
-                if rotated == '':
-                    text_bbox.sort(key=lambda x: (-x.y0, x.x0))
-                elif rotated == 'left':
-                    text_bbox.sort(key=lambda x: (x.x0, x.y0))
-                elif rotated == 'right':
-                    text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
-                for t in text_bbox:
-                    r_idx = get_row_index(t, rows)
-                    c_idx = get_column_index(t, cols)
-                    if None in [r_idx, c_idx]:
-                        # couldn't assign LTChar to any cell
-                        pass
-                    else:
-                        r_idx, c_idx = reduce_index(
-                            table, rotated, r_idx, c_idx)
-                        table.cells[r_idx][c_idx].add_text(
-                            t.get_text().strip('\n'))
-                if self.fill is not None:
-                    table = fill_spanning(table, fill=self.fill)
-                ar = table.get_list()
-                if rotated == 'left':
-                    ar = zip(*ar[::-1])
-                elif rotated == 'right':
-                    ar = zip(*ar[::1])
-                    ar.reverse()
-                ar = remove_empty(ar)
-                ar = [list(o) for o in ar]
-                page_tables.append(encode_list(ar))
-            vprint(pkey)
-            self.tables[pkey] = page_tables
-
-            if self.debug is not None:
-                self.debug_tables[pkey] = debug_page_tables
-
-        if self.pdfobject.clean:
-            self.pdfobject.remove_tempdir()
-
-        if self.debug is not None:
-            return None
-
-        return self.tables
-
-    def plot_geometry(self, geometry):
-        """Plots various pdf geometries that are detected so user can
-        tweak scale, jtol, mtol parameters.
-        """
-        import matplotlib.pyplot as plt
-        if geometry == 'contour':
-            for pkey in self.debug_images.keys():
-                img, table_bbox = self.debug_images[pkey]
-                for t in table_bbox.keys():
-                    cv2.rectangle(img, (t[0], t[1]),
-                                  (t[2], t[3]), (255, 0, 0), 3)
-                plt.imshow(img)
-                plt.show()
-        elif geometry == 'joint':
-            x_coord = []
-            y_coord = []
-            for pkey in self.debug_images.keys():
-                img, table_bbox = self.debug_images[pkey]
-                for k in table_bbox.keys():
-                    for coord in table_bbox[k]:
-                        x_coord.append(coord[0])
-                        y_coord.append(coord[1])
-                max_x, max_y = max(x_coord), max(y_coord)
-                plt.plot(x_coord, y_coord, 'ro')
-                plt.axis([0, max_x + 100, max_y + 100, 0])
-                plt.imshow(img)
-                plt.show()
-        elif geometry == 'line':
-            for pkey in self.debug_segments.keys():
-                v_s, h_s = self.debug_segments[pkey]
-                for v in v_s:
-                    plt.plot([v[0], v[2]], [v[1], v[3]])
-                for h in h_s:
-                    plt.plot([h[0], h[2]], [h[1], h[3]])
-                plt.show()
-        elif geometry == 'table':
-            for pkey in self.debug_tables.keys():
-                for table in self.debug_tables[pkey]:
-                    for i in range(len(table.cells)):
-                        for j in range(len(table.cells[i])):
-                            if table.cells[i][j].left:
-                                plt.plot([table.cells[i][j].lb[0],
-                                          table.cells[i][j].lt[0]],
-                                         [table.cells[i][j].lb[1],
-                                          table.cells[i][j].lt[1]])
-                            if table.cells[i][j].right:
-                                plt.plot([table.cells[i][j].rb[0],
-                                          table.cells[i][j].rt[0]],
-                                         [table.cells[i][j].rb[1],
-                                          table.cells[i][j].rt[1]])
-                            if table.cells[i][j].top:
-                                plt.plot([table.cells[i][j].lt[0],
-                                          table.cells[i][j].rt[0]],
-                                         [table.cells[i][j].lt[1],
-                                          table.cells[i][j].rt[1]])
-                            if table.cells[i][j].bottom:
-                                plt.plot([table.cells[i][j].lb[0],
-                                          table.cells[i][j].rb[0]],
-                                         [table.cells[i][j].lb[1],
-                                          table.cells[i][j].rb[1]])
-            plt.show()
+        text, __, width, height = pdf_to_text(pdfname, self.char_margin,
+                                              self.line_margin, self.word_margin)
+        bname, __ = os.path.splitext(pdfname)
+        if not text:
+            logging.warning("{0}: PDF has no text. It may be an image.".format(
+                os.path.basename(bname)))
+            return None
+        imagename = ''.join([bname, '.png'])
+        with Image(filename=pdfname, depth=8, resolution=300) as png:
+            png.save(filename=imagename)
+        pdf_x = width
+        pdf_y = height
+        img, table_bbox, v_segments, h_segments = _morph_transform(
+            imagename, scale=self.scale, invert=self.invert)
+        img_x = img.shape[1]
+        img_y = img.shape[0]
+        scaling_factor_x = pdf_x / float(img_x)
+        scaling_factor_y = pdf_y / float(img_y)
+
+        if self.debug:
+            self.debug_images = (img, table_bbox)
+
+        factors = (scaling_factor_x, scaling_factor_y, img_y)
+        table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
+                                                       h_segments, factors)
+
+        if self.debug:
+            self.debug_segments = (v_segments, h_segments)
+            self.debug_tables = []
+
+        pdf_page = {}
+        page_tables = {}
+        table_no = 1
+        # sort tables based on y-coord
+        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
+            # select edges which lie within table_bbox
+            table_info = {}
+            text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
+                                                h_segments)
+            table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text)))
+            rotated = detect_vertical(text_bbox)
+            cols, rows = zip(*table_bbox[k])
+            cols, rows = list(cols), list(rows)
+            cols.extend([k[0], k[2]])
+            rows.extend([k[1], k[3]])
+            # sort horizontal and vertical segments
+            cols = merge_close_values(sorted(cols), mtol=self.mtol)
+            rows = merge_close_values(
+                sorted(rows, reverse=True), mtol=self.mtol)
+            # make grid using x and y coord of shortlisted rows and cols
+            cols = [(cols[i], cols[i + 1])
+                    for i in range(0, len(cols) - 1)]
+            rows = [(rows[i], rows[i + 1])
+                    for i in range(0, len(rows) - 1)]
+            table = Table(cols, rows)
+            # set table edges to True using ver+hor lines
+            table = table.set_edges(v_s, h_s, jtol=self.jtol)
+            nouse = table.nocont_ / (len(v_s) + len(h_s))
+            table_info['line_p'] = 100 * (1 - nouse)
+            # set spanning cells to True
+            table = table.set_spanning()
+            # set table border edges to True
+            table = outline(table)
+
+            if self.debug:
+                self.debug_tables.append(table)
+
+            # fill text after sorting it
+            if rotated == '':
+                text_bbox.sort(key=lambda x: (-x.y0, x.x0))
+            elif rotated == 'left':
+                text_bbox.sort(key=lambda x: (x.x0, x.y0))
+            elif rotated == 'right':
+                text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
+
+            rerror = []
+            cerror = []
+            for t in text_bbox:
+                try:
+                    r_idx, rass_error = get_row_index(t, rows)
+                except TypeError:
+                    # couldn't assign LTChar to any cell
+                    continue
+                try:
+                    c_idx, cass_error = get_column_index(t, cols)
+                except TypeError:
+                    # couldn't assign LTChar to any cell
+                    continue
+                rerror.append(rass_error)
+                cerror.append(cass_error)
+                r_idx, c_idx = reduce_index(
+                    table, rotated, r_idx, c_idx)
+                table.cells[r_idx][c_idx].add_text(
+                    t.get_text().strip('\n'))
+            score = get_score([[50, rerror], [50, cerror]])
+            table_info['score'] = score
+
+            if self.fill is not None:
+                table = fill_spanning(table, fill=self.fill)
+            ar = table.get_list()
+            if rotated == 'left':
+                ar = zip(*ar[::-1])
+            elif rotated == 'right':
+                ar = zip(*ar[::1])
+                ar.reverse()
+            ar = encode_list(ar)
+            table_info['data'] = ar
+            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+            table_info['empty_p'] = empty_p
+            table_info['r_nempty_cells'] = r_nempty_cells
+            table_info['c_nempty_cells'] = c_nempty_cells
+            table_info['nrows'] = len(ar)
+            table_info['ncols'] = len(ar[0])
+            page_tables['table_{0}'.format(table_no)] = table_info
+            table_no += 1
+        pdf_page[os.path.basename(bname)] = page_tables

+        if self.debug:
+            return None
+
+        return pdf_page
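
A note on the `_reduce_method`/`copy_reg` block this commit adds at the top of the module: Python 2's pickle cannot serialize bound methods, and `Pdf.extract()` (below) hands `self.extractor.get_tables` to `multiprocessing.Pool.map`, which pickles its callable. Registering a reducer for `types.MethodType` works around that. A minimal sketch of the same idea in Python 3 terms, where the module is named `copyreg` and the old `im_self`/`im_func` attributes are `__self__`/`__func__` (Python 3 pickles bound methods natively, so this is purely illustrative):

    import types
    import copyreg  # named copy_reg in the Python 2 code above

    def _reduce_method(m):
        # on unpickling, rebuild the bound method via getattr(instance, name)
        return getattr, (m.__self__, m.__func__.__name__)

    copyreg.pickle(types.MethodType, _reduce_method)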


@@ -1,18 +1,11 @@
 import os
 import shutil
 import tempfile
+import itertools
+import multiprocessing as mp
+import cv2
 from PyPDF2 import PdfFileReader, PdfFileWriter
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfpage import PDFTextExtractionNotAllowed
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.pdfdevice import PDFDevice
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
-from wand.image import Image

 __all__ = ['Pdf']
@@ -38,38 +31,6 @@ def _parse_page_numbers(pagenos):
     return page_numbers


-def _extract_text_objects(layout, LTObject, t=None):
-    """Recursively parses pdf layout to get a list of
-    text objects.
-
-    Parameters
-    ----------
-    layout : object
-        Layout object.
-
-    LTObject : object
-        Text object, either LTChar or LTTextLineHorizontal.
-
-    t : list (optional, default: None)
-
-    Returns
-    -------
-    t : list
-        List of text objects.
-    """
-    if t is None:
-        t = []
-    try:
-        for obj in layout._objs:
-            if isinstance(obj, LTObject):
-                t.append(obj)
-            else:
-                t += _extract_text_objects(obj, LTObject)
-    except AttributeError:
-        pass
-    return t
-
-
 class Pdf:
     """Handles all pdf operations which include:
@@ -99,66 +60,163 @@ class Pdf:
         is greater than word_margin. (optional, default: 0.1)
     """

-    def __init__(self, pdfname, pagenos=[{'start': 1, 'end': 1}],
-                 char_margin=2.0, line_margin=0.5, word_margin=0.1,
-                 clean=False):
+    def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}],
+                 parallel=False, clean=False):
+        self.extractor = extractor
         self.pdfname = pdfname
-        if not self.pdfname.endswith('.pdf'):
-            raise TypeError("Only PDF format is supported right now.")
         self.pagenos = _parse_page_numbers(pagenos)
-        self.char_margin = char_margin
-        self.line_margin = line_margin
-        self.word_margin = word_margin
+        self.parallel = parallel
+        self.cpu_count = mp.cpu_count()
+        self.pool = mp.Pool(processes=self.cpu_count)
         self.clean = clean
         self.temp = tempfile.mkdtemp()

     def split(self):
         """Splits pdf into single page pdfs.
         """
+        if not self.pdfname.endswith('.pdf'):
+            raise TypeError("Only PDF format is supported.")
         infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False)
         for p in self.pagenos:
             page = infile.getPage(p - 1)
             outfile = PdfFileWriter()
             outfile.addPage(page)
-            with open(os.path.join(self.temp, 'pg-{0}.pdf'.format(p)), 'wb') as f:
+            with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f:
                 outfile.write(f)

+    def remove_tempdir(self):
+        shutil.rmtree(self.temp)
+
     def extract(self):
         """Extracts text objects, width, height from a pdf.
         """
-        for p in self.pagenos:
-            pkey = 'pg-{0}'.format(p)
-            pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
-            with open(pname, 'r') as f:
-                parser = PDFParser(f)
-                document = PDFDocument(parser)
-                if not document.is_extractable:
-                    raise PDFTextExtractionNotAllowed
-                laparams = LAParams(char_margin=self.char_margin,
-                                    line_margin=self.line_margin,
-                                    word_margin=self.word_margin)
-                rsrcmgr = PDFResourceManager()
-                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-                interpreter = PDFPageInterpreter(rsrcmgr, device)
-                for page in PDFPage.create_pages(document):
-                    interpreter.process_page(page)
-                    layout = device.get_result()
-                    lattice_objects = _extract_text_objects(layout, LTChar)
-                    stream_objects = _extract_text_objects(
-                        layout, LTTextLineHorizontal)
-                    width = layout.bbox[2]
-                    height = layout.bbox[3]
-                    yield p, lattice_objects, stream_objects, width, height
+        self.split()
+        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
+                 for p in self.pagenos]
+        if self.parallel:
+            tables = self.pool.map(self.extractor.get_tables, pages)
+            tables = {k: v for d in tables if d is not None for k, v in d.items()}
+        else:
+            tables = {}
+            if self.extractor.debug:
+                if self.extractor.method == 'stream':
+                    self.debug = self.extractor.debug
+                    self.debug_text = []
+                elif self.extractor.method == 'lattice':
+                    self.debug = self.extractor.debug
+                    self.debug_images = []
+                    self.debug_segments = []
+                    self.debug_tables = []
+            for p in pages:
+                table = self.extractor.get_tables(p)
+                if table is not None:
+                    tables.update(table)
+                if self.extractor.debug:
+                    if self.extractor.method == 'stream':
+                        self.debug_text.append(self.extractor.debug_text)
+                    elif self.extractor.method == 'lattice':
+                        self.debug_images.append(self.extractor.debug_images)
+                        self.debug_segments.append(self.extractor.debug_segments)
+                        self.debug_tables.append(self.extractor.debug_tables)
+        if self.clean:
+            self.remove_tempdir()
+        return tables

-    def convert(self):
-        """Converts single page pdfs to images.
+    def debug_plot(self):
+        """Plots all text objects and various pdf geometries so that
+        user can choose number of columns, columns x-coordinates for
+        Stream or tweak Lattice parameters (scale, jtol, mtol).
         """
-        for p in self.pagenos:
-            pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p))
-            imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p))
-            with Image(filename=pdfname, depth=8, resolution=300) as png:
-                png.save(filename=imagename)
-
-    def remove_tempdir(self):
-        shutil.rmtree(self.temp)
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+
+        if self.debug is True:
+            try:
+                for text in self.debug_text:
+                    fig = plt.figure()
+                    ax = fig.add_subplot(111, aspect='equal')
+                    xs, ys = [], []
+                    for t in text:
+                        xs.extend([t[0], t[1]])
+                        ys.extend([t[2], t[3]])
+                        ax.add_patch(
+                            patches.Rectangle(
+                                (t[0], t[1]),
+                                t[2] - t[0],
+                                t[3] - t[1]
+                            )
+                        )
+                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
+                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Stream.")
+        elif self.debug == 'contour':
+            try:
+                for img, table_bbox in self.debug_images:
+                    for t in table_bbox.keys():
+                        cv2.rectangle(img, (t[0], t[1]),
+                                      (t[2], t[3]), (255, 0, 0), 3)
+                    plt.imshow(img)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        elif self.debug == 'joint':
+            try:
+                for img, table_bbox in self.debug_images:
+                    x_coord = []
+                    y_coord = []
+                    for k in table_bbox.keys():
+                        for coord in table_bbox[k]:
+                            x_coord.append(coord[0])
+                            y_coord.append(coord[1])
+                    max_x, max_y = max(x_coord), max(y_coord)
+                    plt.plot(x_coord, y_coord, 'ro')
+                    plt.axis([0, max_x + 100, max_y + 100, 0])
+                    plt.imshow(img)
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        elif self.debug == 'line':
+            try:
+                for v_s, h_s in self.debug_segments:
+                    for v in v_s:
+                        plt.plot([v[0], v[2]], [v[1], v[3]])
+                    for h in h_s:
+                        plt.plot([h[0], h[2]], [h[1], h[3]])
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        elif self.debug == 'table':
+            try:
+                for tables in self.debug_tables:
+                    for table in tables:
+                        for i in range(len(table.cells)):
+                            for j in range(len(table.cells[i])):
+                                if table.cells[i][j].left:
+                                    plt.plot([table.cells[i][j].lb[0],
+                                              table.cells[i][j].lt[0]],
+                                             [table.cells[i][j].lb[1],
+                                              table.cells[i][j].lt[1]])
+                                if table.cells[i][j].right:
+                                    plt.plot([table.cells[i][j].rb[0],
+                                              table.cells[i][j].rt[0]],
+                                             [table.cells[i][j].rb[1],
+                                              table.cells[i][j].rt[1]])
+                                if table.cells[i][j].top:
+                                    plt.plot([table.cells[i][j].lt[0],
+                                              table.cells[i][j].rt[0]],
+                                             [table.cells[i][j].lt[1],
+                                              table.cells[i][j].rt[1]])
+                                if table.cells[i][j].bottom:
+                                    plt.plot([table.cells[i][j].lb[0],
+                                              table.cells[i][j].rb[0]],
+                                             [table.cells[i][j].lb[1],
+                                              table.cells[i][j].rb[1]])
+                    plt.show()
+            except AttributeError:
+                raise ValueError("This option only be used with Lattice.")
+        else:
+            raise UserWarning("This method can only be called after"
+                              " debug has been specified.")


@@ -1,14 +1,26 @@
-from __future__ import print_function
+from __future__ import division
 import os
+import types
+import copy_reg
+import logging
 import numpy as np
-from .utils import get_column_index, encode_list
+from .table import Table
+from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text

 __all__ = ['Stream']


+def _reduce_method(m):
+    if m.im_self is None:
+        return getattr, (m.im_class, m.im_func.func_name)
+    else:
+        return getattr, (m.im_self, m.im_func.func_name)
+copy_reg.pickle(types.MethodType, _reduce_method)


 def _group_rows(text, ytol=2):
     """Groups text objects into rows using ytol.
@@ -35,14 +47,16 @@ def _group_rows(text, ytol=2):
         #                 type(obj) is LTChar]):
         if t.get_text().strip():
             if not np.isclose(row_y, t.y0, atol=ytol):
-                row_y = t.y0
-                rows.append(temp)
+                rows.append(sorted(temp, key=lambda t: t.x0))
                 temp = []
+                row_y = t.y0
             temp.append(t)
+    rows.append(sorted(temp, key=lambda t: t.x0))
+    __ = rows.pop(0)  # hacky
     return rows


-def _merge_columns(l):
+def _merge_columns(l, mtol=2):
     """Merges overlapping columns and returns list with updated
     columns boundaries.
@@ -62,7 +76,8 @@ def _merge_columns(l):
             merged.append(higher)
         else:
             lower = merged[-1]
-            if higher[0] <= lower[1]:
+            if (higher[0] <= lower[1] or
+                    np.isclose(higher[0], lower[1], atol=mtol)):
                 upper_bound = max(lower[1], higher[1])
                 lower_bound = min(lower[0], higher[0])
                 merged[-1] = (lower_bound, upper_bound)
@@ -71,6 +86,62 @@
     return merged


+def _get_column_index(t, columns):
+    """Gets index of the column in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    columns : list
+
+    Returns
+    -------
+    c : int
+    """
+    offset1, offset2 = 0, 0
+    lt_col_overlap = []
+    for c in columns:
+        if c[0] <= t.x1 and c[1] >= t.x0:
+            left = t.x0 if c[0] <= t.x0 else c[0]
+            right = t.x1 if c[1] >= t.x1 else c[1]
+            lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1]))
+        else:
+            lt_col_overlap.append(-1)
+    if len(filter(lambda x: x != -1, lt_col_overlap)) == 0:
+        logging.warning("Text doesn't fit any column.")
+    c_idx = lt_col_overlap.index(max(lt_col_overlap))
+    if t.x0 < columns[c_idx][0]:
+        offset1 = abs(t.x0 - columns[c_idx][0])
+    if t.x1 > columns[c_idx][1]:
+        offset2 = abs(t.x1 - columns[c_idx][1])
+    Y = abs(t.y0 - t.y1)
+    charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1)
+    error = (Y * (offset1 + offset2)) / charea
+    return c_idx, error
+
+
+def _add_columns(cols, text, ytolerance):
+    if text:
+        text = _group_rows(text, ytol=ytolerance)
+        elements = [len(r) for r in text]
+        new_cols = [(t.x0, t.x1)
+                    for r in text if len(r) == max(elements) for t in r]
+        cols.extend(_merge_columns(sorted(new_cols)))
+    return cols
+
+
+def _join_columns(cols, width):
+    cols = sorted(cols)
+    cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
+    cols.insert(0, 0)
+    cols.append(width)  # or some tolerance
+    cols = [(cols[i], cols[i + 1])
+            for i in range(0, len(cols) - 1)]
+    return cols


 class Stream:
     """Stream algorithm
@@ -105,20 +176,18 @@ class Stream:
         page as value.
     """

-    def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2,
-                 debug=False, verbose=False):
-        self.pdfobject = pdfobject
+    def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
+                 pdf_margin=(2.0, 0.5, 0.1), debug=False):
+        self.method = 'stream'
         self.ncolumns = ncolumns
         self.columns = columns
         self.ytol = ytol
+        self.mtol = mtol
+        self.char_margin, self.line_margin, self.word_margin = pdf_margin
         self.debug = debug
-        self.verbose = verbose
-        self.tables = {}
-        if self.debug:
-            self.debug_text = {}

-    def get_tables(self):
+    def get_tables(self, pdfname):
         """Returns all tables found in given pdf.

         Returns
@@ -127,86 +196,112 @@ class Stream:
         Dictionary with page number as key and list of tables on that
         page as value.
         """
-        vprint = print if self.verbose else lambda *a, **k: None
-        self.pdfobject.split()
-        for page in self.pdfobject.extract():
-            p, __, text, __, __ = page
-            pkey = 'pg-{0}'.format(p)
-            text.sort(key=lambda x: (-x.y0, x.x0))
-
-            if self.debug:
-                self.debug_text[pkey] = text
-
-            rows = _group_rows(text, ytol=self.ytol)
-            elements = [len(r) for r in rows]
-            # a table can't have just 1 column, can it?
-            elements = filter(lambda x: x != 1, elements)
-
-            guess = False
-            if self.columns:
-                cols = self.columns.split(',')
-                cols = [(float(cols[i]), float(cols[i + 1]))
-                        for i in range(0, len(cols) - 1)]
-            else:
-                guess = True
-                ncols = self.ncolumns if self.ncolumns else max(
-                    set(elements), key=elements.count)
-                if ncols == 0:
-                    # no tables detected
-                    continue
-                cols = [(t.x0, t.x1)
-                        for r in rows for t in r if len(r) == ncols]
-                cols = _merge_columns(sorted(cols))
-                cols = [(c[0] + c[1]) / 2.0 for c in cols]
-
-            ar = [['' for c in cols] for r in rows]
-            for r_idx, r in enumerate(rows):
-                for t in r:
-                    if guess:
-                        cog = (t.x0 + t.x1) / 2.0
-                        diff = [abs(cog - c) for c in cols]
-                        c_idx = diff.index(min(diff))
-                    else:
-                        c_idx = get_column_index(t, cols)
-                    if None in [r_idx, c_idx]:  # couldn't assign LTTextLH to any cell
-                        continue
-                    if ar[r_idx][c_idx]:
-                        ar[r_idx][c_idx] = ' '.join(
-                            [ar[r_idx][c_idx], t.get_text().strip()])
-                    else:
-                        ar[r_idx][c_idx] = t.get_text().strip()
-            vprint(pkey)
-            self.tables[pkey] = [encode_list(ar)]
-
-        if self.pdfobject.clean:
-            self.pdfobject.remove_tempdir()
-
-        if self.debug:
-            return None
-
-        return self.tables
-
-    def plot_text(self):
-        """Plots all text objects so user can choose number of columns
-        or columns x-coordinates using the matplotlib interface.
-        """
-        import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
-
-        for pkey in sorted(self.debug_text.keys()):
-            fig = plt.figure()
-            ax = fig.add_subplot(111, aspect='equal')
-            xs, ys = [], []
-            for t in self.debug_text[pkey]:
-                xs.extend([t.x0, t.x1])
-                ys.extend([t.y0, t.y1])
-                ax.add_patch(
-                    patches.Rectangle(
-                        (t.x0, t.y0),
-                        t.x1 - t.x0,
-                        t.y1 - t.y0
-                    )
-                )
-            ax.set_xlim(min(xs) - 10, max(xs) + 10)
-            ax.set_ylim(min(ys) - 10, max(ys) + 10)
-            plt.show()
+        __, text, width, height = pdf_to_text(pdfname, self.char_margin,
+                                              self.line_margin, self.word_margin)
+        bname, __ = os.path.splitext(pdfname)
+        if not text:
+            logging.warning("{0}: PDF has no text. It may be an image.".format(
+                os.path.basename(bname)))
+            return None
+        text.sort(key=lambda x: (-x.y0, x.x0))
+
+        if self.debug:
+            self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
+            return None
+
+        rows_grouped = _group_rows(text, ytol=self.ytol)
+        elements = [len(r) for r in rows_grouped]
+        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
+                    if len(r) > 0 else 0 for r in rows_grouped]
+        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
+        rows.insert(0, height)  # or some tolerance
+        rows.append(0)
+        rows = [(rows[i], rows[i + 1])
+                for i in range(0, len(rows) - 1)]
+
+        guess = False
+        if self.columns:
+            # user has to input boundary columns too
+            # take (0, width) by default
+            # similar to else condition
+            # len can't be 1
+            cols = self.columns.split(',')
+            cols = [(float(cols[i]), float(cols[i + 1]))
+                    for i in range(0, len(cols) - 1)]
+        else:
+            if self.ncolumns:
+                ncols = self.ncolumns
+                cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                cols = _merge_columns(sorted(cols), mtol=self.mtol)
+                if len(cols) != self.ncolumns:
+                    logging.warning("{}: The number of columns after merge"
+                                    " isn't the same as what you specified."
+                                    " Change the value of mtol.".format(
+                                    os.path.basename(bname)))
+                cols = _join_columns(cols, width)
+            else:
+                guess = True
+                ncols = max(set(elements), key=elements.count)
+                len_non_mode = len(filter(lambda x: x != ncols, elements))
+                if ncols == 1 and not self.debug:
+                    # no tables detected
+                    logging.warning("{}: Only one column was detected, the PDF"
+                                    " may have no tables. Specify ncols if"
+                                    " the PDF has tables.".format(
+                                    os.path.basename(bname)))
+                cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                cols = _merge_columns(sorted(cols), mtol=self.mtol)
+                inner_text = []
+                for i in range(1, len(cols)):
+                    left = cols[i - 1][1]
+                    right = cols[i][0]
+                    inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
+                outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
+                inner_text.extend(outer_text)
+                cols = _add_columns(cols, inner_text, self.ytol)
+                cols = _join_columns(cols, width)
+
+        pdf_page = {}
+        page_tables = {}
+        table_info = {}
+        table = Table(cols, rows)
+        rerror = []
+        cerror = []
+        for row in rows_grouped:
+            for t in row:
+                try:
+                    r_idx, rass_error = get_row_index(t, rows)
+                except ValueError as e:
+                    # couldn't assign LTTextLH to any cell
+                    vprint(e.message)
+                    continue
+                try:
+                    c_idx, cass_error = _get_column_index(t, cols)
+                except ValueError as e:
+                    # couldn't assign LTTextLH to any cell
+                    vprint(e.message)
+                    continue
+                rerror.append(rass_error)
+                cerror.append(cass_error)
+                table.cells[r_idx][c_idx].add_text(
+                    t.get_text().strip('\n'))
+        if guess:
+            score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+        else:
+            score = get_score([[50, rerror], [50, cerror]])
+        table_info['score'] = score
+        ar = table.get_list()
+        ar = encode_list(ar)
+        table_info['data'] = ar
+        empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+        table_info['empty_p'] = empty_p
+        table_info['r_nempty_cells'] = r_nempty_cells
+        table_info['c_nempty_cells'] = c_nempty_cells
+        table_info['nrows'] = len(ar)
+        table_info['ncols'] = len(ar[0])
+        page_tables['table_1'] = table_info
+        pdf_page[os.path.basename(bname)] = page_tables
+
+        return pdf_page
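
To make the new `_get_column_index` arithmetic concrete: it picks the column with the largest horizontal overlap, then reports the fraction of the text box that hangs outside that column as the assignment error. A rough rework of that calculation with a plain namespace standing in for a pdfminer text object (all coordinates invented):

    from types import SimpleNamespace

    columns = [(0, 100), (100, 200)]
    t = SimpleNamespace(x0=90, x1=120, y0=0, y1=10)  # straddles the boundary at 100

    overlap = []  # overlap with each column, as a fraction of the column width
    for c in columns:
        if c[0] <= t.x1 and c[1] >= t.x0:
            overlap.append((min(t.x1, c[1]) - max(t.x0, c[0])) / (c[1] - c[0]))
        else:
            overlap.append(-1)
    c_idx = overlap.index(max(overlap))  # 1, since 20/100 beats 10/100
    offset = max(0, columns[c_idx][0] - t.x0) + max(0, t.x1 - columns[c_idx][1])
    height = abs(t.y0 - t.y1)
    charea = abs(t.x0 - t.x1) * height
    error = (height * offset) / charea
    print(c_idx, error)  # 1 0.333... -> a third of the box lies outside column 1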


@@ -26,6 +26,7 @@ class Table:
         self.rows = rows
         self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in cols] for r in rows]
+        self.nocont_ = 0

     def set_edges(self, vertical, horizontal, jtol=2):
         """Sets cell edges to True if corresponding line segments
@@ -53,6 +54,7 @@
             k = [k for k, t in enumerate(self.rows)
                  if np.isclose(v[1], t[0], atol=jtol)]
             if not j:
+                self.nocont_ += 1
                 continue
             J = j[0]
             if i == [0]:  # only left edge
@@ -104,6 +106,7 @@
             k = [k for k, t in enumerate(self.cols)
                  if np.isclose(h[2], t[0], atol=jtol)]
             if not j:
+                self.nocont_ += 1
                 continue
             J = j[0]
             if i == [0]:  # only top edge

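The `nocont_` counter added here feeds the new `line_p` metric in the Lattice changes above: every detected line segment that fails to join any cell edge bumps the counter, and the metric is `100 * (1 - nocont_ / (len(v_s) + len(h_s)))`. Back-of-envelope, with invented numbers:

    nocont = 2        # segments that joined no cell edge (Table.nocont_)
    n_segments = 20   # len(v_s) + len(h_s)
    line_p = 100 * (1 - nocont / n_segments)
    print(line_p)     # 90.0 -> 90% of the detected lines were actually used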

@@ -1,5 +1,18 @@
+from __future__ import division
+import os
 import numpy as np
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal


 def translate(x1, x2):
     """Translates x2 by x1.
@@ -243,15 +256,24 @@ def get_row_index(t, rows):
     ----------
     t : object

-    rows : list
+    rows : list, sorted in decreasing order

     Returns
    -------
     r : int
     """
+    offset1, offset2 = 0, 0
     for r in range(len(rows)):
         if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
-            return r
+            if t.y0 > rows[r][0]:
+                offset1 = abs(t.y0 - rows[r][0])
+            if t.y1 < rows[r][1]:
+                offset2 = abs(t.y1 - rows[r][1])
+            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+            charea = X * Y
+            error = (X * (offset1 + offset2)) / charea
+            return r, error


 def get_column_index(t, columns):
@@ -268,9 +290,45 @@ def get_column_index(t, columns):
     -------
     c : int
     """
+    offset1, offset2 = 0, 0
     for c in range(len(columns)):
         if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
-            return c
+            if t.x0 < columns[c][0]:
+                offset1 = abs(t.x0 - columns[c][0])
+            if t.x1 > columns[c][1]:
+                offset2 = abs(t.x1 - columns[c][1])
+            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
+            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
+            charea = X * Y
+            error = (Y * (offset1 + offset2)) / charea
+            return c, error
+
+
+def get_score(error_weights):
+    """Calculates score based on weights assigned to various parameters,
+    and their error percentages.
+
+    Parameters
+    ----------
+    error_weights : dict
+        Dict with a tuple of error percentages as key and weightage
+        assigned to them as value. Sum of all values should be equal
+        to 100.
+
+    Returns
+    -------
+    score : float
+    """
+    SCORE_VAL = 100
+    score = 0
+    if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
+        raise ValueError("Please assign a valid weightage to each parameter"
+                         " such that their sum is equal to 100")
+    for ew in error_weights:
+        weight = ew[0] / len(ew[1])
+        for error_percentage in ew[1]:
+            score += weight * (1 - error_percentage)
+    return score


 def reduce_index(t, rotated, r_idx, c_idx):
@@ -394,6 +452,110 @@ def remove_empty(d):
     return d


+def count_empty(d):
+    """Counts empty rows and columns from list of lists.
+
+    Parameters
+    ----------
+    d : list
+
+    Returns
+    -------
+    n_empty_rows : number of empty rows
+    n_empty_cols : number of empty columns
+    empty_p : percentage of empty cells
+    """
+    empty_p = 0
+    r_nempty_cells, c_nempty_cells = [], []
+    for i in d:
+        for j in i:
+            if j.strip() == '':
+                empty_p += 1
+    empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
+    for row in d:
+        r_nempty_c = 0
+        for r in row:
+            if r.strip() != '':
+                r_nempty_c += 1
+        r_nempty_cells.append(r_nempty_c)
+    d = zip(*d)
+    d = [list(col) for col in d]
+    for col in d:
+        c_nempty_c = 0
+        for c in col:
+            if c.strip() != '':
+                c_nempty_c += 1
+        c_nempty_cells.append(c_nempty_c)
+    return empty_p, r_nempty_cells, c_nempty_cells
+
+
 def encode_list(ar):
+    """Encodes list of text.
+
+    Parameters
+    ----------
+    ar : list
+
+    Returns
+    -------
+    ar : list
+    """
     ar = [[r.encode('utf-8') for r in row] for row in ar]
     return ar
+
+
+def extract_text_objects(layout, LTObject, t=None):
+    """Recursively parses pdf layout to get a list of
+    text objects.
+
+    Parameters
+    ----------
+    layout : object
+        Layout object.
+
+    LTObject : object
+        Text object, either LTChar or LTTextLineHorizontal.
+
+    t : list (optional, default: None)
+
+    Returns
+    -------
+    t : list
+        List of text objects.
+    """
+    if t is None:
+        t = []
+    try:
+        for obj in layout._objs:
+            if isinstance(obj, LTObject):
+                t.append(obj)
+            else:
+                t += extract_text_objects(obj, LTObject)
+    except AttributeError:
+        pass
+    return t
+
+
+def pdf_to_text(pname, char_margin, line_margin, word_margin):
+    # pkey = 'page-{0}'.format(p)
+    # pname = os.path.join(self.temp, '{}.pdf'.format(pkey))
+    with open(pname, 'r') as f:
+        parser = PDFParser(f)
+        document = PDFDocument(parser)
+        if not document.is_extractable:
+            raise PDFTextExtractionNotAllowed
+        laparams = LAParams(char_margin=char_margin,
+                            line_margin=line_margin,
+                            word_margin=word_margin)
+        rsrcmgr = PDFResourceManager()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        for page in PDFPage.create_pages(document):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            lattice_objects = extract_text_objects(layout, LTChar)
+            stream_objects = extract_text_objects(
+                layout, LTTextLineHorizontal)
+            width = layout.bbox[2]
+            height = layout.bbox[3]
+    return lattice_objects, stream_objects, width, height
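
To see how the new `get_score` combines weighted error lists (a perfect parse, all errors zero, scores 100), here is a condensed restatement with invented row- and column-assignment errors; the `[[50, rerror], [50, cerror]]` shape matches how Lattice calls it:

    def get_score(error_weights):
        if sum(ew[0] for ew in error_weights) != 100:
            raise ValueError("weights must sum to 100")
        score = 0
        for weight, errors in error_weights:
            for error in errors:
                score += (weight / len(errors)) * (1 - error)
        return score

    rerror = [0.0, 0.1, 0.2]     # invented row-assignment error fractions
    cerror = [0.05, 0.05, 0.05]  # invented column-assignment error fractions
    print(get_score([[50, rerror], [50, cerror]]))  # 92.5 out of 100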

View File

@@ -39,7 +39,7 @@ Usage

    >>> extractor = Lattice(Pdf('us-030.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""


@@ -65,7 +65,7 @@ Finally, the characters found on the page are assigned to cells based on their x

    >>> extractor = Lattice(Pdf('us-030.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""

@@ -114,7 +114,7 @@ In the PDF used above, you can see that some cells spanned a lot of rows, `fill`

    >>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40)
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "Plan Type","County","Plan Name","Totals"

@@ -173,7 +173,7 @@ To find line segments, Lattice needs the lines of the PDF to be in foreground. S

    >>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True)
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::
     :header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV"


@@ -17,7 +17,7 @@ Let's run it on this PDF.

    >>> extractor = Stream(Pdf('eu-027.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. .. _this: insert link for eu-027.pdf

@@ -68,7 +68,7 @@ But sometimes its guess could be incorrect, like in this case.

    >>> extractor = Stream(Pdf('missing_values.pdf'))
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. .. _this: insert link for missing_values.pdf

@@ -127,7 +127,7 @@ It guessed that the PDF has 3 columns, because there wasn't any data in the last

    >>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5)
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::

@@ -200,7 +200,7 @@ After getting the x-coordinates, we just need to pass them to Stream, like this.

    >>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700')
    >>> tables = extractor.get_tables()
-   >>> print tables['pg-1']
+   >>> print tables['page-1'][0]

 .. csv-table::


@@ -26,7 +26,7 @@ def test_lattice_basic():
     extractor = Lattice(Pdf(pdfname,
                         pagenos=[{'start': 2, 'end': 2}], clean=True))
     tables = extractor.get_tables()
-    assert_equal(tables['pg-2'][0], data)
+    assert_equal(tables['page-2'][0], data)


 def test_lattice_fill():

@@ -76,7 +76,7 @@ def test_lattice_fill():
     pdfname = os.path.join(testdir, 'row_span_1.pdf')
     extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_lattice_invert():

@@ -94,4 +94,4 @@ def test_lattice_invert():
     pdfname = os.path.join(testdir, 'lines_in_background_1.pdf')
     extractor = Lattice(Pdf(pdfname, clean=True), invert=True)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][1], data)
+    assert_equal(tables['page-1'][1], data)


@@ -13,57 +13,62 @@ testdir = os.path.dirname(os.path.abspath(__file__))

 def test_stream_basic():
     data = [
-        ["","","","",""],
-        ["C Appendix C: Summary Statistics","","","",""],
-        ["","Table C1: Summary Statistics","","",""],
-        ["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""],
-        ["Variable","Mean","Std. Dev. Min","","Max"],
-        ["Age","50.8","15.9","21","90"],
-        ["Men","0.47","0.50","0","1"],
-        ["East","0.28","0.45","0","1"],
-        ["Rural","0.15","0.36","0","1"],
-        ["Married","0.57","0.50","0","1"],
-        ["Single","0.21","0.40","0","1"],
-        ["Divorced","0.13","0.33","0","1"],
-        ["Widowed","0.08","0.26","0","1"],
-        ["Separated","0.03","0.16","0","1"],
-        ["Partner","0.65","0.48","0","1"],
-        ["Employed","0.55","0.50","0","1"],
-        ["Fulltime","0.34","0.47","0","1"],
-        ["Parttime","0.20","0.40","0","1"],
-        ["Unemployed","0.08","0.28","0","1"],
-        ["Homemaker","0.19","0.40","0","1"],
-        ["Retired","0.28","0.45","0","1"],
-        ["Household size","2.43","1.22","1","9"],
-        ["Households with children","0.37","0.48","0","1"],
-        ["Number of children","1.67","1.38","0","8"],
-        ["Lower secondary education","0.08","0.27","0","1"],
-        ["Upper secondary education","0.60","0.49","0","1"],
-        ["Post secondary, non tert. education","0.12","0.33","0","1"],
-        ["First stage tertiary education","0.17","0.38","0","1"],
-        ["Other education","0.03","0.17","0","1"],
-        ["Household income (Euro/month)","2,127","1,389","22","22,500"],
-        ["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"],
-        ["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"],
-        ["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""],
-        ["","","","","ECB"],
-        ["","","","","Working Paper Series No 1299"],
-        ["","","","","Febuary 2011"]
+        ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
+        ["Entidad","","Municipio","","Localidad",""],
+        ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
+        ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],
+        ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"],
+        ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"],
+        ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"],
+        ["01","Aguascalientes","001","Aguascalientes","0106","Arellano"],
+        ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"],
+        ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"],
+        ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"],
+        ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"],
+        ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"],
+        ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"],
+        ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"],
+        ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"],
+        ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"],
+        ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"],
+        ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"],
+        ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"],
+        ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"],
+        ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"],
+        ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"],
+        ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"],
+        ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"],
+        ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"],
+        ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"],
+        ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"],
+        ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"],
+        ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"],
+        ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"],
+        ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"],
+        ["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"],
+        ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"],
+        ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"],
+        ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"],
+        ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"],
+        ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"],
+        ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
+        ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
+        ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
+        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
+        ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
     ]
-    pdfname = os.path.join(testdir,
-        "tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf")
-    extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}],
+    pdfname = os.path.join(testdir, 'mexican_towns.pdf')
+    extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}],
                        clean=True))
     tables = extractor.get_tables()
-    assert_equal(tables['pg-3'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_stream_ncolumns():
     data = [
-        ["","","","",""],
-        ["","Bhandara - Key Indicators","","",""],
+        ["Bhandara - Key Indicators","","","",""],
         ["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""],
         ["Indicators","TOTAL","RURAL","TOTAL","RURAL"],
         ["Reported Prevalence of Morbidity","","","",""],
@@ -105,21 +110,20 @@ def test_stream_ncolumns():
         ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""],
         ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""],
         ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 15 Excluding age group 19 years","","","",""],
-        ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""],
-        ["","4","","",""]
+        ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""],
     ]
     pdfname = os.path.join(testdir, 'missing_values.pdf')
     extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True),
                        ncolumns=5)
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


 def test_stream_columns():
     data = [
-        ["","","","","",""],
-        ["Clave","","Clave","","Clave",""],
-        ["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"],
+        ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"],
         ["Entidad","","Municipio","","Localidad",""],
         ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"],
         ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"],

@@ -160,10 +164,11 @@ def test_stream_columns():
         ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"],
         ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"],
         ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"],
-        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"]
+        ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"],
+        ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"],
     ]
     pdfname = os.path.join(testdir, 'mexican_towns.pdf')
     extractor = Stream(Pdf(pdfname, clean=True),
                        columns='28,67,180,230,425,475,700')
     tables = extractor.get_tables()
-    assert_equal(tables['pg-1'][0], data)
+    assert_equal(tables['page-1'][0], data)


@@ -4,8 +4,12 @@ import os
 import sys
 import time
 import logging
+import warnings
+import numpy as np
 from docopt import docopt
+from collections import Counter
+import matplotlib.pyplot as plt
 from PyPDF2 import PdfFileReader

 from camelot.pdf import Pdf
@@ -22,12 +26,23 @@ usage:
 options:
     -h, --help                 Show this screen.
     -v, --version              Show version.
-    -V, --verbose              Verbose.
     -p, --pages <pageno>       Comma-separated list of page numbers.
                                Example: -p 1,3-6,10 [default: 1]
+    -P, --parallel             Parallelize the parsing process.
     -f, --format <format>      Output format. (csv,tsv,html,json,xlsx) [default: csv]
-    -l, --log                  Print log to file.
+    -l, --log                  Log to file.
+    -V, --verbose              Verbose.
     -o, --output <directory>   Output directory.
+    -M, --cmargin <cmargin>    Char margin. Chars closer than cmargin are
+                               grouped together to form a word. [default: 2.0]
+    -L, --lmargin <lmargin>    Line margin. Lines closer than lmargin are
+                               grouped together to form a textbox. [default: 0.5]
+    -W, --wmargin <wmargin>    Word margin. Insert blank spaces between chars
+                               if distance between words is greater than word
+                               margin. [default: 0.1]
+    -S, --save-info            Save parsing info for each page to a file.
+    -X, --plot <dist>          Plot distributions. (page,all,rc)
+    -Z, --summary              Summarize metrics.

 camelot methods:
     lattice  Looks for lines between data.
@@ -47,12 +62,12 @@ options:
                                cells. Example: -F h, -F v, -F hv
     -s, --scale <scale>        Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
-    -i, --invert               Invert pdf image to make sure that lines are
-                               in foreground.
     -j, --jtol <jtol>          Tolerance to account for when comparing joint
                                and line coordinates. [default: 2]
     -m, --mtol <mtol>          Tolerance to account for when merging lines
                                which are very close. [default: 2]
+    -i, --invert               Invert pdf image to make sure that lines are
+                               in foreground.
     -d, --debug <debug>        Debug by visualizing pdf geometry.
                                (contour,line,joint,table) Example: -d table
 """
@ -69,17 +84,159 @@ options:
Example: -c 10.1,20.2,30.3 Example: -c 10.1,20.2,30.3
-y, --ytol <ytol> Tolerance to account for when grouping rows -y, --ytol <ytol> Tolerance to account for when grouping rows
together. [default: 2] together. [default: 2]
-M, --cmargin <cmargin> Char margin. Chars closer than cmargin are -m, --mtol <mtol> Tolerance to account for when merging columns
grouped together to form a word. [default: 2.0] together. [default: 2]
-L, --lmargin <lmargin> Line margin. Lines closer than lmargin are
grouped together to form a textbox. [default: 0.5]
-W, --wmargin <wmargin> Word margin. Insert blank spaces between chars
if distance between words is greater than word
margin. [default: 0.1]
-d, --debug Debug by visualizing textboxes. -d, --debug Debug by visualizing textboxes.
""" """
def plot_table_barchart(r, c, p, pno, tno):
row_idx = [i + 1 for i, row in enumerate(r)]
col_idx = [i + 1 for i, col in enumerate(c)]
r_index = np.arange(len(r))
c_index = np.arange(len(c))
width = 0.7
plt.figure(figsize=(8, 6))
plt.subplot(2, 1, 1)
plt.title('Percentage of empty cells in table: {0:.2f}'.format(p))
plt.xlabel('row index')
plt.ylabel('number of non-empty cells in row')
plt.bar(r_index, r)
plt.xticks(r_index + width * 0.5, row_idx)
plt.ylim(0, len(c))
plt.subplot(2, 1, 2)
plt.xlabel('column index')
plt.ylabel('number of non-empty cells in column')
plt.bar(c_index, c)
plt.xticks(c_index + width * 0.5, col_idx)
plt.ylim(0, len(r))
plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300)
def plot_all_barchart(data, output):
    # Histogram of row 'fullness' (fraction of non-empty cells per row)
    # across all parsed tables, saved as <output>_all.png. Counter is
    # collections.Counter, presumably imported at the top of this script.
    r_empty_cells = []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            r_empty_cells.extend(
                [r / float(table['ncols']) for r in table['r_nempty_cells']])
    c = Counter(r_empty_cells)
    if 0.0 not in c:
        c.update({0.0: 0})
    if 1.0 not in c:
        c.update({1.0: 0})
    plt.figure(figsize=(8, 6))
    plt.xlabel('percentage of non-empty cells in a row')
    plt.ylabel('percentage of rows processed')
    row_p = [count / float(sum(c.values())) for count in c.values()]
    plt.bar(c.keys(), row_p, align='center', width=0.05)
    plt.ylim(0, 1.0)
    plt.savefig(''.join([output, '_all.png']), dpi=300)
def plot_rc_piechart(data, output):
    # Pie charts of how row and column counts are distributed across all
    # parsed tables, saved as <output>_rc.png.
    from matplotlib import cm
    tables = 0
    rows, cols = [], []
    for page_number in data.keys():
        page = data[page_number]
        for table_number in page.keys():
            table = page[table_number]
            tables += 1
            rows.append(table['nrows'])
            cols.append(table['ncols'])
    r = Counter(rows)
    c = Counter(cols)
    plt.figure(figsize=(8, 6))
    cs1 = cm.Set1(np.arange(len(r)) / float(len(r)))
    ax1 = plt.subplot(211, aspect='equal')
    ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90)
    ax1.set_title('row distribution across tables')
    cs2 = cm.Set1(np.arange(len(c)) / float(len(c)))
    ax2 = plt.subplot(212, aspect='equal')
    ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90)
    ax2.set_title('column distribution across tables')
    plt.savefig(''.join([output, '_rc.png']), dpi=300)
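All three plot helpers, and summary() below, consume the same per-table metrics dict built by the extractors. Its shape, inferred from the keys read in this file (field values are illustrative, not real output):

    data = {
        'page-1': {
            'table-1': {
                'data': [['h1', 'h2'], ['a', '']],  # parsed cell text
                'nrows': 2, 'ncols': 2,
                'r_nempty_cells': [2, 1],   # non-empty cells per row
                'c_nempty_cells': [2, 1],   # non-empty cells per column
                'empty_p': 25.0,            # percentage of empty cells
                'line_p': 100.0, 'text_p': 75.0,  # lattice-only metrics
                'score': 90.0,
            },
        },
    }
    plot_rc_piechart(data, 'out')  # -> out_rc.png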
def summary(data, p_time):
    # Print overall stats: pages and tables processed, time taken, average
    # parse score, plus a guess at multi-page tables (runs of consecutive
    # pages whose tables share a column count) and their probable headers.
    from operator import itemgetter
    from itertools import groupby
    scores = []
    continuous_tables = []
    total_tables = 0
    for page_number in data.keys():
        page = data[page_number]
        total_tables += len(page.keys())
        for table_number in page.keys():
            table = page[table_number]
            continuous_tables.append((page_number, table_number, table['ncols']))
            scores.append(table['score'])
    avg_score = np.mean(scores)
    ct_pages = []
    header_string = ""
    if len(continuous_tables) > 1:
        # sort by (page, table) number, then group tables by column count
        tables = sorted(continuous_tables,
                        key=lambda x: (int(x[0][5:]), int(x[1][6:])))
        for k, g in groupby(tables, key=itemgetter(2)):
            g = list(g)
            tables_same_ncols = set([int(t[0][5:]) for t in g])
            tables_same_ncols = sorted(list(tables_same_ncols))
            # (index - page number) is constant within a consecutive run,
            # so grouping on it splits the pages into runs
            for K, G in groupby(enumerate(tables_same_ncols),
                                key=lambda (i, x): i - x):
                G = list(G)
                ct_pages.append((str(G[0][1]), str(G[-1][1])))
        result_headers = []
        for ct in ct_pages:
            header_idx = {}
            possible_headers = []
            ncols = 0
            for page_number in range(int(ct[0]), int(ct[1]) + 1):
                page = data['page-{0}'.format(page_number)]
                for table_number in page.keys():
                    table = page[table_number]
                    ncols = table['ncols']
                    for i, row in enumerate(table['data']):
                        try:
                            header_idx[tuple(row)].append(i)
                        except KeyError:
                            header_idx[tuple(row)] = [i]
            # the ten most repeated rows, keeping only full-width ones
            possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]),
                                      reverse=True)[:10]
            possible_headers = filter(lambda z: len(z) == ncols,
                [filter(lambda x: x != '', p_h) for p_h in possible_headers])
            modes = []
            for p_h in possible_headers:
                try:
                    modes.append((p_h, max(set(header_idx[p_h]),
                                           key=header_idx[p_h].count)))
                except KeyError:
                    pass
            # the candidate that most often sits highest in its table
            header = min(modes, key=lambda x: x[1])[0]
            result_headers.append(header)
        header_string = "Multi-page table headers*:\n"
        header_string = ''.join([header_string, '\n'.join(
            ['pages {0} -> {1}{2}{3}'.format('-'.join([cr[0][0], cr[0][1]]),
             '"', '","'.join(cr[1]), '"') for cr in zip(ct_pages, result_headers)])])
    avg_time = "Time taken per page: {0:.2f} seconds\n".format(
        p_time / float(len(data))) if len(data) != 1 else ""
    equal_ncols = "\nMulti-page tables on*: {0}\n".format(
        ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) != 1 else ""
    stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols]
    stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n"
                   "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats))
    print(''.join([stat_string, header_string]))
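The run-detection trick above deserves a note: under enumerate(), (index - value) stays constant within a run of consecutive page numbers, so grouping on that difference splits the pages into runs. A small example in the script's Python 2 idiom:

    from itertools import groupby

    pages = [1, 2, 3, 7, 8]  # pages whose tables share a column count
    runs = [[x for __, x in G]
            for __, G in groupby(enumerate(pages), key=lambda (i, x): i - x)]
    # runs == [[1, 2, 3], [7, 8]], so ct_pages records ('1', '3') and ('7', '8')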
def convert_to_html(table):
    html = ''
    html = ''.join([html, '<table border="1">\n'])
@@ -99,23 +256,23 @@ def write_to_disk(data, f='csv', output=None, filename=None):
    if f in ['csv', 'tsv']:
        import csv
        delimiter = ',' if f == 'csv' else '\t'
        for page_number in sorted(data.keys()):
            for table_number in sorted(data[page_number].keys()):
                dsvname = '{0}.{1}'.format(
                    ''.join([page_number, '_', table_number]), f)
                with open(os.path.join(output, dsvname), 'w') as outfile:
                    writer = csv.writer(
                        outfile, delimiter=delimiter, quoting=csv.QUOTE_ALL)
                    for row in data[page_number][table_number]['data']:
                        writer.writerow(row)
    elif f == 'html':
        htmlname = '{0}.html'.format(froot)
        for page_number in sorted(data.keys()):
            for table_number in sorted(data[page_number].keys()):
                with open(os.path.join(output, htmlname), 'a') as htmlfile:
                    htmlfile.write(
                        convert_to_html(data[page_number][table_number]['data']))
    elif f == 'json':
        import json
        with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \
                as jsonfile:
            json.dump(data, jsonfile)
    elif f == 'xlsx':
@@ -123,12 +280,12 @@ def write_to_disk(data, f='csv', output=None, filename=None):
            from pyexcel_xlsx import save_data
            from collections import OrderedDict
            xlsx_data = OrderedDict()
            for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                for table_number in sorted(data[page_number].keys(),
                                           key=lambda x: int(x[6:])):
                    sheet_name = ''.join([page_number, '_', table_number])
                    xlsx_data.update({sheet_name:
                        [row for row in data[page_number][table_number]['data']]})
            save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data)
        except ImportError:
            print("link to install docs")
@@ -147,16 +304,17 @@ if __name__ == '__main__':
    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
    logname, __ = os.path.splitext(filename)
    logname = ''.join([logname, '.log'])
    scorename, __ = os.path.splitext(filename)
    scorename = ''.join([scorename, '_info.csv'])
    pngname, __ = os.path.splitext(filename)
    if args['--log']:
        FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
        if args['--output']:
            logname = os.path.join(args['--output'], os.path.basename(logname))
        logging.basicConfig(
            filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG)
    p = []
    if args['--pages'] == '1':
@@ -173,47 +331,142 @@ if __name__ == '__main__':
            else:
                p.append({'start': int(r), 'end': int(r)})
    margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
                    float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            manager = Pdf(Lattice(fill=args['--fill'],
                                  scale=int(args['--scale']),
                                  invert=args['--invert'],
                                  jtol=int(args['--jtol']),
                                  mtol=int(args['--mtol']),
                                  pdf_margin=margin_tuple,
                                  debug=args['--debug']),
                          filename,
                          pagenos=p,
                          parallel=args['--parallel'],
                          clean=True)
            data = manager.extract()
            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)
                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            if args['--summary']:
                summary(data, processing_time)
            if args['--save-info']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['line_p'],
                                table['text_p'],
                                table['score']))
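                    # For reference, the resulting _info.csv would read like
                    # (hypothetical values, echoing the sketch above):
                    #   table,nrows,ncols,empty_p,line_p,text_p,score
                    #   page-1_table-1,2,2,25.0,100.0,75.0,90.0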
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logging.exception(e.message, exc_info=True)
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
            manager = Pdf(Stream(ncolumns=int(args['--ncols']),
                                 columns=args['--columns'],
                                 ytol=int(args['--ytol']),
                                 mtol=int(args['--mtol']),
                                 pdf_margin=margin_tuple,
                                 debug=args['--debug']),
                          filename,
                          pagenos=p,
                          parallel=args['--parallel'],
                          clean=True)
            data = manager.extract()
            processing_time = time.time() - start_time
            vprint("Finished processing in", processing_time, "seconds")
            logging.info("Finished processing in " + str(processing_time) + " seconds")
            if args['--plot']:
                if args['--output']:
                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
                plot_type = args['--plot'].split(',')
                if 'page' in plot_type:
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            plot_table_barchart(table['r_nempty_cells'],
                                                table['c_nempty_cells'],
                                                table['empty_p'],
                                                page_number,
                                                table_number)
                if 'all' in plot_type:
                    plot_all_barchart(data, pngname)
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            if args['--summary']:
                summary(data, processing_time)
            if args['--save-info']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
                    # stream has no line/text metrics, so fewer columns here
                    score_file.write('table,nrows,ncols,empty_p,score\n')
                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
                        page = data[page_number]
                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
                            table = page[table_number]
                            score_file.write('{0},{1},{2},{3},{4}\n'.format(
                                ''.join([page_number, '_', table_number]),
                                table['nrows'],
                                table['ncols'],
                                table['empty_p'],
                                table['score']))
            if args['--debug']:
                manager.debug_plot()
        except Exception as e:
            logging.exception(e.message, exc_info=True)
            sys.exit()
    if args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']
        write_to_disk(data, f=args['--format'],
                      output=output, filename=filename)
vprint("finished in", time.time() - start_time, "seconds")
logging.info("Time taken: " + str(time.time() - start_time) + " seconds")