Decouple debug geometry from tables

2018-09-05 15:18:31 +05:30 · 2018-09-05 15:18:31 +05:30 · b9d77cb983
parent 941994f0bf
commit b9d77cb983
7 changed files with 463 additions and 306 deletions
--- a/README.md
+++ b/README.md
@ -8,7 +8,7 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
 >>> import camelot
 >>> tables = camelot.read_pdf("foo.pdf")
 >>> tables
-&lt;TableSet n=2&gt;
+&lt;TableList n=2&gt;
 >>> tables.to_csv(zip=True) # to_json, to_excel, to_html
 >>> tables[0]
 &lt;Table shape=(3,4)&gt;
@ -19,8 +19,8 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
    "time_taken": 0.5,
    "page": 1
 }
+>>> df = tables[0].df
 >>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
->>> df = tables[0].to_df()
 </pre>

 Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@ -1,3 +1,4 @@
 from .__version__ import __version__

-from .io import read_pdf
+from .io import read_pdf
+from .plot import plot_geometry
--- a/camelot/core.py
+++ b/camelot/core.py
@ -21,7 +21,6 @@ class Cell(object):
        self.text = ''
        self.spanning_h = False
        self.spanning_v = False
-        self.image = None

    def __repr__(self):
        pass
@ -49,8 +48,6 @@ class Table(object):
        self.rows = rows
        self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in cols] for r in rows]
-        self.nocont_ = 0
-        self.image = None

    def __repr__(self):
        pass
@ -127,7 +124,7 @@ class Table(object):
                        J += 1

        for h in horizontal:
-            #  find closest y coord
+            # find closest y coord
            # iterate over x coords and find closest points
            i = [i for i, t in enumerate(self.rows)
                 if np.isclose(h[1], t[0], atol=jtol)]
@ -227,9 +224,66 @@ class Table(object):
        return ar


-class TableSet(object):
-    def __init__(self):
-        pass
+class TableList(list):
+    """List of the tables extracted from a PDF.
+
+    Subclasses ``list``, so indexing, iteration and ``len()`` operate
+    directly on the stored tables.
+
+    Parameters
+    ----------
+    tables : iterable
+        Tables to store, in order.
+    """
+
+    def __init__(self, tables):
+        # Fill the underlying list. Only stashing the argument on an
+        # attribute would leave the list itself empty, so tables[0]
+        # would raise IndexError.
+        super(TableList, self).__init__(tables)

    def __repr__(self):
-        pass
+        # Rendered as '<TableList n=2>', the form shown in the README.
+        return '<{} n={}>'.format(
+            self.__class__.__name__, len(self))
+
+
+class Geometry(object):
+    """Container for debug geometry.
+
+    Exposes four readable and assignable attributes -- ``text``,
+    ``images``, ``segments`` and ``tables`` -- each of which starts
+    out as an empty list.
+    """
+
+    def __init__(self):
+        self._text, self._images = [], []
+        self._segments, self._tables = [], []
+
+    # Each public name forwards reads and writes to the private
+    # attribute carrying the same name with a leading underscore.
+    text = property(
+        lambda self: self._text,
+        lambda self, value: setattr(self, '_text', value))
+
+    images = property(
+        lambda self: self._images,
+        lambda self, value: setattr(self, '_images', value))
+
+    segments = property(
+        lambda self: self._segments,
+        lambda self, value: setattr(self, '_segments', value))
+
+    tables = property(
+        lambda self: self._tables,
+        lambda self, value: setattr(self, '_tables', value))
+
+
+class GeometryList(object):
+    """Debug geometry gathered from several objects, grouped by kind.
+
+    Every attribute is a list holding one entry per input object, in
+    input order; an entry is whatever that object stored under the
+    attribute of the same name.
+
+    Parameters
+    ----------
+    geometry : iterable
+        Objects exposing ``text``, ``images``, ``segments`` and
+        ``tables`` attributes.
+    """
+
+    def __init__(self, geometry):
+        # Materialise once: the input is walked four times below, which
+        # would silently yield empty lists for a one-shot iterator.
+        geometry = list(geometry)
+        self._text = [g.text for g in geometry]
+        self._images = [g.images for g in geometry]
+        self._segments = [g.segments for g in geometry]
+        self._tables = [g.tables for g in geometry]
+
+    # Read-only public accessors, so callers can use geometry.text and
+    # friends instead of reaching for the private attributes.
+    @property
+    def text(self):
+        """List with the ``text`` value of every input object."""
+        return self._text
+
+    @property
+    def images(self):
+        """List with the ``images`` value of every input object."""
+        return self._images
+
+    @property
+    def segments(self):
+        """List with the ``segments`` value of every input object."""
+        return self._segments
+
+    @property
+    def tables(self):
+        """List with the ``tables`` value of every input object."""
+        return self._tables
+
+    def __repr__(self):
+        return '<{} text={} images={} segments={} tables={}>'.format(
+            self.__class__.__name__,
+            len(self._text),
+            len(self._images),
+            len(self._segments),
+            len(self._tables))
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -74,10 +74,11 @@ class PDFHandler(object):
            self.__save_page(self.filename, p, self.temp)
        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                 for p in self.pages]
-        tables = {}
+        tables = []
+        geometry = []
        parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
        for p in pages:
-            table = parser.get_tables(p)
-            if table is not None:
-                tables.update(table)
-        return tables
+            t, g = parser.extract_tables(p)
+            tables.extend(t)
+            geometry.extend(g)
+        return TableList(tables), GeometryList(geometry)
--- a/camelot/io.py
+++ b/camelot/io.py
@ -4,4 +4,5 @@ from .handlers import PDFHandler
 def read_pdf(filepath, pages='1', mesh=False, **kwargs):
    # explicit type conversion
    p = PDFHandler(filepath, pages)
-    return p.parse(mesh=mesh, **kwargs)
+    tables, __ = p.parse(mesh=mesh, **kwargs)
+    return tables
--- a/camelot/parsers.py
+++ b/camelot/parsers.py
@ -10,7 +10,7 @@ import subprocess

 import numpy as np

-from .core import Table
+from .core import Table, Geometry
 from .image_processing import (adaptive_threshold, find_lines, find_table_contours,
                               find_table_joints)
 from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
@ -30,192 +30,6 @@ def _reduce_method(m):
 copy_reg.pickle(types.MethodType, _reduce_method)


-def _text_bbox(t_bbox):
-    """Returns bounding box for the text present on a page.
-
-    Parameters
-    ----------
-    t_bbox : dict
-        Dict with two keys 'horizontal' and 'vertical' with lists of
-        LTTextLineHorizontals and LTTextLineVerticals respectively.
-
-    Returns
-    -------
-    text_bbox : tuple
-        Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
-        space.
-    """
-    xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
-    ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
-    xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
-    ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
-    text_bbox = (xmin, ymin, xmax, ymax)
-    return text_bbox
-
-
-def _group_rows(text, ytol=2):
-    """Groups PDFMiner text objects into rows using their
-    y-coordinates taking into account some tolerance ytol.
-
-    Parameters
-    ----------
-    text : list
-        List of PDFMiner text objects.
-
-    ytol : int
-        Tolerance parameter.
-        (optional, default: 2)
-
-    Returns
-    -------
-    rows : list
-        Two-dimensional list of text objects grouped into rows.
-    """
-    row_y = 0
-    rows = []
-    temp = []
-    for t in text:
-        # is checking for upright necessary?
-        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-        # type(obj) is LTChar]):
-        if t.get_text().strip():
-            if not np.isclose(row_y, t.y0, atol=ytol):
-                rows.append(sorted(temp, key=lambda t: t.x0))
-                temp = []
-                row_y = t.y0
-            temp.append(t)
-    rows.append(sorted(temp, key=lambda t: t.x0))
-    __ = rows.pop(0) # hacky
-    return rows
-
-
-def _merge_columns(l, mtol=0):
-    """Merges column boundaries if they overlap or lie within some
-    tolerance mtol.
-
-    Parameters
-    ----------
-    l : list
-        List of column coordinate tuples.
-
-    mtol : int
-        TODO
-        (optional, default: 0)
-
-    Returns
-    -------
-    merged : list
-        List of merged column coordinate tuples.
-    """
-    merged = []
-    for higher in l:
-        if not merged:
-            merged.append(higher)
-        else:
-            lower = merged[-1]
-            if mtol >= 0:
-                if (higher[0] <= lower[1] or
-                        np.isclose(higher[0], lower[1], atol=mtol)):
-                    upper_bound = max(lower[1], higher[1])
-                    lower_bound = min(lower[0], higher[0])
-                    merged[-1] = (lower_bound, upper_bound)
-                else:
-                    merged.append(higher)
-            elif mtol < 0:
-                if higher[0] <= lower[1]:
-                    if np.isclose(higher[0], lower[1], atol=abs(mtol)):
-                        merged.append(higher)
-                    else:
-                        upper_bound = max(lower[1], higher[1])
-                        lower_bound = min(lower[0], higher[0])
-                        merged[-1] = (lower_bound, upper_bound)
-                else:
-                    merged.append(higher)
-    return merged
-
-
-def _join_rows(rows_grouped, text_y_max, text_y_min):
-    """Makes row coordinates continuous.
-
-    Parameters
-    ----------
-    rows_grouped : list
-        Two-dimensional list of text objects grouped into rows.
-
-    text_y_max : int
-
-    text_y_min : int
-
-    Returns
-    -------
-    rows : list
-        List of continuous row coordinate tuples.
-    """
-    row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
-                if len(r) > 0 else 0 for r in rows_grouped]
-    rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
-    rows.insert(0, text_y_max)
-    rows.append(text_y_min)
-    rows = [(rows[i], rows[i + 1])
-            for i in range(0, len(rows) - 1)]
-    return rows
-
-
-def _join_columns(cols, text_x_min, text_x_max):
-    """Makes column coordinates continuous.
-
-    Parameters
-    ----------
-    cols : list
-        List of column coordinate tuples.
-
-    text_x_min : int
-
-    text_y_max : int
-
-    Returns
-    -------
-    cols : list
-        Updated list of column coordinate tuples.
-    """
-    cols = sorted(cols)
-    cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
-    cols.insert(0, text_x_min)
-    cols.append(text_x_max)
-    cols = [(cols[i], cols[i + 1])
-            for i in range(0, len(cols) - 1)]
-    return cols
-
-
-def _add_columns(cols, text, ytol):
-    """Adds columns to existing list by taking into account
-    the text that lies outside the current column coordinates.
-
-    Parameters
-    ----------
-    cols : list
-        List of column coordinate tuples.
-
-    text : list
-        List of PDFMiner text objects.
-
-    ytol : int
-        Tolerance parameter.
-
-    Returns
-    -------
-    cols : list
-        Updated list of column coordinate tuples.
-    """
-    if text:
-        text = _group_rows(text, ytol=ytol)
-        elements = [len(r) for r in text]
-        new_cols = [(t.x0, t.x1)
-            for r in text if len(r) == max(elements) for t in r]
-        cols.extend(_merge_columns(sorted(new_cols)))
-    return cols
-
-
 class Stream:
    """Stream looks for spaces between text elements to form a table.

@ -283,7 +97,193 @@ class Stream:
        self.flag_size = flag_size
        self.debug = debug

-    def get_tables(self, pdfname):
+    @staticmethod
+    def _text_bbox(t_bbox):
+        """Computes the box enclosing every text line on a page.
+
+        Parameters
+        ----------
+        t_bbox : dict
+            Dict with two keys 'horizontal' and 'vertical' with lists of
+            LTTextLineHorizontals and LTTextLineVerticals respectively.
+
+        Returns
+        -------
+        text_bbox : tuple
+            Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
+            space.
+        """
+        # Flatten both directions once; the extremes are taken over
+        # every text line regardless of orientation.
+        lines = [line for direction in t_bbox for line in t_bbox[direction]]
+        left = min([line.x0 for line in lines])
+        bottom = min([line.y0 for line in lines])
+        right = max([line.x1 for line in lines])
+        top = max([line.y1 for line in lines])
+        return (left, bottom, right, top)
+
+    @staticmethod
+    def _group_rows(text, ytol=2):
+        """Groups PDFMiner text objects into rows using their
+        y-coordinates taking into account some tolerance ytol.
+
+        Objects whose text is blank after stripping are ignored. A new
+        row starts whenever an object's y0 is not within ytol of the y0
+        that opened the current row.
+
+        Parameters
+        ----------
+        text : list
+            List of PDFMiner text objects.
+
+        ytol : int
+            Tolerance parameter.
+            (optional, default: 2)
+
+        Returns
+        -------
+        rows : list
+            Two-dimensional list of text objects grouped into rows,
+            each row sorted by x0.
+        """
+        rows = []
+        current = []
+        row_y = None
+        # Open question kept from the original: should text lines that
+        # are not upright be skipped here as well?
+        for t in text:
+            if not t.get_text().strip():
+                continue
+            if current and not np.isclose(row_y, t.y0, atol=ytol):
+                rows.append(sorted(current, key=lambda obj: obj.x0))
+                current = []
+            if not current:
+                # Anchor the row on its first object. Seeding row_y with
+                # 0 and popping the first (normally empty) row afterwards
+                # lost a real row whenever the first y0 was close to 0.
+                row_y = t.y0
+            current.append(t)
+        if current:
+            rows.append(sorted(current, key=lambda obj: obj.x0))
+        return rows
+
+    @staticmethod
+    def _merge_columns(l, mtol=0):
+        """Fuses neighbouring column boundaries.
+
+        Each incoming column is compared with the last column kept so
+        far. With a non-negative mtol the two are fused when they
+        overlap or when the new start is within mtol of the previous
+        end. With a negative mtol only overlapping columns are fused,
+        and not when the new start is within abs(mtol) of the previous
+        end.
+
+        Parameters
+        ----------
+        l : list
+            List of column coordinate tuples.
+
+        mtol : int
+            Merge tolerance, see above.
+            (optional, default: 0)
+
+        Returns
+        -------
+        merged : list
+            List of merged column coordinate tuples.
+        """
+        merged = []
+        for higher in l:
+            if not merged:
+                merged.append(higher)
+                continue
+            lower = merged[-1]
+            overlaps = higher[0] <= lower[1]
+            if mtol >= 0:
+                fuse = overlaps or np.isclose(higher[0], lower[1], atol=mtol)
+            elif mtol < 0:
+                fuse = overlaps and not np.isclose(
+                    higher[0], lower[1], atol=abs(mtol))
+            else:
+                # mtol compares as neither >= 0 nor < 0: the column is
+                # neither fused nor kept.
+                continue
+            if fuse:
+                merged[-1] = (min(lower[0], higher[0]),
+                              max(lower[1], higher[1]))
+            else:
+                merged.append(higher)
+        return merged
+
+    @staticmethod
+    def _join_rows(rows_grouped, text_y_max, text_y_min):
+        """Turns grouped rows into contiguous (top, bottom) intervals.
+
+        The boundary between two consecutive rows is placed halfway
+        between their mean vertical centres; the first interval starts
+        at text_y_max and the last one ends at text_y_min.
+
+        Parameters
+        ----------
+        rows_grouped : list
+            Two-dimensional list of text objects grouped into rows.
+
+        text_y_max : int
+
+        text_y_min : int
+
+        Returns
+        -------
+        rows : list
+            List of continuous row coordinate tuples.
+        """
+        mids = []
+        for r in rows_grouped:
+            if len(r) > 0:
+                mids.append(sum([(t.y0 + t.y1) / 2 for t in r]) / len(r))
+            else:
+                # A row without text objects contributes a centre of 0.
+                mids.append(0)
+        edges = [text_y_max]
+        for prev, cur in zip(mids, mids[1:]):
+            edges.append((cur + prev) / 2)
+        edges.append(text_y_min)
+        return list(zip(edges, edges[1:]))
+
+    @staticmethod
+    def _add_columns(cols, text, ytol):
+        """Extends the column list using text that lies outside the
+        current column coordinates.
+
+        The text is grouped into rows; the x-extents of the objects in
+        the rows holding the most objects are merged and appended to
+        cols, which is modified in place.
+
+        Parameters
+        ----------
+        cols : list
+            List of column coordinate tuples.
+
+        text : list
+            List of PDFMiner text objects.
+
+        ytol : int
+            Tolerance parameter.
+
+        Returns
+        -------
+        cols : list
+            Updated list of column coordinate tuples.
+        """
+        if not text:
+            return cols
+        grouped = Stream._group_rows(text, ytol=ytol)
+        # Size of the fullest row; only rows of that size add columns.
+        widest = max([len(r) for r in grouped]) if grouped else 0
+        extents = []
+        for r in grouped:
+            if len(r) == widest:
+                for t in r:
+                    extents.append((t.x0, t.x1))
+        cols.extend(Stream._merge_columns(sorted(extents)))
+        return cols
+
+    @staticmethod
+    def _join_columns(cols, text_x_min, text_x_max):
+        """Turns column extents into contiguous (left, right) intervals.
+
+        After sorting, the boundary between two neighbouring columns is
+        placed halfway between the end of the left one and the start of
+        the right one; the first interval starts at text_x_min and the
+        last one ends at text_x_max.
+
+        Parameters
+        ----------
+        cols : list
+            List of column coordinate tuples.
+
+        text_x_min : int
+
+        text_x_max : int
+
+        Returns
+        -------
+        cols : list
+            Updated list of column coordinate tuples.
+        """
+        ordered = sorted(cols)
+        edges = [text_x_min]
+        for left, right in zip(ordered, ordered[1:]):
+            edges.append((right[0] + left[1]) / 2)
+        edges.append(text_x_max)
+        return list(zip(edges, edges[1:]))
+
+    def extract_tables(self, pdfname):
        """Expects a single page pdf as input with rotation corrected.

        Parameters
@ -308,11 +308,13 @@ class Stream:
                os.path.basename(bname)))
            return {os.path.basename(bname): None}

+        g = Geometry()
        if self.debug:
-            self.debug_text = []
-            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
-            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
-            return None
+            text = []
+            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
+            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
+            g.text = text
+            return [None], [g]

        if self.table_area is not None:
            if self.columns is not None:
@ -354,9 +356,9 @@ class Stream:
            table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
            for direction in t_bbox:
                t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
-            text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
-            rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
-            rows = _join_rows(rows_grouped, text_y_max, text_y_min)
+            text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(t_bbox)
+            rows_grouped = self._group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
+            rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
            elements = [len(r) for r in rows_grouped]

            guess = False
@ -380,7 +382,7 @@ class Stream:
                        os.path.basename(bname)))
                cols = [(t.x0, t.x1)
                    for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
+                cols = self._merge_columns(sorted(cols), mtol=mtolerance[table_no])
                inner_text = []
                for i in range(1, len(cols)):
                    left = cols[i - 1][1]
@ -392,8 +394,8 @@ class Stream:
                              for t in t_bbox[direction]
                              if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
                inner_text.extend(outer_text)
-                cols = _add_columns(cols, inner_text, ytolerance[table_no])
-                cols = _join_columns(cols, text_x_min, text_x_max)
+                cols = self._add_columns(cols, inner_text, ytolerance[table_no])
+                cols = self._join_columns(cols, text_x_min, text_x_max)

            table = Table(cols, rows)
            table = table.set_all_edges()
@ -433,87 +435,6 @@ class Stream:
        return page


-def _reduce_index(t, idx, shift_text):
-    """Reduces index of a text object if it lies within a spanning
-    cell.
-
-    Parameters
-    ----------
-    table : object
-        camelot.table.Table
-
-    idx : list
-        List of tuples of the form (r_idx, c_idx, text).
-
-    shift_text : list
-        {'l', 'r', 't', 'b'}
-        Select one or more from above and pass them as a list to
-        specify where the text in a spanning cell should flow.
-
-    Returns
-    -------
-    indices : list
-        List of tuples of the form (idx, text) where idx is the reduced
-        index of row/column and text is the an lttextline substring.
-    """
-    indices = []
-    for r_idx, c_idx, text in idx:
-        for d in shift_text:
-            if d == 'l':
-                if t.cells[r_idx][c_idx].spanning_h:
-                    while not t.cells[r_idx][c_idx].left:
-                        c_idx -= 1
-            if d == 'r':
-                if t.cells[r_idx][c_idx].spanning_h:
-                    while not t.cells[r_idx][c_idx].right:
-                        c_idx += 1
-            if d == 't':
-                if t.cells[r_idx][c_idx].spanning_v:
-                    while not t.cells[r_idx][c_idx].top:
-                        r_idx -= 1
-            if d == 'b':
-                if t.cells[r_idx][c_idx].spanning_v:
-                    while not t.cells[r_idx][c_idx].bottom:
-                        r_idx += 1
-        indices.append((r_idx, c_idx, text))
-    return indices
-
-
-def _fill_spanning(t, fill=None):
-    """Fills spanning cells.
-
-    Parameters
-    ----------
-    t : object
-        camelot.table.Table
-
-    fill : list
-        {'h', 'v'}
-        Specify to fill spanning cells in horizontal or vertical
-        direction.
-        (optional, default: None)
-
-    Returns
-    -------
-    t : object
-        camelot.table.Table
-    """
-    for f in fill:
-        if f == "h":
-            for i in range(len(t.cells)):
-                for j in range(len(t.cells[i])):
-                    if t.cells[i][j].get_text().strip() == '':
-                        if t.cells[i][j].spanning_h and not t.cells[i][j].left:
-                            t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
-        elif f == "v":
-            for i in range(len(t.cells)):
-                for j in range(len(t.cells[i])):
-                    if t.cells[i][j].get_text().strip() == '':
-                        if t.cells[i][j].spanning_v and not t.cells[i][j].top:
-                            t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
-    return t
-
-
 class Lattice:
    """Lattice looks for lines in the pdf to form a table.

@ -617,7 +538,88 @@ class Lattice:
        self.shift_text = shift_text
        self.debug = debug

-    def get_tables(self, pdfname):
+    @staticmethod
+    def _reduce_index(t, idx, shift_text):
+        """Moves text indices to the edge of the spanning cell they
+        fall in.
+
+        For every requested direction, an index sitting in a cell that
+        spans in that orientation is walked cell by cell until a cell
+        with the matching edge set is reached.
+
+        Parameters
+        ----------
+        t : object
+            camelot.table.Table
+
+        idx : list
+            List of tuples of the form (r_idx, c_idx, text).
+
+        shift_text : list
+            {'l', 'r', 't', 'b'}
+            Select one or more from above and pass them as a list to
+            specify where the text in a spanning cell should flow.
+
+        Returns
+        -------
+        indices : list
+            List of tuples of the form (r_idx, c_idx, text) where
+            r_idx and c_idx are the reduced row/column indices and text
+            is an lttextline substring.
+        """
+        indices = []
+        for row, col, text in idx:
+            for direction in shift_text:
+                if direction == 'l':
+                    if t.cells[row][col].spanning_h:
+                        while not t.cells[row][col].left:
+                            col -= 1
+                elif direction == 'r':
+                    if t.cells[row][col].spanning_h:
+                        while not t.cells[row][col].right:
+                            col += 1
+                elif direction == 't':
+                    if t.cells[row][col].spanning_v:
+                        while not t.cells[row][col].top:
+                            row -= 1
+                elif direction == 'b':
+                    if t.cells[row][col].spanning_v:
+                        while not t.cells[row][col].bottom:
+                            row += 1
+            indices.append((row, col, text))
+        return indices
+
+
+    # Declared static: the signature has no self, and it is invoked as
+    # self._fill_spanning(table, fill=...), which would otherwise bind
+    # the instance to t and fail with a duplicate 'fill' argument.
+    @staticmethod
+    def _fill_spanning(t, fill=None):
+        """Fills empty spanning cells with the text of their neighbour.
+
+        Parameters
+        ----------
+        t : object
+            camelot.table.Table
+
+        fill : list
+            {'h', 'v'}
+            Specify to fill spanning cells in horizontal or vertical
+            direction. For 'h' an empty horizontally spanning cell
+            without a left edge copies the text of the cell to its
+            left; for 'v' an empty vertically spanning cell without a
+            top edge copies the text of the cell above it. With None
+            the table is returned untouched.
+            (optional, default: None)
+
+        Returns
+        -------
+        t : object
+            camelot.table.Table
+        """
+        # 'fill or []' makes the documented default of None a no-op
+        # instead of a TypeError when iterating.
+        for f in fill or []:
+            if f == "h":
+                for row in t.cells:
+                    for j, cell in enumerate(row):
+                        if (cell.get_text().strip() == '' and
+                                cell.spanning_h and not cell.left):
+                            # The left neighbour was visited earlier in
+                            # this pass, so text cascades along the span.
+                            cell.add_text(row[j - 1].get_text())
+            elif f == "v":
+                for i, row in enumerate(t.cells):
+                    for j, cell in enumerate(row):
+                        if (cell.get_text().strip() == '' and
+                                cell.spanning_v and not cell.top):
+                            cell.add_text(t.cells[i - 1][j].get_text())
+        return t
+
+    def extract_tables(self, pdfname):
        """Expects a single page pdf as input with rotation corrected.

        Parameters
@ -696,15 +698,16 @@ class Lattice:
        else:
            jtolerance = copy.deepcopy(self.jtol)

+        g = Geometry()
        if self.debug:
-            self.debug_images = (img, table_bbox)
+            g.images = [(img, table_bbox)]

        table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
            h_segments, factors_pdf)

        if self.debug:
-            self.debug_segments = (v_segments, h_segments)
-            self.debug_tables = []
+            g.segments = [(v_segments, h_segments)]
+            _tables = []

        page = {}
        tables = {}
@ -737,15 +740,13 @@ class Lattice:
            table = Table(cols, rows)
            # set table edges to True using ver+hor lines
            table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
-            nouse = table.nocont_ / (len(v_s) + len(h_s))
-            table_data['line_p'] = 100 * (1 - nouse)
            # set spanning cells to True
            table = table.set_spanning()
            # set table border edges to True
            table = table.set_border_edges()

            if self.debug:
-                self.debug_tables.append(table)
+                _tables.append(table)

            assignment_errors = []
            table_data['split_text'] = []
@ -757,7 +758,7 @@ class Lattice:
                        flag_size=self.flag_size)
                    if indices[:2] != (-1, -1):
                        assignment_errors.append(error)
-                        indices = _reduce_index(table, indices, shift_text=self.shift_text)
+                        indices = self._reduce_index(table, indices, shift_text=self.shift_text)
                        if len(indices) > 1:
                            table_data['split_text'].append(indices)
                        for r_idx, c_idx, text in indices:
@ -768,7 +769,7 @@ class Lattice:
            table_data['score'] = score

            if self.fill is not None:
-                table = _fill_spanning(table, fill=self.fill)
+                table = self._fill_spanning(table, fill=self.fill)
            ar = table.get_list()
            ar = encode_list(ar)
            table_data['data'] = ar
@ -782,6 +783,7 @@ class Lattice:
        page[os.path.basename(bname)] = tables

        if self.debug:
-            return None
+            g.tables = _tables
+            return [None], [g]

        return page
--- a/camelot/plot.py
+++ b/camelot/plot.py
@ -0,0 +1,98 @@
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+
+from .handlers import PDFHandler
+
+
+def _geometry_pages(geometry, name):
+    """Returns the per-page entries stored on geometry under name.
+
+    Falls back to the underscore-prefixed attribute so that a container
+    which only keeps the private name can still be read.
+    """
+    try:
+        return getattr(geometry, name)
+    except AttributeError:
+        return getattr(geometry, '_' + name)
+
+
+def _plot_text(pages):
+    """Shows one figure per page with a rectangle for each text box."""
+    for text in pages:
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect='equal')
+        xs, ys = [], []
+        # Boxes are (x0, y0, x1, y1): x limits come from items 0 and 2,
+        # y limits from items 1 and 3.
+        for x0, y0, x1, y1 in text:
+            xs.extend([x0, x1])
+            ys.extend([y0, y1])
+            ax.add_patch(patches.Rectangle((x0, y0), x1 - x0, y1 - y0))
+        if xs:
+            # Nothing to fit the axes to when the page has no text.
+            ax.set_xlim(min(xs) - 10, max(xs) + 10)
+            ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        plt.show()
+
+
+def _plot_contour(pages):
+    """Shows every page image with its table boxes drawn on top."""
+    # Imported here because only this view draws with OpenCV and the
+    # module does not import it at the top.
+    import cv2
+
+    for images in pages:
+        for img, table_bbox in images:
+            for t in table_bbox.keys():
+                cv2.rectangle(img, (t[0], t[1]),
+                              (t[2], t[3]), (255, 0, 0), 3)
+            plt.imshow(img)
+            plt.show()
+
+
+def _plot_joint(pages):
+    """Shows every page image with the coordinates stored per table
+    box marked as red dots."""
+    for images in pages:
+        for img, table_bbox in images:
+            x_coord = []
+            y_coord = []
+            for k in table_bbox.keys():
+                for coord in table_bbox[k]:
+                    x_coord.append(coord[0])
+                    y_coord.append(coord[1])
+            if x_coord:
+                # max() would raise on a page without any coordinates.
+                max_x, max_y = max(x_coord), max(y_coord)
+                plt.plot(x_coord, y_coord, 'ro')
+                plt.axis([0, max_x + 100, max_y + 100, 0])
+            plt.imshow(img)
+            plt.show()
+
+
+def _plot_line(pages):
+    """Shows the vertical and horizontal segments of every page."""
+    for segments in pages:
+        for v_s, h_s in segments:
+            for v in v_s:
+                plt.plot([v[0], v[2]], [v[1], v[3]])
+            for h in h_s:
+                plt.plot([h[0], h[2]], [h[1], h[3]])
+            plt.show()
+
+
+def _plot_table(pages):
+    """Shows, once per page, the cell edges that are set on each table."""
+    for tables in pages:
+        for table in tables:
+            for row in table.cells:
+                for cell in row:
+                    edges = ((cell.left, cell.lb, cell.lt),
+                             (cell.right, cell.rb, cell.rt),
+                             (cell.top, cell.lt, cell.rt),
+                             (cell.bottom, cell.lb, cell.rb))
+                    for present, start, end in edges:
+                        if present:
+                            plt.plot([start[0], end[0]],
+                                     [start[1], end[1]])
+        plt.show()
+
+
+def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
+    """Plots the debug geometry found while parsing a PDF.
+
+    Parameters
+    ----------
+    filepath : str
+        Passed to PDFHandler together with pages.
+
+    pages : str
+        Pages to process.
+        (optional, default: '1')
+
+    mesh : bool
+        Forwarded to PDFHandler.parse, where False selects Stream and
+        True selects Lattice.
+        (optional, default: False)
+
+    geometry_type : str
+        {'text', 'contour', 'joint', 'line', 'table'}
+        Geometry to plot; it is also handed to the parser as its debug
+        keyword argument. Everything except 'text' needs mesh=True.
+        (optional, default: 'text')
+
+    kwargs : dict
+        Further keyword arguments for the parser.
+
+    Raises
+    ------
+    ValueError
+        If a Lattice-only geometry type is requested with mesh=False.
+
+    UserWarning
+        If geometry_type is not one of the supported values.
+    """
+    # explicit type conversion
+    p = PDFHandler(filepath, pages)
+    kwargs.update({'debug': geometry_type})
+    __, geometry = p.parse(mesh=mesh, **kwargs)
+
+    # geometry type -> (attribute holding the data, drawing routine)
+    plotters = {
+        'text': ('text', _plot_text),
+        'contour': ('images', _plot_contour),
+        'joint': ('images', _plot_joint),
+        'line': ('segments', _plot_line),
+        'table': ('tables', _plot_table),
+    }
+    if geometry_type not in plotters:
+        raise UserWarning("This method can only be called after"
+            " debug has been specified.")
+    attr, draw = plotters[geometry_type]
+    if attr != 'text' and not mesh:
+        # Checked explicitly: without mesh the parser only collects
+        # text, and empty data would otherwise plot nothing silently.
+        raise ValueError("This option can only be used with Lattice.")
+    draw(_geometry_pages(geometry, attr))