Decouple debug geometry from tables

2018-09-05 15:18:31 +05:30 · 2018-09-05 15:18:31 +05:30 · b9d77cb983
parent 941994f0bf
commit b9d77cb983
7 changed files with 463 additions and 306 deletions
--- a/README.md
+++ b/README.md
@ -8,7 +8,7 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
 >>> import camelot
 >>> tables = camelot.read_pdf("foo.pdf")
 >>> tables
-&lt;TableSet n=2&gt;
+&lt;TableList n=2&gt;
 >>> tables.to_csv(zip=True) # to_json, to_excel, to_html
 >>> tables[0]
 &lt;Table shape=(3,4)&gt;
@ -19,8 +19,8 @@ Camelot is a Python 2.7 library and command-line tool for getting tables out of
    "time_taken": 0.5,
    "page": 1
 }
 >>> df = tables[0].df
 >>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
 >>> df = tables[0].to_df()
 </pre>
 Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@ -1,3 +1,4 @@
 from .__version__ import __version__
 from .io import read_pdf
 from .plot import plot_geometry
--- a/camelot/core.py
+++ b/camelot/core.py
@ -21,7 +21,6 @@ class Cell(object):
        self.text = ''
        self.spanning_h = False
        self.spanning_v = False
        self.image = None
    def __repr__(self):
        pass
@ -49,8 +48,6 @@ class Table(object):
        self.rows = rows
        self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in cols] for r in rows]
        self.nocont_ = 0
        self.image = None
    def __repr__(self):
        pass
@ -227,9 +224,66 @@ class Table(object):
        return ar
-class TableSet(object):
+class TableList(list):
-    def __init__(self):
+    def __init__(self, tables):
-        pass
+        self._tables = tables
    def __repr__(self):
-        pass
+        return '<{} tables={}>'.format(
            self.__class__.__name__, len(self._tables))
 class Geometry(object):
    """Holder for the debug geometry gathered from one page.

    Four independent collections are kept -- text, images, segments
    and tables.  Each starts out as an empty list and is exposed as a
    read/write property that simply stores whatever is assigned.
    """

    def __init__(self):
        self._text, self._images = [], []
        self._segments, self._tables = [], []

    @property
    def text(self):
        """Currently stored text geometry."""
        return self._text

    @text.setter
    def text(self, value):
        self._text = value

    @property
    def images(self):
        """Currently stored image geometry."""
        return self._images

    @images.setter
    def images(self, value):
        self._images = value

    @property
    def segments(self):
        """Currently stored line-segment geometry."""
        return self._segments

    @segments.setter
    def segments(self, value):
        self._segments = value

    @property
    def tables(self):
        """Currently stored table geometry."""
        return self._tables

    @tables.setter
    def tables(self, value):
        self._tables = value
 class GeometryList(object):
    """Per-page debug geometry collected into parallel lists.

    Parameters
    ----------
    geometry : iterable
        Objects exposing ``text``, ``images``, ``segments`` and
        ``tables`` attributes (one object per page).

    Each property below returns a list with one entry per input
    object, in input order.  The properties are read-only; they exist
    so that callers can use ``geometry.text`` etc. instead of reaching
    into the underscore-prefixed attributes.
    """

    def __init__(self, geometry):
        # Materialise once so that a one-shot iterable can feed all
        # four collections.
        geometry = list(geometry)
        self._text = [g.text for g in geometry]
        self._images = [g.images for g in geometry]
        self._segments = [g.segments for g in geometry]
        self._tables = [g.tables for g in geometry]

    def __repr__(self):
        return '<{} text={} images={} segments={} tables={}>'.format(
            self.__class__.__name__,
            len(self._text),
            len(self._images),
            len(self._segments),
            len(self._tables))

    @property
    def text(self):
        """List of the ``text`` value of every collected object."""
        return self._text

    @property
    def images(self):
        """List of the ``images`` value of every collected object."""
        return self._images

    @property
    def segments(self):
        """List of the ``segments`` value of every collected object."""
        return self._segments

    @property
    def tables(self):
        """List of the ``tables`` value of every collected object."""
        return self._tables
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -74,10 +74,11 @@ class PDFHandler(object):
            self.__save_page(self.filename, p, self.temp)
        pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p))
                 for p in self.pages]
-        tables = {}
+        tables = []
        geometry = []
        parser = Stream(**kwargs) if not mesh else Lattice(**kwargs)
        for p in pages:
-            table = parser.get_tables(p)
+            t, g = parser.extract_tables(p)
-            if table is not None:
+            tables.extend(t)
-                tables.update(table)
+            geometry.extend(g)
-        return tables
+        return TableList(tables), GeometryList(geometry)
--- a/camelot/io.py
+++ b/camelot/io.py
@ -4,4 +4,5 @@ from .handlers import PDFHandler
 def read_pdf(filepath, pages='1', mesh=False, **kwargs):
    # explicit type conversion
    p = PDFHandler(filepath, pages)
-    return p.parse(mesh=mesh, **kwargs)
+    tables, __ = p.parse(mesh=mesh, **kwargs)
    return tables
--- a/camelot/parsers.py
+++ b/camelot/parsers.py
@ -10,7 +10,7 @@ import subprocess
 import numpy as np
-from .core import Table
+from .core import Table, Geometry
 from .image_processing import (adaptive_threshold, find_lines, find_table_contours,
                               find_table_joints)
 from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
@ -30,192 +30,6 @@ def _reduce_method(m):
 copy_reg.pickle(types.MethodType, _reduce_method)
 def _text_bbox(t_bbox):
    """Returns bounding box for the text present on a page.
    Parameters
    ----------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' with lists of
        LTTextLineHorizontals and LTTextLineVerticals respectively.
    Returns
    -------
    text_bbox : tuple
        Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
        space.
    """
    xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
    ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
    xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
    ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
    text_bbox = (xmin, ymin, xmax, ymax)
    return text_bbox
 def _group_rows(text, ytol=2):
    """Groups PDFMiner text objects into rows using their
    y-coordinates taking into account some tolerance ytol.
    Parameters
    ----------
    text : list
        List of PDFMiner text objects.
    ytol : int
        Tolerance parameter.
        (optional, default: 2)
    Returns
    -------
    rows : list
        Two-dimensional list of text objects grouped into rows.
    """
    row_y = 0
    rows = []
    temp = []
    for t in text:
        # is checking for upright necessary?
        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
        # type(obj) is LTChar]):
        if t.get_text().strip():
            if not np.isclose(row_y, t.y0, atol=ytol):
                rows.append(sorted(temp, key=lambda t: t.x0))
                temp = []
                row_y = t.y0
            temp.append(t)
    rows.append(sorted(temp, key=lambda t: t.x0))
    __ = rows.pop(0) # hacky
    return rows
 def _merge_columns(l, mtol=0):
    """Merges column boundaries if they overlap or lie within some
    tolerance mtol.
    Parameters
    ----------
    l : list
        List of column coordinate tuples.
    mtol : int
        TODO
        (optional, default: 0)
    Returns
    -------
    merged : list
        List of merged column coordinate tuples.
    """
    merged = []
    for higher in l:
        if not merged:
            merged.append(higher)
        else:
            lower = merged[-1]
            if mtol >= 0:
                if (higher[0] <= lower[1] or
                        np.isclose(higher[0], lower[1], atol=mtol)):
                    upper_bound = max(lower[1], higher[1])
                    lower_bound = min(lower[0], higher[0])
                    merged[-1] = (lower_bound, upper_bound)
                else:
                    merged.append(higher)
            elif mtol < 0:
                if higher[0] <= lower[1]:
                    if np.isclose(higher[0], lower[1], atol=abs(mtol)):
                        merged.append(higher)
                    else:
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                else:
                    merged.append(higher)
    return merged
 def _join_rows(rows_grouped, text_y_max, text_y_min):
    """Makes row coordinates continuous.
    Parameters
    ----------
    rows_grouped : list
        Two-dimensional list of text objects grouped into rows.
    text_y_max : int
    text_y_min : int
    Returns
    -------
    rows : list
        List of continuous row coordinate tuples.
    """
    row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
                if len(r) > 0 else 0 for r in rows_grouped]
    rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
    rows.insert(0, text_y_max)
    rows.append(text_y_min)
    rows = [(rows[i], rows[i + 1])
            for i in range(0, len(rows) - 1)]
    return rows
 def _join_columns(cols, text_x_min, text_x_max):
    """Makes column coordinates continuous.
    Parameters
    ----------
    cols : list
        List of column coordinate tuples.
    text_x_min : int
    text_y_max : int
    Returns
    -------
    cols : list
        Updated list of column coordinate tuples.
    """
    cols = sorted(cols)
    cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
    cols.insert(0, text_x_min)
    cols.append(text_x_max)
    cols = [(cols[i], cols[i + 1])
            for i in range(0, len(cols) - 1)]
    return cols
 def _add_columns(cols, text, ytol):
    """Adds columns to existing list by taking into account
    the text that lies outside the current column coordinates.
    Parameters
    ----------
    cols : list
        List of column coordinate tuples.
    text : list
        List of PDFMiner text objects.
    ytol : int
        Tolerance parameter.
    Returns
    -------
    cols : list
        Updated list of column coordinate tuples.
    """
    if text:
        text = _group_rows(text, ytol=ytol)
        elements = [len(r) for r in text]
        new_cols = [(t.x0, t.x1)
            for r in text if len(r) == max(elements) for t in r]
        cols.extend(_merge_columns(sorted(new_cols)))
    return cols
 class Stream:
    """Stream looks for spaces between text elements to form a table.
@ -283,7 +97,193 @@ class Stream:
        self.flag_size = flag_size
        self.debug = debug
-    def get_tables(self, pdfname):
+    @staticmethod
    def _text_bbox(t_bbox):
        """Returns bounding box for the text present on a page.
        Parameters
        ----------
        t_bbox : dict
            Dict with two keys 'horizontal' and 'vertical' with lists of
            LTTextLineHorizontals and LTTextLineVerticals respectively.
        Returns
        -------
        text_bbox : tuple
            Tuple of the form (x0, y0, x1, y1) in PDFMiner's coordinate
            space.
        """
        xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
        ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
        xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
        ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
        text_bbox = (xmin, ymin, xmax, ymax)
        return text_bbox
    @staticmethod
    def _group_rows(text, ytol=2):
        """Groups PDFMiner text objects into rows using their
        y-coordinates taking into account some tolerance ytol.
        Parameters
        ----------
        text : list
            List of PDFMiner text objects.
        ytol : int
            Tolerance parameter.
            (optional, default: 2)
        Returns
        -------
        rows : list
            Two-dimensional list of text objects grouped into rows.
        """
        row_y = 0
        rows = []
        temp = []
        for t in text:
            # is checking for upright necessary?
            # if t.get_text().strip() and all([obj.upright for obj in t._objs if
            # type(obj) is LTChar]):
            if t.get_text().strip():
                if not np.isclose(row_y, t.y0, atol=ytol):
                    rows.append(sorted(temp, key=lambda t: t.x0))
                    temp = []
                    row_y = t.y0
                temp.append(t)
        rows.append(sorted(temp, key=lambda t: t.x0))
        __ = rows.pop(0) # hacky
        return rows
    @staticmethod
    def _merge_columns(l, mtol=0):
        """Merges column boundaries if they overlap or lie within some
        tolerance mtol.
        Parameters
        ----------
        l : list
            List of column coordinate tuples.
        mtol : int
            TODO
            (optional, default: 0)
        Returns
        -------
        merged : list
            List of merged column coordinate tuples.
        """
        merged = []
        for higher in l:
            if not merged:
                merged.append(higher)
            else:
                lower = merged[-1]
                if mtol >= 0:
                    if (higher[0] <= lower[1] or
                            np.isclose(higher[0], lower[1], atol=mtol)):
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
                elif mtol < 0:
                    if higher[0] <= lower[1]:
                        if np.isclose(higher[0], lower[1], atol=abs(mtol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
                            lower_bound = min(lower[0], higher[0])
                            merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
        return merged
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous.
        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int
        Returns
        -------
        rows : list
            List of continuous row coordinate tuples.
        """
        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
                    if len(r) > 0 else 0 for r in rows_grouped]
        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
        rows.insert(0, text_y_max)
        rows.append(text_y_min)
        rows = [(rows[i], rows[i + 1])
                for i in range(0, len(rows) - 1)]
        return rows
    @staticmethod
    def _add_columns(cols, text, ytol):
        """Adds columns to existing list by taking into account
        the text that lies outside the current column coordinates.
        Parameters
        ----------
        cols : list
            List of column coordinate tuples.
        text : list
            List of PDFMiner text objects.
        ytol : int
            Tolerance parameter.
        Returns
        -------
        cols : list
            Updated list of column coordinate tuples.
        """
        if text:
            text = Stream._group_rows(text, ytol=ytol)
            elements = [len(r) for r in text]
            new_cols = [(t.x0, t.x1)
                for r in text if len(r) == max(elements) for t in r]
            cols.extend(Stream._merge_columns(sorted(new_cols)))
        return cols
    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.
        Parameters
        ----------
        cols : list
            List of column coordinate tuples.
        text_x_min : int
        text_y_max : int
        Returns
        -------
        cols : list
            Updated list of column coordinate tuples.
        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1])
                for i in range(0, len(cols) - 1)]
        return cols
    def extract_tables(self, pdfname):
        """Expects a single page pdf as input with rotation corrected.
        Parameters
@ -308,11 +308,13 @@ class Stream:
                os.path.basename(bname)))
            return {os.path.basename(bname): None}
        g = Geometry()
        if self.debug:
-            self.debug_text = []
+            text = []
-            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
+            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
-            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
+            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
-            return None
+            g.text = text
            return [None], [g]
        if self.table_area is not None:
            if self.columns is not None:
@ -354,9 +356,9 @@ class Stream:
            table_data['text_p'] = 100 * (1 - (len(char_bbox) / len(ltchar)))
            for direction in t_bbox:
                t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0))
-            text_x_min, text_y_min, text_x_max, text_y_max = _text_bbox(t_bbox)
+            text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(t_bbox)
-            rows_grouped = _group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
+            rows_grouped = self._group_rows(t_bbox['horizontal'], ytol=ytolerance[table_no])
-            rows = _join_rows(rows_grouped, text_y_max, text_y_min)
+            rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
            elements = [len(r) for r in rows_grouped]
            guess = False
@ -380,7 +382,7 @@ class Stream:
                        os.path.basename(bname)))
                cols = [(t.x0, t.x1)
                    for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=mtolerance[table_no])
+                cols = self._merge_columns(sorted(cols), mtol=mtolerance[table_no])
                inner_text = []
                for i in range(1, len(cols)):
                    left = cols[i - 1][1]
@ -392,8 +394,8 @@ class Stream:
                              for t in t_bbox[direction]
                              if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
                inner_text.extend(outer_text)
-                cols = _add_columns(cols, inner_text, ytolerance[table_no])
+                cols = self._add_columns(cols, inner_text, ytolerance[table_no])
-                cols = _join_columns(cols, text_x_min, text_x_max)
+                cols = self._join_columns(cols, text_x_min, text_x_max)
            table = Table(cols, rows)
            table = table.set_all_edges()
@ -433,87 +435,6 @@ class Stream:
        return page
 def _reduce_index(t, idx, shift_text):
    """Reduces index of a text object if it lies within a spanning
    cell.
    Parameters
    ----------
    table : object
        camelot.table.Table
    idx : list
        List of tuples of the form (r_idx, c_idx, text).
    shift_text : list
        {'l', 'r', 't', 'b'}
        Select one or more from above and pass them as a list to
        specify where the text in a spanning cell should flow.
    Returns
    -------
    indices : list
        List of tuples of the form (idx, text) where idx is the reduced
        index of row/column and text is the an lttextline substring.
    """
    indices = []
    for r_idx, c_idx, text in idx:
        for d in shift_text:
            if d == 'l':
                if t.cells[r_idx][c_idx].spanning_h:
                    while not t.cells[r_idx][c_idx].left:
                        c_idx -= 1
            if d == 'r':
                if t.cells[r_idx][c_idx].spanning_h:
                    while not t.cells[r_idx][c_idx].right:
                        c_idx += 1
            if d == 't':
                if t.cells[r_idx][c_idx].spanning_v:
                    while not t.cells[r_idx][c_idx].top:
                        r_idx -= 1
            if d == 'b':
                if t.cells[r_idx][c_idx].spanning_v:
                    while not t.cells[r_idx][c_idx].bottom:
                        r_idx += 1
        indices.append((r_idx, c_idx, text))
    return indices
 def _fill_spanning(t, fill=None):
    """Fills spanning cells.
    Parameters
    ----------
    t : object
        camelot.table.Table
    fill : list
        {'h', 'v'}
        Specify to fill spanning cells in horizontal or vertical
        direction.
        (optional, default: None)
    Returns
    -------
    t : object
        camelot.table.Table
    """
    for f in fill:
        if f == "h":
            for i in range(len(t.cells)):
                for j in range(len(t.cells[i])):
                    if t.cells[i][j].get_text().strip() == '':
                        if t.cells[i][j].spanning_h and not t.cells[i][j].left:
                            t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
        elif f == "v":
            for i in range(len(t.cells)):
                for j in range(len(t.cells[i])):
                    if t.cells[i][j].get_text().strip() == '':
                        if t.cells[i][j].spanning_v and not t.cells[i][j].top:
                            t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
    return t
 class Lattice:
    """Lattice looks for lines in the pdf to form a table.
@ -617,7 +538,88 @@ class Lattice:
        self.shift_text = shift_text
        self.debug = debug
-    def get_tables(self, pdfname):
+    @staticmethod
    def _reduce_index(t, idx, shift_text):
        """Reduces index of a text object if it lies within a spanning
        cell.
        Parameters
        ----------
        table : object
            camelot.table.Table
        idx : list
            List of tuples of the form (r_idx, c_idx, text).
        shift_text : list
            {'l', 'r', 't', 'b'}
            Select one or more from above and pass them as a list to
            specify where the text in a spanning cell should flow.
        Returns
        -------
        indices : list
            List of tuples of the form (idx, text) where idx is the reduced
            index of row/column and text is the an lttextline substring.
        """
        indices = []
        for r_idx, c_idx, text in idx:
            for d in shift_text:
                if d == 'l':
                    if t.cells[r_idx][c_idx].spanning_h:
                        while not t.cells[r_idx][c_idx].left:
                            c_idx -= 1
                if d == 'r':
                    if t.cells[r_idx][c_idx].spanning_h:
                        while not t.cells[r_idx][c_idx].right:
                            c_idx += 1
                if d == 't':
                    if t.cells[r_idx][c_idx].spanning_v:
                        while not t.cells[r_idx][c_idx].top:
                            r_idx -= 1
                if d == 'b':
                    if t.cells[r_idx][c_idx].spanning_v:
                        while not t.cells[r_idx][c_idx].bottom:
                            r_idx += 1
            indices.append((r_idx, c_idx, text))
        return indices
    def _fill_spanning(t, fill=None):
        """Fills spanning cells.
        Parameters
        ----------
        t : object
            camelot.table.Table
        fill : list
            {'h', 'v'}
            Specify to fill spanning cells in horizontal or vertical
            direction.
            (optional, default: None)
        Returns
        -------
        t : object
            camelot.table.Table
        """
        for f in fill:
            if f == "h":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].get_text().strip() == '':
                            if t.cells[i][j].spanning_h and not t.cells[i][j].left:
                                t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
            elif f == "v":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].get_text().strip() == '':
                            if t.cells[i][j].spanning_v and not t.cells[i][j].top:
                                t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
        return t
    def extract_tables(self, pdfname):
        """Expects a single page pdf as input with rotation corrected.
        Parameters
@ -696,15 +698,16 @@ class Lattice:
        else:
            jtolerance = copy.deepcopy(self.jtol)
        g = Geometry()
        if self.debug:
-            self.debug_images = (img, table_bbox)
+            g.images = [(img, table_bbox)]
        table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
            h_segments, factors_pdf)
        if self.debug:
-            self.debug_segments = (v_segments, h_segments)
+            g.segments = [(v_segments, h_segments)]
-            self.debug_tables = []
+            _tables = []
        page = {}
        tables = {}
@ -737,15 +740,13 @@ class Lattice:
            table = Table(cols, rows)
            # set table edges to True using ver+hor lines
            table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
            nouse = table.nocont_ / (len(v_s) + len(h_s))
            table_data['line_p'] = 100 * (1 - nouse)
            # set spanning cells to True
            table = table.set_spanning()
            # set table border edges to True
            table = table.set_border_edges()
            if self.debug:
-                self.debug_tables.append(table)
+                _tables.append(table)
            assignment_errors = []
            table_data['split_text'] = []
@ -757,7 +758,7 @@ class Lattice:
                        flag_size=self.flag_size)
                    if indices[:2] != (-1, -1):
                        assignment_errors.append(error)
-                        indices = _reduce_index(table, indices, shift_text=self.shift_text)
+                        indices = self._reduce_index(table, indices, shift_text=self.shift_text)
                        if len(indices) > 1:
                            table_data['split_text'].append(indices)
                        for r_idx, c_idx, text in indices:
@ -768,7 +769,7 @@ class Lattice:
            table_data['score'] = score
            if self.fill is not None:
-                table = _fill_spanning(table, fill=self.fill)
+                table = self._fill_spanning(table, fill=self.fill)
            ar = table.get_list()
            ar = encode_list(ar)
            table_data['data'] = ar
@ -782,6 +783,7 @@ class Lattice:
        page[os.path.basename(bname)] = tables
        if self.debug:
-            return None
+            g.tables = _tables
            return [None], [g]
        return page
--- a/camelot/plot.py
+++ b/camelot/plot.py
@ -0,0 +1,98 @@
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 from .handlers import PDFHandler
 def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
    """Plots the debug geometry found while parsing a PDF.

    The file is parsed through ``PDFHandler`` with ``debug`` set to
    ``geometry_type``; the geometry half of the parse result is then
    drawn with matplotlib, one figure per collected entry.

    Parameters
    ----------
    filepath : str
        Path to the PDF, passed to ``PDFHandler``.
    pages : str
        Page specification, passed to ``PDFHandler``.
        (optional, default: '1')
    mesh : bool
        Forwarded to ``PDFHandler.parse``.
        (optional, default: False)
    geometry_type : str
        {'text', 'contour', 'joint', 'line', 'table'}
        Which geometry to draw.
        (optional, default: 'text')
    **kwargs
        Forwarded to ``PDFHandler.parse``; any ``debug`` entry is
        overwritten with ``geometry_type``.

    Raises
    ------
    ValueError
        If an AttributeError occurs while drawing 'contour', 'joint',
        'line' or 'table' geometry.
    UserWarning
        If ``geometry_type`` is not one of the supported values.
    """
    # explicit type conversion
    p = PDFHandler(filepath, pages)
    kwargs.update({'debug': geometry_type})
    __, geometry = p.parse(mesh=mesh, **kwargs)

    if geometry_type == 'text':
        for text in geometry.text:
            fig = plt.figure()
            ax = fig.add_subplot(111, aspect='equal')
            xs, ys = [], []
            for t in text:
                # t is (x0, y0, x1, y1): x-limits come from indices
                # 0 and 2, y-limits from indices 1 and 3, matching the
                # rectangle drawn below.
                xs.extend([t[0], t[2]])
                ys.extend([t[1], t[3]])
                ax.add_patch(
                    patches.Rectangle(
                        (t[0], t[1]),
                        t[2] - t[0],
                        t[3] - t[1]
                    )
                )
            ax.set_xlim(min(xs) - 10, max(xs) + 10)
            ax.set_ylim(min(ys) - 10, max(ys) + 10)
            plt.show()
    elif geometry_type == 'contour':
        # cv2 is only needed for this branch and is not imported at
        # module level, so bring it into scope here.
        import cv2
        try:
            for img, table_bbox in geometry.images:
                for t in table_bbox.keys():
                    cv2.rectangle(img, (t[0], t[1]),
                                  (t[2], t[3]), (255, 0, 0), 3)
                plt.imshow(img)
                plt.show()
        except AttributeError:
            raise ValueError("This option can only be used with Lattice.")
    elif geometry_type == 'joint':
        try:
            for img, table_bbox in geometry.images:
                x_coord = []
                y_coord = []
                for k in table_bbox.keys():
                    for coord in table_bbox[k]:
                        x_coord.append(coord[0])
                        y_coord.append(coord[1])
                max_x, max_y = max(x_coord), max(y_coord)
                plt.plot(x_coord, y_coord, 'ro')
                # y-axis is given as (max, 0), i.e. inverted.
                plt.axis([0, max_x + 100, max_y + 100, 0])
                plt.imshow(img)
                plt.show()
        except AttributeError:
            raise ValueError("This option can only be used with Lattice.")
    elif geometry_type == 'line':
        try:
            for v_s, h_s in geometry.segments:
                for v in v_s:
                    plt.plot([v[0], v[2]], [v[1], v[3]])
                for h in h_s:
                    plt.plot([h[0], h[2]], [h[1], h[3]])
                plt.show()
        except AttributeError:
            raise ValueError("This option can only be used with Lattice.")
    elif geometry_type == 'table':
        try:
            for tables in geometry.tables:
                for table in tables:
                    for r in range(len(table.rows)):
                        for c in range(len(table.cols)):
                            cell = table.cells[r][c]
                            # Draw only the edges flagged on the cell.
                            if cell.left:
                                plt.plot([cell.lb[0], cell.lt[0]],
                                         [cell.lb[1], cell.lt[1]])
                            if cell.right:
                                plt.plot([cell.rb[0], cell.rt[0]],
                                         [cell.rb[1], cell.rt[1]])
                            if cell.top:
                                plt.plot([cell.lt[0], cell.rt[0]],
                                         [cell.lt[1], cell.rt[1]])
                            if cell.bottom:
                                plt.plot([cell.lb[0], cell.rb[0]],
                                         [cell.lb[1], cell.rb[1]])
                plt.show()
        except AttributeError:
            raise ValueError("This option can only be used with Lattice.")
    else:
        raise UserWarning("This method can only be called after"
            " debug has been specified.")