Merge 42f8321c8c into 4b08165328
camelot/cli.py
@@ -18,7 +18,7 @@ logger = logging.getLogger("camelot")
 logger.setLevel(logging.INFO)


-class Config(object):
+class Config():
     def __init__(self):
         self.config = {}

@@ -31,7 +31,8 @@ pass_config = click.make_pass_decorator(Config)

 @click.group(name="camelot")
 @click.version_option(version=__version__)
-@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
+@click.option("-q", "--quiet", is_flag=False,
+              help="Suppress logs and warnings.")
 @click.option(
     "-p",
     "--pages",
@@ -57,7 +58,7 @@ pass_config = click.make_pass_decorator(Config)
     "-flag",
     "--flag_size",
     is_flag=True,
-    help="Flag text based on" " font size. Useful to detect super/subscripts.",
+    help="Flag text based on font size. Useful to detect super/subscripts.",
 )
 @click.option(
     "-strip",
@@ -98,7 +99,8 @@ def cli(ctx, *args, **kwargs):
     " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
 )
 @click.option(
-    "-back", "--process_background", is_flag=True, help="Process background lines."
+    "-back", "--process_background", is_flag=True,
+    help="Process background lines."
 )
 @click.option(
     "-scale",
@@ -127,7 +129,8 @@ def cli(ctx, *args, **kwargs):
     "-l",
     "--line_tol",
     default=2,
-    help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
+    help="Tolerance parameter used to merge close vertical"
+    " and horizontal lines.",
 )
 @click.option(
     "-j",
@@ -197,12 +200,15 @@ def lattice(c, *args, **kwargs):
            raise ImportError("matplotlib is required for plotting.")
     else:
         if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
         if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")

     tables = read_pdf(
-        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet,
+        **kwargs
     )
     click.echo(f"Found {tables.n} tables")
     if plot_type is not None:
@@ -247,7 +253,8 @@ def lattice(c, *args, **kwargs):
     "-r",
     "--row_tol",
     default=2,
-    help="Tolerance parameter" " used to combine text vertically, to generate rows.",
+    help="Tolerance parameter"
+    " used to combine text vertically, to generate rows.",
 )
 @click.option(
     "-c",
@@ -288,9 +295,11 @@ def stream(c, *args, **kwargs):
            raise ImportError("matplotlib is required for plotting.")
     else:
         if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
         if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")

     tables = read_pdf(
         filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
@@ -302,3 +311,98 @@ def stream(c, *args, **kwargs):
            plt.show()
     else:
         tables.export(output, f=f, compress=compress)
+
+
+@cli.command("network")
+@click.option(
+    "-R",
+    "--table_regions",
+    default=[],
+    multiple=True,
+    help="Page regions to analyze. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-T",
+    "--table_areas",
+    default=[],
+    multiple=True,
+    help="Table areas to process. Example: x1,y1,x2,y2"
+    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
+)
+@click.option(
+    "-C",
+    "--columns",
+    default=[],
+    multiple=True,
+    help="X coordinates of column separators.",
+)
+@click.option(
+    "-e",
+    "--edge_tol",
+    default=50,
+    help="Tolerance parameter" " for extending textedges vertically.",
+)
+@click.option(
+    "-r",
+    "--row_tol",
+    default=2,
+    help="Tolerance parameter"
+    " used to combine text vertically, to generate rows.",
+)
+@click.option(
+    "-c",
+    "--column_tol",
+    default=0,
+    help="Tolerance parameter"
+    " used to combine text horizontally, to generate columns.",
+)
+@click.option(
+    "-plot",
+    "--plot_type",
+    type=click.Choice(["text", "grid", "contour", "textedge"]),
+    help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def network(c, *args, **kwargs):
+    """Use spaces between text to parse the table."""
+    conf = c.config
+    pages = conf.pop("pages")
+    output = conf.pop("output")
+    f = conf.pop("format")
+    compress = conf.pop("zip")
+    quiet = conf.pop("quiet")
+    plot_type = kwargs.pop("plot_type")
+    filepath = kwargs.pop("filepath")
+    kwargs.update(conf)
+
+    table_regions = list(kwargs["table_regions"])
+    kwargs["table_regions"] = None if not table_regions else table_regions
+    table_areas = list(kwargs["table_areas"])
+    kwargs["table_areas"] = None if not table_areas else table_areas
+    columns = list(kwargs["columns"])
+    kwargs["columns"] = None if not columns else columns
+
+    if plot_type is not None:
+        if not _HAS_MPL:
+            raise ImportError("matplotlib is required for plotting.")
+    else:
+        if output is None:
+            raise click.UsageError(
+                "Please specify output file path using --output")
+        if f is None:
+            raise click.UsageError(
+                "Please specify output file format using --format")
+
+    tables = read_pdf(
+        filepath, pages=pages, flavor="network",
+        suppress_stdout=quiet, **kwargs
+    )
+    click.echo(f"Found {tables.n} tables")
+    if plot_type is not None:
+        for table in tables:
+            plot(table, kind=plot_type)
+            plt.show()
+    else:
+        tables.export(output, f=f, compress=compress)
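
The new subcommand routes through the same read_pdf pipeline as lattice and stream; a minimal sketch of the equivalent library call (the PDF path is hypothetical):

    import camelot

    # flavor="network" is dispatched exactly like the CLI command above.
    tables = camelot.read_pdf("foo.pdf", pages="1", flavor="network")
    print(f"Found {tables.n} tables")
    tables.export("foo.csv", f="csv", compress=False)
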
camelot/core.py
@@ -4,12 +4,20 @@ import os
 import sqlite3
 import zipfile
 import tempfile
-from itertools import chain
 from operator import itemgetter

 import numpy as np
 import pandas as pd

+from cv2 import cv2
+
+from .utils import (
+    get_index_closest_point,
+    get_textline_coords,
+    build_file_path_in_temp_dir,
+    export_pdf_as_png
+)
+

 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
@@ -18,14 +26,70 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
 TABLE_AREA_PADDING = 10


-class TextEdge(object):
-    """Defines a text edge coordinates relative to a left-bottom
-    origin. (PDF coordinate space)
+HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
+VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
+ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
+
+
+class TextAlignment():
+    """Represents a list of textlines sharing an alignment on a coordinate.
+
+    The alignment can be left/right/middle or top/bottom/center.
+
+    (PDF coordinate space)

     Parameters
     ----------
-    x : float
-        x-coordinate of the text edge.
+    coord : float
+        coordinate of the initial text edge. Depending on the alignment
+        it could be a vertical or horizontal coordinate.
+    textline : obj
+        the original textline to start the alignment
+    align : str
+        Name of the alignment (e.g. "left", "top", etc)
+
+    Attributes
+    ----------
+    coord : float
+        The coordinate aligned averaged out across textlines. It can be along
+        the x or y axis.
+    textlines : array
+        Array of textlines that demonstrate this alignment.
+    align : str
+        Name of the alignment (e.g. "left", "top", etc)
+
+    """
+
+    def __init__(self, coord, textline, align):
+        self.coord = coord
+        self.textlines = [textline]
+        self.align = align
+
+    def __repr__(self):
+        text_inside = " | ".join(
+            map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
+        return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
+            f"textlines text='{text_inside}...'>"
+
+    def register_aligned_textline(self, textline, coord):
+        """Update new textline to this alignment, adapting its average."""
+        # Increase the intersections for this segment, expand it up,
+        # and adjust the x based on the new value
+        self.coord = (self.coord * len(self.textlines) + coord) / \
+            float(len(self.textlines) + 1)
+        self.textlines.append(textline)
+
+
+class TextEdge(TextAlignment):
+    """Defines a text edge coordinates relative to a left-bottom
+    origin. (PDF coordinate space).
+
+    An edge is an alignment bounded over a segment.
+
+    Parameters
+    ----------
+    coord : float
+        coordinate of the text edge. Can be x or y.
     y0 : float
         y-coordinate of bottommost point.
     y1 : float
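
The running average kept by register_aligned_textline can be checked in isolation; a minimal sketch with stand-in textlines (the real arguments are pdfminer textline objects):

    from types import SimpleNamespace

    tl = SimpleNamespace(get_text=lambda: "row 1")
    ta = TextAlignment(coord=100.0, textline=tl, align="left")
    ta.register_aligned_textline(SimpleNamespace(get_text=lambda: "row 2"), 102.0)
    # coord is now the mean over both textlines: (100.0 + 102.0) / 2
    assert ta.coord == 101.0
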
@@ -35,101 +99,120 @@ class TextEdge(object):

     Attributes
     ----------
-    intersections: int
-        Number of intersections with horizontal text rows.
     is_valid: bool
-        A text edge is valid if it intersections with at least
+        A text edge is valid if it intersects with at least
         TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.

     """

-    def __init__(self, x, y0, y1, align="left"):
-        self.x = x
-        self.y0 = y0
-        self.y1 = y1
-        self.align = align
-        self.intersections = 0
+    def __init__(self, coord, textline, align):
+        super().__init__(coord, textline, align)
+        self.y0 = textline.y0
+        self.y1 = textline.y1
         self.is_valid = False

     def __repr__(self):
-        x = round(self.x, 2)
+        x = round(self.coord, 2)
         y0 = round(self.y0, 2)
         y1 = round(self.y1, 2)
-        return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} valid={self.is_valid}>"
+        return f"<TextEdge x={x} y0={y0} y1={y1} align={self.align} " \
+            f"valid={self.is_valid}>"

-    def update_coords(self, x, y0, edge_tol=50):
+    def update_coords(self, x, textline, edge_tol=50):
         """Updates the text edge's x and bottom y coordinates and sets
         the is_valid attribute.
         """
-        if np.isclose(self.y0, y0, atol=edge_tol):
-            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
-            self.y0 = y0
-            self.intersections += 1
+        if np.isclose(self.y0, textline.y0, atol=edge_tol):
+            self.register_aligned_textline(textline, x)
+            self.y0 = textline.y0
             # a textedge is valid only if it extends uninterrupted
             # over a required number of textlines
-            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+            if len(self.textlines) > TEXTEDGE_REQUIRED_ELEMENTS:
                 self.is_valid = True


-class TextEdges(object):
+class TextAlignments():
+    """Defines a dict of text edges across reference alignments.
+    """
+
+    def __init__(self, alignment_names):
+        # For each possible alignment, list of tuples coordinate/textlines
+        self._text_alignments = {}
+        for alignment_name in alignment_names:
+            self._text_alignments[alignment_name] = []
+
+    @staticmethod
+    def _create_new_text_alignment(coord, textline, align):
+        return TextAlignment(coord, textline, align)
+
+    def _update_alignment(self, alignment, coord, textline):
+        return NotImplemented
+
+    def _register_textline(self, textline):
+        """Updates an existing text edge in the current dict.
+        """
+        coords = get_textline_coords(textline)
+        for alignment_id, alignment_array in self._text_alignments.items():
+            coord = coords[alignment_id]
+
+            # Find the index of the closest existing element (or 0 if none)
+            idx_closest = get_index_closest_point(
+                coord, alignment_array, fn=lambda x: x.coord
+            )
+
+            # Check if the edges before/after are close enough
+            # that it can be considered aligned
+            idx_insert = None
+            if idx_closest is None:
+                idx_insert = 0
+            else:
+                coord_closest = alignment_array[idx_closest].coord
+                # Note: np.isclose is slow!
+                if coord - 0.5 < coord_closest < coord + 0.5:
+                    self._update_alignment(
+                        alignment_array[idx_closest],
+                        coord,
+                        textline
+                    )
+                elif coord_closest < coord:
+                    idx_insert = idx_closest + 1
+                else:
+                    idx_insert = idx_closest
+            if idx_insert is not None:
+                new_alignment = self._create_new_text_alignment(
+                    coord, textline, alignment_id
+                )
+                alignment_array.insert(idx_insert, new_alignment)
+
+
+class TextEdges(TextAlignments):
     """Defines a dict of left, right and middle text edges found on
     the PDF page. The dict has three keys based on the alignments,
     and each key's value is a list of camelot.core.TextEdge objects.
     """

     def __init__(self, edge_tol=50):
+        super().__init__(HORIZONTAL_ALIGNMENTS)
         self.edge_tol = edge_tol
-        self._textedges = {"left": [], "right": [], "middle": []}

     @staticmethod
-    def get_x_coord(textline, align):
-        """Returns the x coordinate of a text row based on the
-        specified alignment.
-        """
-        x_left = textline.x0
-        x_right = textline.x1
-        x_middle = x_left + (x_right - x_left) / 2.0
-        x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
-        return x_coord[align]
-
-    def find(self, x_coord, align):
-        """Returns the index of an existing text edge using
-        the specified x coordinate and alignment.
-        """
-        for i, te in enumerate(self._textedges[align]):
-            if np.isclose(te.x, x_coord, atol=0.5):
-                return i
-        return None
-
-    def add(self, textline, align):
-        """Adds a new text edge to the current dict.
-        """
-        x = self.get_x_coord(textline, align)
-        y0 = textline.y0
-        y1 = textline.y1
-        te = TextEdge(x, y0, y1, align=align)
-        self._textedges[align].append(te)
-
-    def update(self, textline):
-        """Updates an existing text edge in the current dict.
-        """
-        for align in ["left", "right", "middle"]:
-            x_coord = self.get_x_coord(textline, align)
-            idx = self.find(x_coord, align)
-            if idx is None:
-                self.add(textline, align)
-            else:
-                self._textedges[align][idx].update_coords(
-                    x_coord, textline.y0, edge_tol=self.edge_tol
-                )
+    def _create_new_text_alignment(coord, textline, align):
+        # In TextEdges, each alignment is a TextEdge
+        return TextEdge(coord, textline, align)
+
+    def add(self, coord, textline, align):
+        """Adds a new text edge to the current dict."""
+        te = self._create_new_text_alignment(coord, textline, align)
+        self._text_alignments[align].append(te)
+
+    def _update_alignment(self, alignment, coord, textline):
+        alignment.update_coords(coord, textline, self.edge_tol)

     def generate(self, textlines):
-        """Generates the text edges dict based on horizontal text
-        rows.
-        """
+        """Generates the text edges dict based on horizontal text rows."""
         for tl in textlines:
             if len(tl.get_text().strip()) > 1:  # TODO: hacky
-                self.update(tl)
+                self._register_textline(tl)

     def get_relevant(self):
         """Returns the list of relevant text edges (all share the same
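
_register_textline relies on each alignment_array staying sorted by coord so the closest-edge lookup can bisect instead of scanning (replacing the old np.isclose loop in find()). get_index_closest_point lives in ..utils and is not part of this diff; a sketch of the semantics the loop assumes:

    from bisect import bisect_left

    def get_index_closest_point(point, array, fn=lambda x: x):
        # Hypothetical stand-in: index of the element whose fn() value is
        # nearest to point, or None for an empty array.
        if not array:
            return None
        keys = [fn(el) for el in array]
        i = bisect_left(keys, point)
        if i == 0:
            return 0
        if i == len(keys):
            return len(keys) - 1
        # pick the nearer of the two neighbours
        return i if keys[i] - point < point - keys[i - 1] else i - 1
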
@@ -138,13 +221,16 @@ class TextEdges(object):
         """
         intersections_sum = {
             "left": sum(
-                te.intersections for te in self._textedges["left"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["left"]
+                if te.is_valid
             ),
             "right": sum(
-                te.intersections for te in self._textedges["right"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["right"]
+                if te.is_valid
             ),
             "middle": sum(
-                te.intersections for te in self._textedges["middle"] if te.is_valid
+                len(te.textlines) for te in self._text_alignments["middle"]
+                if te.is_valid
             ),
         }
@@ -152,7 +238,10 @@ class TextEdges(object):
         # get vertical textedges that intersect maximum number of
         # times with horizontal textlines
         relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
-        return self._textedges[relevant_align]
+        return list(filter(
+            lambda te: te.is_valid,
+            self._text_alignments[relevant_align])
+        )

     def get_table_areas(self, textlines, relevant_textedges):
         """Returns a dict of interesting table areas on the PDF page
@@ -168,31 +257,30 @@ class TextEdges(object):
             return (x0, y0, x1, y1)

         # sort relevant textedges in reading order
-        relevant_textedges.sort(key=lambda te: (-te.y0, te.x))
+        relevant_textedges.sort(key=lambda te: (-te.y0, te.coord))

         table_areas = {}
         for te in relevant_textedges:
-            if te.is_valid:
-                if not table_areas:
-                    table_areas[(te.x, te.y0, te.x, te.y1)] = None
+            if not table_areas:
+                table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
+            else:
+                found = None
+                for area in table_areas:
+                    # check for overlap
+                    if te.y1 >= area[1] and te.y0 <= area[3]:
+                        found = area
+                        break
+                if found is None:
+                    table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
                 else:
-                    found = None
-                    for area in table_areas:
-                        # check for overlap
-                        if te.y1 >= area[1] and te.y0 <= area[3]:
-                            found = area
-                            break
-                    if found is None:
-                        table_areas[(te.x, te.y0, te.x, te.y1)] = None
-                    else:
-                        table_areas.pop(found)
-                        updated_area = (
-                            found[0],
-                            min(te.y0, found[1]),
-                            max(found[2], te.x),
-                            max(found[3], te.y1),
-                        )
-                        table_areas[updated_area] = None
+                    table_areas.pop(found)
+                    updated_area = (
+                        found[0],
+                        min(te.y0, found[1]),
+                        max(found[2], te.coord),
+                        max(found[3], te.y1),
+                    )
+                    table_areas[updated_area] = None

         # extend table areas based on textlines that overlap
         # vertically. it's possible that these textlines were
@@ -218,7 +306,8 @@ class TextEdges(object):
                     max(found[3], tl.y1),
                 )
                 table_areas[updated_area] = None
-        average_textline_height = sum_textline_height / float(len(textlines))
+        average_textline_height = sum_textline_height / \
+            float(len(textlines))

         # add some padding to table areas
         table_areas_padded = {}
@@ -228,7 +317,7 @@ class TextEdges(object):
         return table_areas_padded


-class Cell(object):
+class Cell():
     """Defines a cell in a table with coordinates relative to a
     left-bottom origin. (PDF coordinate space)
@@ -304,14 +393,13 @@ class Cell(object):

     @property
     def bound(self):
-        """The number of sides on which the cell is bounded.
-        """
+        """The number of sides on which the cell is bounded."""
         return self.top + self.bottom + self.left + self.right


-class Table(object):
-    """Defines a table with coordinates relative to a left-bottom
-    origin. (PDF coordinate space)
+class Table():
+    """Defines a table with coordinates relative to a left-bottom origin.
+    (PDF coordinate space)

     Parameters
     ----------
@@ -331,6 +419,8 @@ class Table(object):
         Accuracy with which text was assigned to the cell.
     whitespace : float
         Percentage of whitespace in the table.
+    filename : str
+        Path of the original PDF
     order : int
         Table number on PDF page.
     page : int
@@ -341,13 +431,27 @@ class Table(object):
     def __init__(self, cols, rows):
         self.cols = cols
         self.rows = rows
-        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows]
+        self.cells = [
+            [Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows
+        ]
         self.df = None
         self.shape = (0, 0)
         self.accuracy = 0
         self.whitespace = 0
+        self.filename = None
         self.order = None
         self.page = None
+        self.flavor = None  # Flavor of the parser used
+        self.pdf_size = None  # Dimensions of the original PDF page
+        self._bbox = None  # Bounding box in original document
+        self.parse = None  # Parse information
+        self.parse_details = None  # Field holding extra debug data
+
+        self._image = None
+        self._image_path = None  # Temporary file to hold an image of the pdf
+
+        self._text = []  # List of text box coordinates
+        self.textlines = []  # List of actual textlines on the page

     def __repr__(self):
         return f"<{self.__class__.__name__} shape={self.shape}>"
@@ -356,8 +460,7 @@ class Table(object):
         if self.page == other.page:
             if self.order < other.order:
                 return True
-        if self.page < other.page:
-            return True
+        return self.page < other.page

     @property
     def data(self):
@@ -382,6 +485,19 @@ class Table(object):
         }
         return report

+    def get_pdf_image(self):
+        """Compute pdf image and cache it
+        """
+        if self._image is None:
+            if self._image_path is None:
+                self._image_path = build_file_path_in_temp_dir(
+                    os.path.basename(self.filename),
+                    ".png"
+                )
+                export_pdf_as_png(self.filename, self._image_path)
+            self._image = cv2.imread(self._image_path)
+        return self._image
+
     def set_all_edges(self):
         """Sets all table edges to True.
         """
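
get_pdf_image renders the page once and memoizes both the temp path and the decoded array, so repeated calls are cheap:

    img_a = table.get_pdf_image()  # renders a PNG, reads it with cv2.imread
    img_b = table.get_pdf_image()  # served from self._image, no re-render
    assert img_a is img_b
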
@@ -548,7 +664,7 @@ class Table(object):
             bottom = cell.bottom
             if cell.bound == 4:
                 continue
-            elif cell.bound == 3:
+            if cell.bound == 3:
                 if not left and (right and top and bottom):
                     cell.hspan = True
                 elif not right and (left and top and bottom):
@@ -578,7 +694,8 @@ class Table(object):
             Output filepath.

         """
-        kw = {"encoding": "utf-8", "index": False, "header": False, "quoting": 1}
+        kw = {"encoding": "utf-8", "index": False, "header": False,
+              "quoting": 1}
         kw.update(kwargs)
         self.df.to_csv(path, **kw)
@@ -615,6 +732,7 @@ class Table(object):
             "encoding": "utf-8",
         }
         kw.update(kwargs)
+        # pylint: disable=abstract-class-instantiated
         writer = pd.ExcelWriter(path)
         self.df.to_excel(writer, **kw)
         writer.save()
@@ -653,8 +771,41 @@ class Table(object):
         conn.commit()
         conn.close()

+    def copy_spanning_text(self, copy_text=None):
+        """Copies over text in empty spanning cells.
+
+        Parameters
+        ----------
+        copy_text : list, optional (default: None)
+            {'h', 'v'}
+            Select one or more strings from above and pass them as a list
+            to specify the direction in which text should be copied over
+            when a cell spans multiple rows or columns.
+
+        Returns
+        -------
+        t : camelot.core.Table
+
+        """
+        for f in copy_text:
+            if f == "h":
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                                cell.hspan and \
+                                not cell.left:
+                            cell.text = self.cells[i][j - 1].text
+            elif f == "v":
+                for i, row in enumerate(self.cells):
+                    for j, cell in enumerate(row):
+                        if cell.text.strip() == "" and \
+                                cell.vspan and \
+                                not cell.top:
+                            cell.text = self.cells[i - 1][j].text
+        return self
+

-class TableList(object):
+class TableList():
     """Defines a list of camelot.core.Table objects. Each table can
     be accessed using its index.
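
Typical use mirrors the existing copy_text kwarg on the lattice flavor; a sketch on an already-parsed table whose hspan/vspan flags have been set by the parser:

    # Fill empty cells inside horizontal spans first, then vertical ones.
    table = table.copy_spanning_text(copy_text=["h", "v"])
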
@@ -734,10 +885,15 @@ class TableList(object):
                self._compress_dir(**kwargs)
         elif f == "excel":
             filepath = os.path.join(dirname, basename)
+            # pylint: disable=abstract-class-instantiated
             writer = pd.ExcelWriter(filepath)
             for table in self._tables:
                 sheet_name = f"page-{table.page}-table-{table.order}"
-                table.df.to_excel(writer, sheet_name=sheet_name, encoding="utf-8")
+                table.df.to_excel(
+                    writer,
+                    sheet_name=sheet_name,
+                    encoding="utf-8"
+                )
             writer.save()
             if compress:
                 zipname = os.path.join(os.path.dirname(path), root) + ".zip"
camelot/handlers.py

@@ -2,13 +2,14 @@

 import os
 import sys
+import logging

 from PyPDF2 import PdfFileReader, PdfFileWriter

 from .core import TableList
-from .parsers import Stream, Lattice
+from .parsers import Stream, Lattice, Network, Hybrid
 from .utils import (
-    TemporaryDirectory,
+    build_file_path_in_temp_dir,
     get_page_layout,
     get_text_objects,
     get_rotation,
@@ -16,8 +17,17 @@ from .utils import (
     download_url,
 )

+logger = logging.getLogger("camelot")

-class PDFHandler(object):
+PARSERS = {
+    "lattice": Lattice,
+    "stream": Stream,
+    "network": Network,
+    "hybrid": Hybrid,
+}
+
+
+class PDFHandler():
     """Handles all operations like temp directory creation, splitting
     file into single page PDFs, parsing each PDF and then removing the
     temp directory.
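
The module-level dict turns the old Lattice-or-Stream conditional into a plain lookup, which is how parse() below instantiates the parser (each concrete parser class is assumed to supply its own parser_id to BaseParser):

    parser_cls = PARSERS["network"]   # KeyError on an unknown flavor
    parser = parser_cls(debug=False)  # read_pdf kwargs are forwarded here
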
@@ -31,10 +41,13 @@ class PDFHandler(object):
         Example: '1,3,4' or '1,4-end' or 'all'.
     password : str, optional (default: None)
         Password for decryption.
+    debug : bool, optional (default: False)
+        Whether the parser should store debug information during parsing.

     """

-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, debug=False):
+        self.debug = debug
         if is_url(filepath):
             filepath = download_url(filepath)
         self.filepath = filepath
@@ -89,38 +102,54 @@ class PDFHandler(object):
            P.extend(range(p["start"], p["end"] + 1))
         return sorted(set(P))

-    def _save_page(self, filepath, page, temp):
-        """Saves specified page from PDF into a temporary directory.
+    def _read_pdf_page(self, page=1, layout_kwargs=None):
+        """Saves specified page from PDF into a temporary directory. Removes
+        password protection and normalizes rotation.

         Parameters
         ----------
-        filepath : str
-            Filepath or URL of the PDF file.
         page : int
             Page number.
-        temp : str
-            Tmp directory.
+        layout_kwargs : dict, optional (default: {})
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
+
+        Returns
+        -------
+        layout : object
+
+        dimensions : tuple
+            The dimensions of the pdf page
+
+        filepath : str
+            The path of the single page PDF - either the original, or a
+            normalized version.

         """
-        with open(filepath, "rb") as fileobj:
+        layout_kwargs = layout_kwargs or {}
+        with open(self.filepath, "rb") as fileobj:
+            # Normalize the pdf file, but skip if it's not encrypted or has
+            # only one page.
             infile = PdfFileReader(fileobj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
-            fpath = os.path.join(temp, f"page-{page}.pdf")
+            fpath = build_file_path_in_temp_dir(f"page-{page}.pdf")
             froot, fext = os.path.splitext(fpath)
             p = infile.getPage(page - 1)
             outfile = PdfFileWriter()
             outfile.addPage(p)
             with open(fpath, "wb") as f:
                 outfile.write(f)
-            layout, dim = get_page_layout(fpath)
+            layout, dimensions = get_page_layout(
+                fpath, **layout_kwargs)
             # fix rotated PDF
             chars = get_text_objects(layout, ltype="char")
             horizontal_text = get_text_objects(layout, ltype="horizontal_text")
             vertical_text = get_text_objects(layout, ltype="vertical_text")
             rotation = get_rotation(chars, horizontal_text, vertical_text)
             if rotation != "":
-                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+                fpath_new = "".join(
+                    [froot.replace("page", "p"), "_rotated", fext])
                 os.rename(fpath, fpath_new)
                 infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                 if infile.isEncrypted:
@@ -134,9 +163,13 @@ class PDFHandler(object):
                 outfile.addPage(p)
                 with open(fpath, "wb") as f:
                     outfile.write(f)
+                layout, dimensions = get_page_layout(
+                    fpath, **layout_kwargs)
+        return layout, dimensions, fpath

     def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+        self, flavor="lattice", suppress_stdout=False,
+        layout_kwargs=None, **kwargs
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -144,12 +177,13 @@ class PDFHandler(object):
         Parameters
         ----------
         flavor : str (default: 'lattice')
-            The parsing method to use ('lattice' or 'stream').
+            The parsing method to use ('lattice', 'stream', 'network',
+            or 'hybrid').
             Lattice is used by default.
         suppress_stdout : str (default: False)
             Suppress logs and warnings.
         layout_kwargs : dict, optional (default: {})
-            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
         kwargs : dict
             See camelot.read_pdf kwargs.
@@ -159,17 +193,24 @@ class PDFHandler(object):
             List of tables found in PDF.

         """
+        layout_kwargs = layout_kwargs or {}
         tables = []
-        with TemporaryDirectory() as tempdir:
-            for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
-            pages = [
-                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
-            ]
-            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
-                t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
-                )
-                tables.extend(t)
+
+        parser_obj = PARSERS[flavor]
+        parser = parser_obj(debug=self.debug, **kwargs)
+
+        # Read the layouts/dimensions of each of the pages we need to
+        # parse. This might require creating a temporary .pdf.
+        for page_idx in self.pages:
+            layout, dimensions, source_file = self._read_pdf_page(
+                page_idx,
+                layout_kwargs=layout_kwargs
+            )
+            parser.prepare_page_parse(source_file, layout, dimensions,
+                                      page_idx, layout_kwargs)
+            if not suppress_stdout:
+                rootname = os.path.basename(parser.rootname)
+                logger.info(f"Processing {rootname}")
+            t = parser.extract_tables()
+            tables.extend(t)
         return TableList(sorted(tables))
camelot/image.py

@@ -6,7 +6,6 @@ import numpy as np

 def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.
-
     Parameters
     ----------
     imagename : string
@@ -16,21 +15,17 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     blocksize : int, optional (default: 15)
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
-
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
-
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
-
     Returns
     -------
     img : object
         numpy.ndarray representing the original image.
     threshold : object
         numpy.ndarray representing the thresholded image.
-
     """
     img = cv2.imread(imagename)
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
@@ -56,7 +51,6 @@ def find_lines(
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
-
     Parameters
     ----------
     threshold : object
@@ -70,14 +64,11 @@ def find_lines(
     line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
-
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
-
        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
-
     Returns
     -------
     dmask : object
@@ -87,7 +78,6 @@ def find_lines(
        List of tuples representing vertical/horizontal lines with
        coordinates relative to a left-top origin in
        image coordinate space.
-
     """
     lines = []
@@ -135,21 +125,18 @@ def find_lines(

 def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
-
     Parameters
     ----------
     vertical : object
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
-
     Returns
     -------
     cont : list
         List of tuples representing table boundaries. Each tuple is of
         the form (x, y, w, h) where (x, y) -> left-top, w -> width and
         h -> height in image coordinate space.
-
     """
     mask = vertical + horizontal
@@ -175,7 +162,6 @@ def find_contours(vertical, horizontal):

 def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
-
     Parameters
     ----------
     contours : list
@@ -186,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
        numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
-
     Returns
     -------
     tables : dict
@@ -194,7 +179,6 @@ def find_joints(contours, vertical, horizontal):
        in that boundary as their value.
        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
        and (x2, y2) -> rt in image coordinate space.
-
     """
     joints = np.multiply(vertical, horizontal)
     tables = {}
camelot/io.py

@@ -7,14 +7,14 @@ from .utils import validate_input, remove_extra


 def read_pdf(
     filepath,
     pages="1",
     password=None,
     flavor="lattice",
     suppress_stdout=False,
-    layout_kwargs={},
-    **kwargs
-):
+    layout_kwargs=None,
+    debug=False,
+    **kwargs):
     """Read PDF and return extracted tables.

     Note: kwargs annotated with ^ can only be used with flavor='stream'
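
Swapping layout_kwargs={} for None avoids Python's shared-mutable-default pitfall: the {} literal is evaluated once at definition time and reused across calls. A self-contained illustration:

    def counter(opts={}):              # one dict object shared by every call
        opts["n"] = opts.get("n", 0) + 1
        return opts["n"]

    assert counter() == 1
    assert counter() == 2              # state leaks between calls

    def counter_fixed(opts=None):
        opts = opts or {}              # fresh dict per call, as read_pdf now does
        opts["n"] = opts.get("n", 0) + 1
        return opts["n"]

    assert counter_fixed() == 1
    assert counter_fixed() == 1
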
@@ -80,16 +80,16 @@ def read_pdf(
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     threshold_constant* : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     iterations* : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa
     resolution* : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.
@@ -98,9 +98,11 @@ def read_pdf(
     tables : camelot.core.TableList

     """
-    if flavor not in ["lattice", "stream"]:
+    layout_kwargs = layout_kwargs or {}
+    if flavor not in ["lattice", "stream", "network", "hybrid"]:
         raise NotImplementedError(
-            "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+            "Unknown flavor specified."
+            " Use either 'lattice', 'stream', or 'network'"
         )

     with warnings.catch_warnings():
@@ -108,7 +110,7 @@ def read_pdf(
         warnings.simplefilter("ignore")

         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
camelot/parsers/__init__.py

@@ -2,3 +2,5 @@

 from .stream import Stream
 from .lattice import Lattice
+from .network import Network
+from .hybrid import Hybrid
camelot/parsers/base.py

@@ -1,20 +1,484 @@
 # -*- coding: utf-8 -*-

 import os
+import warnings

-from ..utils import get_page_layout, get_text_objects
+import numpy as np
+import pandas as pd
+
+from ..utils import (
+    bbox_from_str,
+    compute_accuracy,
+    compute_whitespace,
+    get_text_objects,
+    get_table_index,
+    text_in_bbox,
+)
+from ..core import Table


-class BaseParser(object):
+class BaseParser():
     """Defines a base parser.
     """

+    def __init__(
+            self,
+            parser_id,
+            table_regions=None,
+            table_areas=None,
+            copy_text=None,
+            split_text=False,
+            strip_text="",
+            shift_text=None,
+            flag_size=False,
+            debug=False):
+        self.id = parser_id
+        self.table_regions = table_regions
+        self.table_areas = table_areas
+        self.table_bbox_parses = {}
+        self.columns = None
+        self.copy_text = copy_text
+        self.split_text = split_text
+        self.strip_text = strip_text
+        self.shift_text = shift_text
+
+        self.flag_size = flag_size
+
+        self.rootname = None
+        self.t_bbox = None
+
+        # For plotting details of parsing algorithms
+        self.resolution = 300  # default plotting resolution of the PDF.
+        self.parse_details = {}
+        if not debug:
+            self.parse_details = None
+
+    def table_bboxes(self):
+        return sorted(
+            self.table_bbox_parses.keys(),
+            key=lambda x: x[1],
+            reverse=True
+        )
+
-    def _generate_layout(self, filename, layout_kwargs):
+    def prepare_page_parse(self, filename, layout, dimensions,
+                           page_idx, layout_kwargs):
         self.filename = filename
         self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
+        self.layout = layout
+        self.dimensions = dimensions
+        self.page = page_idx
         self.images = get_text_objects(self.layout, ltype="image")
-        self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
-        self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
+        self.horizontal_text = get_text_objects(
+            self.layout,
+            ltype="horizontal_text"
+        )
+        self.vertical_text = get_text_objects(
+            self.layout,
+            ltype="vertical_text"
+        )
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
+
+        if self.parse_details is not None:
+            self.parse_details["table_regions"] = self.table_regions
+            self.parse_details["table_areas"] = self.table_areas
+
+    def _apply_regions_filter(self, textlines):
+        """If regions have been specified, filter textlines to these regions.
+
+        Parameters
+        ----------
+        textlines : list
+            list of textlines to be filtered
+
+        Returns
+        -------
+        filtered_textlines : list of textlines within the regions specified
+
+        """
+        filtered_textlines = []
+        if self.table_regions is None:
+            filtered_textlines.extend(textlines)
+        else:
+            for region_str in self.table_regions:
+                region_text = text_in_bbox(
+                    bbox_from_str(region_str),
+                    textlines
+                )
+                filtered_textlines.extend(region_text)
+        return filtered_textlines
+
+    def _document_has_no_text(self):
+        """Detects image only documents and warns.
+
+        Returns
+        -------
+        has_no_text : bool
+            Whether the document doesn't have any text at all.
+        """
+        if not self.horizontal_text:
+            rootname = os.path.basename(self.rootname)
+            if self.images:
+                warnings.warn(
+                    "{rootname} is image-based, "
+                    "camelot only works on text-based pages."
+                    .format(rootname=rootname)
+                )
+            else:
+                warnings.warn(
+                    "No tables found on {rootname}".format(rootname=rootname)
+                )
+            return True
+        return False
+
+    def _initialize_new_table(self, table_idx, bbox, cols, rows):
+        """Initialize new table object, ready to be populated
+
+        Parameters
+        ----------
+        table_idx : int
+            Index of this table within the pdf page analyzed
+        bbox : set
+            bounding box of this table within the pdf page analyzed
+        cols : list
+            list of coordinate boundaries tuples (left, right)
+        rows : list
+            list of coordinate boundaries tuples (bottom, top)
+
+        Returns
+        -------
+        table : camelot.core.Table
+
+        """
+        table = Table(cols, rows)
+        table.page = self.page
+        table.order = table_idx + 1
+        table._bbox = bbox
+        return table
+
+    @staticmethod
+    def _reduce_index(t, idx, shift_text):
+        """Reduces index of a text object if it lies within a spanning
+        cell. Only useful for some parsers (e.g. Lattice), base method is a
+        noop.
+        """
+        return idx
+
+    def compute_parse_errors(self, table):
+        pos_errors = []
+        # TODO: have a single list in place of two directional ones?
+        # sorted on x-coordinate based on reading order i.e. LTR or RTL
+        for direction in ["vertical", "horizontal"]:
+            for t in self.t_bbox[direction]:
+                indices, error = get_table_index(
+                    table,
+                    t,
+                    direction,
+                    split_text=self.split_text,
+                    flag_size=self.flag_size,
+                    strip_text=self.strip_text,
+                )
+                if indices[:2] != (-1, -1):
+                    pos_errors.append(error)
+                    indices = type(self)._reduce_index(
+                        table,
+                        indices,
+                        shift_text=self.shift_text
+                    )
+                    for r_idx, c_idx, text in indices:
+                        table.cells[r_idx][c_idx].text = text
+        return pos_errors
+
+    def _generate_columns_and_rows(self, bbox, user_cols):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def _generate_table_bbox(self):
+        # Pure virtual, must be defined by the derived parser
+        raise NotImplementedError()
+
+    def extract_tables(self):
+        if self._document_has_no_text():
+            return []
+
+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
+        self._generate_table_bbox()
+
+        _tables = []
+        # sort tables based on y-coord
+        for table_idx, bbox in enumerate(self.table_bboxes()):
|
||||||
|
if self.columns is not None and self.columns[table_idx] != "":
|
||||||
|
# user has to input boundary columns too
|
||||||
|
# take (0, pdf_width) by default
|
||||||
|
# similar to else condition
|
||||||
|
# len can't be 1
|
||||||
|
user_cols = self.columns[table_idx].split(",")
|
||||||
|
user_cols = [float(c) for c in user_cols]
|
||||||
|
else:
|
||||||
|
user_cols = None
|
||||||
|
|
||||||
|
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
||||||
|
bbox,
|
||||||
|
user_cols
|
||||||
|
)
|
||||||
|
table = self._generate_table(
|
||||||
|
table_idx, bbox, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
|
_tables.append(table)
|
||||||
|
|
||||||
|
return _tables
|
||||||
|
|
||||||
|
def record_parse_metadata(self, table):
|
||||||
|
"""Record data about the origin of the table
|
||||||
|
"""
|
||||||
|
table.flavor = self.id
|
||||||
|
table.filename = self.filename
|
||||||
|
table.parse = self.table_bbox_parses[table._bbox]
|
||||||
|
table.parse_details = self.parse_details
|
||||||
|
pos_errors = self.compute_parse_errors(table)
|
||||||
|
table.accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
|
if self.copy_text is not None:
|
||||||
|
table.copy_spanning_text(self.copy_text)
|
||||||
|
|
||||||
|
data = table.data
|
||||||
|
table.df = pd.DataFrame(data)
|
||||||
|
table.shape = table.df.shape
|
||||||
|
|
||||||
|
table.whitespace = compute_whitespace(data)
|
||||||
|
table.pdf_size = (self.pdf_width, self.pdf_height)
|
||||||
|
|
||||||
|
_text = []
|
||||||
|
_text.extend(
|
||||||
|
[(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
table._text = _text
|
||||||
|
table.textlines = self.horizontal_text + self.vertical_text
|
||||||
|
|
||||||
|
|
||||||
|
class TextBaseParser(BaseParser):
|
||||||
|
"""Base class for all text parsers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
parser_id,
|
||||||
|
table_regions=None,
|
||||||
|
table_areas=None,
|
||||||
|
columns=None,
|
||||||
|
flag_size=False,
|
||||||
|
split_text=False,
|
||||||
|
strip_text="",
|
||||||
|
edge_tol=50,
|
||||||
|
row_tol=2,
|
||||||
|
column_tol=0,
|
||||||
|
debug=False,
|
||||||
|
**kwargs):
|
||||||
|
super().__init__(
|
||||||
|
parser_id,
|
||||||
|
table_regions=table_regions,
|
||||||
|
table_areas=table_areas,
|
||||||
|
split_text=split_text,
|
||||||
|
strip_text=strip_text,
|
||||||
|
flag_size=flag_size,
|
||||||
|
debug=debug,
|
||||||
|
)
|
||||||
|
self.columns = columns
|
||||||
|
self._validate_columns()
|
||||||
|
self.edge_tol = edge_tol
|
||||||
|
self.row_tol = row_tol
|
||||||
|
self.column_tol = column_tol
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _group_rows(text, row_tol=2):
|
||||||
|
"""Groups PDFMiner text objects into rows vertically
|
||||||
|
within a tolerance.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
row_tol : int, optional (default: 2)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
|
||||||
|
"""
|
||||||
|
row_y = None
|
||||||
|
rows = []
|
||||||
|
temp = []
|
||||||
|
non_empty_text = [t for t in text if t.get_text().strip()]
|
||||||
|
for t in non_empty_text:
|
||||||
|
# is checking for upright necessary?
|
||||||
|
# if t.get_text().strip() and all([obj.upright \
|
||||||
|
# for obj in t._objs
|
||||||
|
# if type(obj) is LTChar]):
|
||||||
|
if row_y is None:
|
||||||
|
row_y = t.y0
|
||||||
|
elif not np.isclose(row_y, t.y0, atol=row_tol):
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
temp = []
|
||||||
|
# We update the row's bottom as we go, to be forgiving if there
|
||||||
|
# is a gradual change across multiple columns.
|
||||||
|
row_y = t.y0
|
||||||
|
temp.append(t)
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
return rows
|
||||||
|
|
||||||
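To see what _group_rows produces, here is a minimal sketch. Illustration only: the tl helper is a hypothetical stand-in for a PDFMiner text object, and the camelot.parsers.base module path is assumed from the relative imports.

# Illustration only, not part of the diff.
from types import SimpleNamespace

from camelot.parsers.base import TextBaseParser  # assumed module path


def tl(text, x0, y0):
    # Hypothetical stand-in exposing only what _group_rows reads.
    obj = SimpleNamespace(x0=x0, y0=y0)
    obj.get_text = lambda: text
    return obj


rows = TextBaseParser._group_rows(
    [tl("a", 0, 100), tl("b", 50, 100.5), tl("c", 0, 80)], row_tol=2)
# "a" and "b" share a row (their y0 differ by less than the tolerance);
# "c" starts a new one: [[<a>, <b>], [<c>]]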
    @staticmethod
    def _merge_columns(l, column_tol=0):
        """Merges column boundaries horizontally if they overlap
        or lie within a tolerance.

        Parameters
        ----------
        l : list
            List of column x-coordinate tuples.
        column_tol : int, optional (default: 0)

        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.

        """
        merged = []
        for higher in l:
            if not merged:
                merged.append(higher)
            else:
                lower = merged[-1]
                if column_tol >= 0:
                    if higher[0] <= lower[1] or np.isclose(
                        higher[0], lower[1], atol=column_tol
                    ):
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
                elif column_tol < 0:
                    if higher[0] <= lower[1]:
                        if np.isclose(higher[0], lower[1],
                                      atol=abs(column_tol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
                            lower_bound = min(lower[0], higher[0])
                            merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
        return merged
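To make the merge rule concrete, a worked illustration with invented interval values:

# Illustration only, not part of the diff: with the default column_tol=0,
# intervals that touch or overlap are fused, disjoint ones are kept.
TextBaseParser._merge_columns([(0, 10), (9, 20), (30, 40)])
# -> [(0, 20), (30, 40)]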
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous. For two rows to "touch",
        the existing gap between them is split in half.

        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int

        Returns
        -------
        rows : list
            List of continuous row y-coordinate tuples.

        """
        row_boundaries = [
            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
        for i in range(0, len(row_boundaries)-1):
            top_row = row_boundaries[i]
            bottom_row = row_boundaries[i+1]
            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
        row_boundaries[0][0] = text_y_max
        row_boundaries[-1][1] = text_y_min
        return row_boundaries

    @staticmethod
    def _add_columns(cols, text, row_tol):
        """Adds columns to the existing list by taking into account
        the text that lies outside the current column x-coordinates.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text : list
            List of PDFMiner text objects.
        row_tol : int

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        """
        if text:
            text = TextBaseParser._group_rows(text, row_tol=row_tol)
            elements = [len(r) for r in text]
            new_cols = [
                (t.x0, t.x1)
                for r in text if len(r) == max(elements)
                for t in r
            ]
            cols.extend(TextBaseParser._merge_columns(sorted(new_cols)))
        return cols

    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
        text_x_max : int

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        return cols
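A worked illustration of _join_columns with invented coordinates: gaps between column intervals are closed at their midpoints, then the outer edges are pinned to the text extent.

# Illustration only, not part of the diff.
TextBaseParser._join_columns([(0, 10), (20, 30)], text_x_min=0, text_x_max=40)
# The midpoint of the 10-20 gap is 15 -> [(0, 15), (15, 40)]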
    def _validate_columns(self):
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, bbox, cols, rows)
        table = table.set_all_edges()
        self.record_parse_metadata(table)

        return table

    def record_parse_metadata(self, table):
        """Record data about the origin of the table."""
        super().record_parse_metadata(table)
        # for plotting
        table._segments = None

@ -0,0 +1,235 @@
# -*- coding: utf-8 -*-

from ..utils import (
    bboxes_overlap,
    boundaries_to_split_lines,
)

import numpy as np
from .base import BaseParser
from .network import Network
from .lattice import Lattice


class Hybrid(BaseParser):
    """Defines a hybrid parser, leveraging both network and lattice parsers.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinate strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: 50)
        Tolerance parameter for extending textedges vertically.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.

    """

    def __init__(
            self,
            table_regions=None,
            table_areas=None,
            columns=None,
            flag_size=False,
            split_text=False,
            strip_text="",
            edge_tol=None,
            row_tol=2,
            column_tol=0,
            debug=False,
            **kwargs):
        super().__init__(
            "hybrid",
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            debug=debug,
        )
        self.columns = columns  # Column settings impact the hybrid table
        self.network_parser = Network(
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
        self.lattice_parser = Lattice(
            table_regions=table_regions,
            table_areas=table_areas,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
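For orientation, a sketch of how the new parser would be driven end to end. That read_pdf exposes it under flavor="hybrid" is an assumption here, suggested by the "hybrid" parser id above but not shown in this diff.

# Sketch only; the flavor="hybrid" wiring is assumed, not shown in this diff.
import camelot

tables = camelot.read_pdf("foo.pdf", pages="1", flavor="hybrid")
print(tables[0].df)  # extraction merged from lattice and network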
    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        super().prepare_page_parse(filename, layout, dimensions,
                                   page_idx, layout_kwargs)
        self.network_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)
        self.lattice_parser.prepare_page_parse(
            filename, layout, dimensions, page_idx, layout_kwargs)

    def _generate_columns_and_rows(self, bbox, table_idx):
        parser = self.table_bbox_parses[bbox]
        return parser._generate_columns_and_rows(bbox, table_idx)

    def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
        parser = self.table_bbox_parses[bbox]
        table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
        # Because hybrid can inject extraneous splits from both lattice and
        # network, remove rows / cols that are completely empty.
        table.df = table.df.replace('', np.nan)
        table.df = table.df.dropna(axis=0, how="all")
        table.df = table.df.dropna(axis=1, how="all")
        table.df = table.df.replace(np.nan, '')
        table.shape = table.df.shape
        return table
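The empty-row/column cleanup above is plain pandas; a self-contained illustration with invented data:

# Illustration only, not part of the diff.
import numpy as np
import pandas as pd

df = pd.DataFrame([["a", "", "b"],
                   ["",  "", ""],
                   ["c", "", "d"]])
df = df.replace('', np.nan)
df = df.dropna(axis=0, how="all").dropna(axis=1, how="all")
df = df.replace(np.nan, '')
# The all-empty middle row and middle column are gone:
#    0  2
# 0  a  b
# 2  c  d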
    @staticmethod
    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
        """Augment existing boundaries using provided hard splits.

        Boundaries: |---|   |-|     |---------|
        Splits:     |       |       |         |
        Augmented:  |-------|-----|-------|--|
        """
        idx_boundaries = len(boundaries) - 1
        idx_splits = len(splits) - 1
        previous_boundary = None
        while True:
            if idx_splits < 0:
                # No more splits to incorporate, we're done
                break
            split = splits[idx_splits]

            if idx_boundaries < 0:
                # Need to insert remaining splits
                new_boundary = [split, boundaries[0][0]]
                boundaries.insert(0, new_boundary)
                idx_splits = idx_splits - 1
            else:
                boundary = boundaries[idx_boundaries]
                if boundary[1] < split + tolerance:
                    # The lattice column is further to the right of our
                    # col boundary. We move our left boundary to match.
                    boundary[1] = split
                    # And if there was another segment after, we make its
                    # right boundary match as well so that there's no gap
                    if previous_boundary is not None:
                        previous_boundary[0] = split
                    idx_splits = idx_splits - 1
                elif boundary[0] > split - tolerance:
                    # Our boundary is fully after the split, move on
                    idx_boundaries = idx_boundaries - 1
                    previous_boundary = boundary
                    if idx_boundaries < 0:
                        # If this is the last boundary to the left, set its
                        # edge at the split
                        boundary[0] = split
                        idx_splits = idx_splits - 1
                else:
                    # The split is inside our boundary: split it
                    new_boundary = [split, boundary[1]]
                    boundaries.insert(idx_boundaries + 1, new_boundary)
                    boundary[1] = split
                    previous_boundary = new_boundary
                    idx_splits = idx_splits - 1
        return boundaries
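A traced example of the augmentation, with invented values: one lattice split at x=4 falls in the gap between two network column boundaries.

# Illustration only, not part of the diff.
boundaries = [[0, 2], [5, 6]]
Hybrid._augment_boundaries_with_splits(boundaries, splits=[4], tolerance=0)
# The split pulls the facing edges together so the columns meet at x=4:
# [[0, 4], [4, 6]]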
    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
        """Identify splits that were only detected by lattice or by network.
        """
        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
        lattice_cols = lattice_parse["col_anchors"]

        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
        network_cols_boundaries = network_bbox_data["cols_boundaries"]

        # Favor network, but complete or adjust its columns based on the
        # splits identified by lattice.
        if network_cols_boundaries is None:
            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
        else:
            network_cols_boundaries = self._augment_boundaries_with_splits(
                network_cols_boundaries,
                lattice_cols,
                self.lattice_parser.joint_tol)
            augmented_bbox = (
                network_cols_boundaries[0][0],
                min(lattice_bbox[1], network_bbox[1]),
                network_cols_boundaries[-1][1],
                max(lattice_bbox[3], network_bbox[3]),
            )
            network_bbox_data["cols_anchors"] = \
                boundaries_to_split_lines(network_cols_boundaries)

            del self.network_parser.table_bbox_parses[network_bbox]
            self.network_parser.table_bbox_parses[augmented_bbox] = \
                network_bbox_data
            self.table_bbox_parses[augmented_bbox] = self.network_parser

    def _generate_table_bbox(self):
        # Collect bboxes from both parsers
        self.lattice_parser._generate_table_bbox()
        _lattice_bboxes = sorted(
            self.lattice_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))
        self.network_parser._generate_table_bbox()
        _network_bboxes = sorted(
            self.network_parser.table_bbox_parses,
            key=lambda bbox: (bbox[0], -bbox[1]))

        # Merge the data from both processes
        for lattice_bbox in _lattice_bboxes:
            merged = False

            for idx in range(len(_network_bboxes)-1, -1, -1):
                network_bbox = _network_bboxes[idx]
                if not bboxes_overlap(lattice_bbox, network_bbox):
                    continue
                self._merge_bbox_analysis(lattice_bbox, network_bbox)
                # network_bbox_data["cols_boundaries"]
                del _network_bboxes[idx]
                merged = True
            if not merged:
                self.table_bbox_parses[lattice_bbox] = self.lattice_parser

        # Add the bboxes from network that haven't been merged
        for network_bbox in _network_bboxes:
            self.table_bbox_parses[network_bbox] = self.network_parser

@ -1,27 +1,16 @@
  # -*- coding: utf-8 -*-

  import os
- import sys
- import copy
- import locale
- import logging
- import warnings
- import subprocess
-
- import numpy as np
- import pandas as pd

  from .base import BaseParser
- from ..core import Table
  from ..utils import (
+     build_file_path_in_temp_dir,
+     export_pdf_as_png,
      scale_image,
      scale_pdf,
      segments_in_bbox,
-     text_in_bbox,
+     text_in_bbox_per_axis,
      merge_close_lines,
-     get_table_index,
-     compute_accuracy,
-     compute_whitespace,
  )
  from ..image_processing import (
      adaptive_threshold,
@ -31,9 +20,6 @@ from ..image_processing import (
  )

- logger = logging.getLogger("camelot")

  class Lattice(BaseParser):
      """Lattice method of parsing looks for lines between text
      to parse the table.
@ -79,7 +65,7 @@ class Lattice(BaseParser):
          Size of a pixel neighborhood that is used to calculate a
          threshold value for the pixel: 3, 5, 7, and so on.

-         For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+         For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
      threshold_constant : int, optional (default: -2)
          Constant subtracted from the mean or weighted mean.
          Normally, it is positive but may be zero or negative as well.
@ -95,39 +81,43 @@ class Lattice(BaseParser):
      """

      def __init__(
          self,
          table_regions=None,
          table_areas=None,
          process_background=False,
          line_scale=15,
          copy_text=None,
-         shift_text=["l", "t"],
+         shift_text=None,
          split_text=False,
          flag_size=False,
          strip_text="",
          line_tol=2,
          joint_tol=2,
          threshold_blocksize=15,
          threshold_constant=-2,
          iterations=0,
          resolution=300,
-         **kwargs
-     ):
-         self.table_regions = table_regions
-         self.table_areas = table_areas
+         **kwargs):
+         super().__init__(
+             "lattice",
+             table_regions=table_regions,
+             table_areas=table_areas,
+             split_text=split_text,
+             strip_text=strip_text,
+             copy_text=copy_text,
+             shift_text=shift_text or ["l", "t"],
+             flag_size=flag_size,
+         )
          self.process_background = process_background
          self.line_scale = line_scale
-         self.copy_text = copy_text
-         self.shift_text = shift_text
-         self.split_text = split_text
-         self.flag_size = flag_size
-         self.strip_text = strip_text
          self.line_tol = line_tol
          self.joint_tol = joint_tol
          self.threshold_blocksize = threshold_blocksize
          self.threshold_constant = threshold_constant
          self.iterations = iterations
          self.resolution = resolution
+         self.image_path = None
+         self.pdf_image = None

      @staticmethod
      def _reduce_index(t, idx, shift_text):
@ -174,51 +164,13 @@ class Lattice(BaseParser):
              indices.append((r_idx, c_idx, text))
          return indices

-     @staticmethod
-     def _copy_spanning_text(t, copy_text=None):
-         """Copies over text in empty spanning cells.
-
-         Parameters
-         ----------
-         t : camelot.core.Table
-         copy_text : list, optional (default: None)
-             {'h', 'v'}
-             Select one or more strings from above and pass them as a list
-             to specify the direction in which text should be copied over
-             when a cell spans multiple rows or columns.
-
-         Returns
-         -------
-         t : camelot.core.Table
-
-         """
-         for f in copy_text:
-             if f == "h":
-                 for i in range(len(t.cells)):
-                     for j in range(len(t.cells[i])):
-                         if t.cells[i][j].text.strip() == "":
-                             if t.cells[i][j].hspan and not t.cells[i][j].left:
-                                 t.cells[i][j].text = t.cells[i][j - 1].text
-             elif f == "v":
-                 for i in range(len(t.cells)):
-                     for j in range(len(t.cells[i])):
-                         if t.cells[i][j].text.strip() == "":
-                             if t.cells[i][j].vspan and not t.cells[i][j].top:
-                                 t.cells[i][j].text = t.cells[i - 1][j].text
-         return t
-
-     def _generate_image(self):
-         from ..ext.ghostscript import Ghostscript
-
-         self.imagename = "".join([self.rootname, ".png"])
-         gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
-             self.imagename, self.filename
-         )
-         gs_call = gs_call.encode().split()
-         null = open(os.devnull, "wb")
-         with Ghostscript(*gs_call, stdout=null) as gs:
-             pass
-         null.close()
+     def record_parse_metadata(self, table):
+         """Record data about the origin of the table."""
+         super().record_parse_metadata(table)
+         # for plotting
+         table._image = self.pdf_image  # Reuse the image used for calc
+         table._segments = (self.vertical_segments, self.horizontal_segments)

      def _generate_table_bbox(self):
          def scale_areas(areas):
@ -233,20 +185,26 @@ class Lattice(BaseParser):
              scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
              return scaled_areas

-         self.image, self.threshold = adaptive_threshold(
-             self.imagename,
+         self.image_path = build_file_path_in_temp_dir(
+             os.path.basename(self.filename),
+             ".png"
+         )
+         export_pdf_as_png(self.filename, self.image_path, self.resolution)
+         self.pdf_image, self.threshold = adaptive_threshold(
+             self.image_path,
              process_background=self.process_background,
              blocksize=self.threshold_blocksize,
              c=self.threshold_constant,
          )

-         image_width = self.image.shape[1]
-         image_height = self.image.shape[0]
+         image_width = self.pdf_image.shape[1]
+         image_height = self.pdf_image.shape[0]
          image_width_scaler = image_width / float(self.pdf_width)
          image_height_scaler = image_height / float(self.pdf_height)
          pdf_width_scaler = self.pdf_width / float(image_width)
          pdf_height_scaler = self.pdf_height / float(image_height)
-         image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
+         image_scalers = (image_width_scaler,
+                          image_height_scaler, self.pdf_height)
          pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

          if self.table_areas is None:
@ -288,46 +246,88 @@ class Lattice(BaseParser):
              areas = scale_areas(self.table_areas)
              table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

-         self.table_bbox_unscaled = copy.deepcopy(table_bbox)
-         self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
+         [
+             self.table_bbox_parses,
+             self.vertical_segments,
+             self.horizontal_segments
+         ] = scale_image(
              table_bbox, vertical_segments, horizontal_segments, pdf_scalers
          )

-     def _generate_columns_and_rows(self, table_idx, tk):
+         for bbox, parse in self.table_bbox_parses.items():
+             joints = parse["joints"]
+
+             # Merge x coordinates that are close together
+             line_tol = self.line_tol
+             # Sort the joints, make them a list of lists (instead of sets)
+             joints_normalized = list(
+                 map(
+                     lambda x: list(x),
+                     sorted(joints, key=lambda j: - j[0])
+                 )
+             )
+             for idx in range(1, len(joints_normalized)):
+                 x_left, x_right = \
+                     joints_normalized[idx-1][0], joints_normalized[idx][0]
+                 if x_left - line_tol <= x_right <= x_left + line_tol:
+                     joints_normalized[idx][0] = x_left
+
+             # Merge y coordinates that are close together
+             joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
+             for idx in range(1, len(joints_normalized)):
+                 y_bottom, y_top = \
+                     joints_normalized[idx-1][1], joints_normalized[idx][1]
+                 if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
+                     joints_normalized[idx][1] = y_bottom
+
+             # FRHTODO: check this is useful, otherwise get rid of the code
+             # above
+             parse["joints_normalized"] = joints_normalized
+
+             cols = list(map(lambda coords: coords[0], joints))
+             cols.extend([bbox[0], bbox[2]])
+             rows = list(map(lambda coords: coords[1], joints))
+             rows.extend([bbox[1], bbox[3]])
+
+             # sort horizontal and vertical segments
+             cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
+             rows = merge_close_lines(
+                 sorted(rows, reverse=True),
+                 line_tol=self.line_tol
+             )
+             parse["col_anchors"] = cols
+             parse["row_anchors"] = rows
+
+     def _generate_columns_and_rows(self, bbox, user_cols):
          # select elements which lie within table_bbox
-         t_bbox = {}
          v_s, h_s = segments_in_bbox(
-             tk, self.vertical_segments, self.horizontal_segments
+             bbox, self.vertical_segments, self.horizontal_segments
          )
-         t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
-         t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
-
-         t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
-         t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
-
-         self.t_bbox = t_bbox
-
-         cols, rows = zip(*self.table_bbox[tk])
-         cols, rows = list(cols), list(rows)
-         cols.extend([tk[0], tk[2]])
-         rows.extend([tk[1], tk[3]])
-         # sort horizontal and vertical segments
-         cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
-         rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
+         self.t_bbox = text_in_bbox_per_axis(
+             bbox,
+             self.horizontal_text,
+             self.vertical_text
+         )
+         parse = self.table_bbox_parses[bbox]

          # make grid using x and y coord of shortlisted rows and cols
-         cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
-         rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+         cols = [
+             (parse["col_anchors"][i], parse["col_anchors"][i + 1])
+             for i in range(0, len(parse["col_anchors"]) - 1)
+         ]
+         rows = [
+             (parse["row_anchors"][i], parse["row_anchors"][i + 1])
+             for i in range(0, len(parse["row_anchors"]) - 1)
+         ]
          return cols, rows, v_s, h_s

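The anchors-to-cells step above is a simple pairwise zip over sorted anchor coordinates; a worked illustration with invented values:

# Illustration only, not part of the diff.
col_anchors = [0, 50, 120, 200]
cols = [(col_anchors[i], col_anchors[i + 1])
        for i in range(0, len(col_anchors) - 1)]
# Three column intervals: [(0, 50), (50, 120), (120, 200)]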
-     def _generate_table(self, table_idx, cols, rows, **kwargs):
+     def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
          v_s = kwargs.get("v_s")
          h_s = kwargs.get("h_s")
          if v_s is None or h_s is None:
              raise ValueError("No segments found on {}".format(self.rootname))

-         table = Table(cols, rows)
+         table = self._initialize_new_table(table_idx, bbox, cols, rows)
          # set table edges to True using ver+hor lines
          table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
          # set table border edges to True
@ -335,81 +335,5 @@ class Lattice(BaseParser):
          # set spanning cells to True
          table = table.set_span()

-         pos_errors = []
-         # TODO: have a single list in place of two directional ones?
-         # sorted on x-coordinate based on reading order i.e. LTR or RTL
-         for direction in ["vertical", "horizontal"]:
-             for t in self.t_bbox[direction]:
-                 indices, error = get_table_index(
-                     table,
-                     t,
-                     direction,
-                     split_text=self.split_text,
-                     flag_size=self.flag_size,
-                     strip_text=self.strip_text,
-                 )
-                 if indices[:2] != (-1, -1):
-                     pos_errors.append(error)
-                     indices = Lattice._reduce_index(
-                         table, indices, shift_text=self.shift_text
-                     )
-                     for r_idx, c_idx, text in indices:
-                         table.cells[r_idx][c_idx].text = text
-         accuracy = compute_accuracy([[100, pos_errors]])
-
-         if self.copy_text is not None:
-             table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
-
-         data = table.data
-         table.df = pd.DataFrame(data)
-         table.shape = table.df.shape
-
-         whitespace = compute_whitespace(data)
-         table.flavor = "lattice"
-         table.accuracy = accuracy
-         table.whitespace = whitespace
-         table.order = table_idx + 1
-         table.page = int(os.path.basename(self.rootname).replace("page-", ""))
-
-         # for plotting
-         _text = []
-         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
-         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
-         table._text = _text
-         table._image = (self.image, self.table_bbox_unscaled)
-         table._segments = (self.vertical_segments, self.horizontal_segments)
-         table._textedges = None
+         self.record_parse_metadata(table)

          return table
-
-     def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
-         self._generate_layout(filename, layout_kwargs)
-         if not suppress_stdout:
-             logger.info("Processing {}".format(os.path.basename(self.rootname)))
-
-         if not self.horizontal_text:
-             if self.images:
-                 warnings.warn(
-                     "{} is image-based, camelot only works on"
-                     " text-based pages.".format(os.path.basename(self.rootname))
-                 )
-             else:
-                 warnings.warn(
-                     "No tables found on {}".format(os.path.basename(self.rootname))
-                 )
-             return []
-
-         self._generate_image()
-         self._generate_table_bbox()
-
-         _tables = []
-         # sort tables based on y-coord
-         for table_idx, tk in enumerate(
-             sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-         ):
-             cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
-             table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
-             table._bbox = tk
-             _tables.append(table)
-
-         return _tables
@ -0,0 +1,726 @@
# -*- coding: utf-8 -*-
"""Implementation of network table parser."""

from __future__ import division

import copy
import math
import numpy as np

from .base import TextBaseParser
from ..core import (
    TextAlignments,
    ALL_ALIGNMENTS,
    HORIZONTAL_ALIGNMENTS,
    VERTICAL_ALIGNMENTS
)
from ..utils import (
    bbox_from_str,
    text_in_bbox,
    textlines_overlapping_bbox,
    bbox_from_textlines,
    find_columns_boundaries,
    boundaries_to_split_lines,
    text_in_bbox_per_axis,
)

# Maximum number of columns over which a header can spread
MAX_COL_SPREAD_IN_HEADER = 3

# Minimum number of textlines in a table
MINIMUM_TEXTLINES_IN_TABLE = 6


def column_spread(left, right, col_anchors):
    """Get the number of columns crossed by a segment [left, right]."""
    index_left = 0
    while index_left < len(col_anchors) \
            and col_anchors[index_left] < left:
        index_left += 1
    index_right = index_left
    while index_right < len(col_anchors) \
            and col_anchors[index_right] < right:
        index_right += 1

    return index_right - index_left
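A quick check of the spread computation, with invented anchor values (the camelot.parsers.network module path is assumed from the relative imports):

# Illustration only, not part of the diff.
from camelot.parsers.network import column_spread  # assumed module path

column_spread(5, 25, [0, 10, 20, 30])
# Anchors 10 and 20 fall inside [5, 25], so the segment crosses 2 columns.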
def find_closest_tls(bbox, tls):
    """Search for the textlines that are closest to the bbox, but outside
    it, in all four directions.
    """
    left, right, top, bottom = None, None, None, None
    (bbox_left, bbox_bottom, bbox_right, bbox_top) = bbox
    for textline in tls:
        if textline.x1 < bbox_left:
            # Left: check it overlaps horizontally
            if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
                continue
            if left is None or left.x1 < textline.x1:
                left = textline
        elif bbox_right < textline.x0:
            # Right: check it overlaps horizontally
            if textline.y0 > bbox_top or textline.y1 < bbox_bottom:
                continue
            if right is None or right.x0 > textline.x0:
                right = textline
        else:
            # Either bottom or top: must overlap vertically
            if textline.x0 > bbox_right or textline.x1 < bbox_left:
                continue
            if textline.y1 < bbox_bottom:
                # Bottom
                if bottom is None or bottom.y1 < textline.y1:
                    bottom = textline
            elif bbox_top < textline.y0:
                # Top
                if top is None or top.y0 > textline.y0:
                    top = textline
    return {
        "left": left,
        "right": right,
        "top": top,
        "bottom": bottom,
    }


def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
    """Expand a bbox vertically up by looking for plausible headers.

    The core algorithm is based on fairly strict alignment of text. It works
    for the table body, but might fail on table headers since they tend to be
    in a different font, alignment (e.g. vertical), etc.
    This method evaluates the area above the table body's bbox for
    characteristics of a table header: close to the top of the body, with
    cells that fit within the horizontal bounds identified.
    """
    new_bbox = body_bbox
    (left, bottom, right, top) = body_bbox
    zones = []

    keep_searching = True
    while keep_searching:
        keep_searching = False
        # a/ first look for the closest text element above the bbox.
        # It will be the anchor for a possible new row.
        closest_above = None
        all_above = []
        for textline in textlines:
            # higher than the table, >50% within its bounds
            textline_center = 0.5 * (textline.x0 + textline.x1)
            if textline.y0 > top and left < textline_center < right:
                all_above.append(textline)
                if closest_above is None or closest_above.y0 > textline.y0:
                    closest_above = textline

        if closest_above and closest_above.y0 < top + max_v_gap:
            # b/ We have a candidate cell that is within the correct
            # vertical band, and directly above the table. Starting from
            # this anchor, we list all the textlines within the same row.
            tls_in_new_row = []
            top = closest_above.y1
            pushed_up = True
            while pushed_up:
                pushed_up = False
                # Iterate and extract elements that fit in the row
                # from our list
                for i in range(len(all_above) - 1, -1, -1):
                    textline = all_above[i]
                    if textline.y0 < top:
                        # The bottom of this element is within our row
                        # so we add it.
                        tls_in_new_row.append(textline)
                        all_above.pop(i)
                        if textline.y1 > top:
                            # If the top of this element raises our row's
                            # band, we'll need to keep on searching for
                            # overlapping items
                            top = textline.y1
                            pushed_up = True

            # Get the x-ranges for all the textlines, and merge the
            # x-ranges that overlap
            zones = zones + list(
                map(
                    lambda textline: [textline.x0, textline.x1],
                    tls_in_new_row
                )
            )
            zones.sort(key=lambda z: z[0])  # Sort by left coordinate
            # Starting from the right, if two zones overlap horizontally,
            # merge them
            merged_something = True
            while merged_something:
                merged_something = False
                for i in range(len(zones) - 1, 0, -1):
                    zone_right = zones[i]
                    zone_left = zones[i-1]
                    if zone_left[1] >= zone_right[0]:
                        zone_left[1] = max(zone_right[1], zone_left[1])
                        zones.pop(i)
                        merged_something = True

            max_spread = max(
                list(
                    map(
                        lambda zone: column_spread(
                            zone[0], zone[1], col_anchors),
                        zones
                    )
                )
            )

            # Accept textlines that cross column boundaries, as long as they
            # cross less than MAX_COL_SPREAD_IN_HEADER, and half the number
            # of columns.
            # This is to avoid picking unrelated paragraphs.
            if max_spread <= min(
                    MAX_COL_SPREAD_IN_HEADER,
                    math.ceil(len(col_anchors) / 2)):
                # Combined, the elements we've identified don't cross more
                # than the authorized number of columns.
                # We're trying to avoid
                # 0: <BAD: Added header spans too broad>
                # 1: <A1> <B1> <C1> <D1> <E1>
                # 2: <A2> <B2> <C2> <D2> <E2>
                # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
                new_bbox = (left, bottom, right, top)

                # At this stage we've identified a plausible row (or the
                # beginning of one).
                keep_searching = True
    return new_bbox


class AlignmentCounter():
    """For a given textline, represent all other textlines aligned with it.

    A textline can be vertically aligned with others if their bboxes match
    on the left, right, or middle coordinate, and horizontally aligned if
    they match the top, bottom, or center coordinate.
    """

    def __init__(self):
        self.alignment_to_occurrences = {}
        for alignment in ALL_ALIGNMENTS:
            self.alignment_to_occurrences[alignment] = []

    def __getitem__(self, key):
        return self.alignment_to_occurrences[key]

    def __setitem__(self, key, value):
        self.alignment_to_occurrences[key] = value
        return value

    def max_alignments(self, alignment_ids=None):
        """Get the alignment dimension with the max number of textlines."""
        alignment_ids = alignment_ids or self.alignment_to_occurrences.keys()
        alignment_items = map(
            lambda alignment_id: (
                alignment_id,
                self.alignment_to_occurrences[alignment_id]
            ),
            alignment_ids
        )
        return max(alignment_items, key=lambda item: len(item[1]))

    def max_v(self):
        """Tuple (alignment_id, textlines) of the largest vertical column."""
        # Note that the horizontal alignments (left, center, right) are
        # aligned vertically in a column, so max_v is calculated by looking
        # at horizontal alignments.
        return self.max_alignments(HORIZONTAL_ALIGNMENTS)

    def max_h(self):
        """Tuple (alignment_id, textlines) of the largest horizontal row."""
        return self.max_alignments(VERTICAL_ALIGNMENTS)

    def max_v_count(self):
        """Returns the maximum number of alignments along
        one of the vertical axes (left/right/middle).
        """
        return len(self.max_v()[1])

    def max_h_count(self):
        """Returns the maximum number of alignments along
        one of the horizontal axes (bottom/top/center).
        """
        return len(self.max_h()[1])

    def alignment_score(self):
        """We define the alignment score of a textline as the product of
        the number of aligned elements minus one in each direction. The
        minus one is to avoid favoring singletons on a long line.
        """
        return (self.max_v_count()-1) * (self.max_h_count()-1)
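A sketch of the score arithmetic. The alignment ids "left" and "top" are assumed here to be among the keys defined by ALL_ALIGNMENTS in camelot.core, which this diff does not show:

# Illustration only, not part of the diff; alignment ids are assumed.
counter = AlignmentCounter()
counter["left"] = ["tl", "a", "b", "c"]  # 4 textlines share a left edge
counter["top"] = ["tl", "d", "e"]        # 3 textlines share a top edge
counter.alignment_score()                # (4 - 1) * (3 - 1) == 6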
class TextNetworks(TextAlignments):
    """Text elements connected by vertical AND horizontal alignments.

    The alignment dict has six keys based on the hor/vert alignments,
    and each key's value is a list of camelot.core.TextAlignment objects.
    """

    def __init__(self):
        super().__init__(ALL_ALIGNMENTS)
        # For each textline, dictionary "alignment type" to
        # "number of textlines aligned"
        self._textline_to_alignments = {}

    def _update_alignment(self, alignment, coord, textline):
        alignment.register_aligned_textline(textline, coord)

    def _register_all_text_lines(self, textlines):
        """Add all textlines to our network repository to
        identify alignments.
        """
        # Identify all the alignments
        for textline in textlines:
            if len(textline.get_text().strip()) > 0:
                self._register_textline(textline)

    def _compute_alignment_counts(self):
        """Build a dictionary textline -> alignment object."""
        for align_id, textedges in self._text_alignments.items():
            for textedge in textedges:
                for textline in textedge.textlines:
                    alignments = self._textline_to_alignments.get(
                        textline, None)
                    if alignments is None:
                        alignments = AlignmentCounter()
                        self._textline_to_alignments[textline] = alignments
                    alignments[align_id] = textedge.textlines

    def remove_unconnected_edges(self):
        """Weed out elements which are only connected to others vertically
        or horizontally. There need to be connections across both
        dimensions.
        """
        removed_singletons = True
        while removed_singletons:
            removed_singletons = False
            for text_alignments in self._text_alignments.values():
                # For each alignment edge, remove items if they are
                # singletons either horizontally or vertically
                for text_alignment in text_alignments:
                    for i in range(len(text_alignment.textlines) - 1, -1, -1):
                        textline = text_alignment.textlines[i]
                        alignments = self._textline_to_alignments[textline]
                        if alignments.max_h_count() <= 1 or \
                                alignments.max_v_count() <= 1:
                            del text_alignment.textlines[i]
                            removed_singletons = True
            self._textline_to_alignments = {}
            self._compute_alignment_counts()

    def most_connected_textline(self):
        """Retrieve the textline that is most connected across the vertical
        and horizontal axes.
        """
        # Find the textline with the highest alignment score, with a tie
        # break to prefer textlines further down in the table. Starting the
        # search from the table's bottom allows the algo to collect data on
        # more cells before going to the header, typically harder to parse.
        return max(
            self._textline_to_alignments.keys(),
            key=lambda textline:
            (
                self._textline_to_alignments[textline].alignment_score(),
                -textline.y0, -textline.x0
            ),
            default=None
        )

    def compute_plausible_gaps(self):
        """Evaluate plausible gaps between cells horizontally and vertically
        based on the textlines aligned with the most connected textline.

        Returns
        -------
        gaps_hv : tuple
            (horizontal_gap, vertical_gap) in PDF coordinate space.

        """
        # Determine the textline that has the most combined
        # alignments across the horizontal and vertical axes.
        # It will serve as a reference axis along which to collect the
        # average spacing between rows/cols.
        most_aligned_tl = self.most_connected_textline()
        if most_aligned_tl is None:
            return None

        # Retrieve the list of textlines it's aligned with, across both
        # axes
        best_alignment = self._textline_to_alignments[most_aligned_tl]
        __, ref_h_textlines = best_alignment.max_h()
        __, ref_v_textlines = best_alignment.max_v()
        if len(ref_v_textlines) <= 1 or len(ref_h_textlines) <= 1:
            return None

        h_textlines = sorted(
            ref_h_textlines,
            key=lambda textline: textline.x0,
            reverse=True
        )
        v_textlines = sorted(
            ref_v_textlines,
            key=lambda textline: textline.y0,
            reverse=True
        )

        h_gaps, v_gaps = [], []
        for i in range(1, len(v_textlines)):
            v_gaps.append(v_textlines[i-1].y0 - v_textlines[i].y0)
        for i in range(1, len(h_textlines)):
            h_gaps.append(h_textlines[i-1].x0 - h_textlines[i].x0)

        if not h_gaps or not v_gaps:
            return None
        percentile = 75
        gaps_hv = (
            2.0 * np.percentile(h_gaps, percentile),
            2.0 * np.percentile(v_gaps, percentile)
        )
        return gaps_hv
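Doubling the 75th percentile gives the search headroom over typical row spacing without bridging outliers; a numeric illustration with invented gap values:

# Illustration only, not part of the diff.
import numpy as np

v_gaps = [10, 11, 12, 40]  # one outlier gap
threshold = 2.0 * np.percentile(v_gaps, 75)
# np.percentile(v_gaps, 75) == 19.0, so threshold == 38.0: wide enough
# for the regular 10-12 unit rows, too narrow for the 40-unit break.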
|
|
||||||
|
    def search_table_body(self, gaps_hv, parse_details=None):
        """Build a candidate bbox for the body of a table using network algo

        Seed the process with the textline with the highest alignment
        score, then expand the bbox with textlines within threshold.

        Parameters
        ----------
        gaps_hv : tuple
            The maximum distance allowed to consider surrounding lines/columns
            as part of the same table.
        parse_details : array (optional)
            Optional parameter array, in which to store extra information
            to help later visualization of the table creation.

        """
        # First, determine the textline that has the most combined
        # alignments across the horizontal and vertical axes.
        # It will serve both as a starting point for the table boundary
        # search, and as a way to estimate the average spacing between
        # rows/cols.
        most_aligned_tl = self.most_connected_textline()

        # Calculate the 75th percentile of the horizontal/vertical
        # gaps between textlines. Use this as a reference for a threshold
        # to not exceed while looking for table boundaries.
        max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]

        if parse_details is not None:
            # Store debug info
            parse_details_search = {
                "max_h_gap": max_h_gap,
                "max_v_gap": max_v_gap,
                "iterations": []
            }
            parse_details.append(parse_details_search)
        else:
            parse_details_search = None

        bbox = [most_aligned_tl.x0, most_aligned_tl.y0,
                most_aligned_tl.x1, most_aligned_tl.y1]

        # For the body of the table, we only consider cells that have
        # alignments on both axes.
        tls_search_space = list(self._textline_to_alignments.keys())
        tls_search_space.remove(most_aligned_tl)
        tls_in_bbox = [most_aligned_tl]
        last_bbox = None
        last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
        while last_bbox != bbox:
            if parse_details_search is not None:
                # Store debug info
                parse_details_search["iterations"].append(bbox)

            # Check that the closest tls are within the gaps allowed
            last_bbox = bbox
            cand_bbox = last_bbox.copy()
            closest_tls = find_closest_tls(bbox, tls_search_space)
            for direction, textline in closest_tls.items():
                if textline is None:
                    continue
                expanded_cand_bbox = cand_bbox.copy()

                if direction == "left":
                    if expanded_cand_bbox[0] - textline.x1 > gaps_hv[0]:
                        continue
                    expanded_cand_bbox[0] = textline.x0
                elif direction == "right":
                    if textline.x0 - expanded_cand_bbox[2] > gaps_hv[0]:
                        continue
                    expanded_cand_bbox[2] = textline.x1
                elif direction == "bottom":
                    if expanded_cand_bbox[1] - textline.y1 > gaps_hv[1]:
                        continue
                    expanded_cand_bbox[1] = textline.y0
                elif direction == "top":
                    if textline.y0 - expanded_cand_bbox[3] > gaps_hv[1]:
                        continue
                    expanded_cand_bbox[3] = textline.y1

                # If they are, see what an expanded bbox in that direction
                # would contain
                new_tls = text_in_bbox(expanded_cand_bbox, tls_search_space)
                tls_in_new_box = new_tls + tls_in_bbox

                # And if we're expanding up or down, check that the addition
                # of the new row won't reduce the number of columns.
                # This happens when text covers multiple rows - that's only
                # allowed in the header, treated separately.
                cols_bounds = find_columns_boundaries(tls_in_new_box)
                if direction in ["bottom", "top"] and \
                        len(cols_bounds) < len(last_cols_bounds):
                    continue

                # We have an expansion candidate: register it, update the
                # search space and repeat.
                # We use bbox_from_textlines instead of cand_bbox in case some
                # overlapping textlines require a large bbox for strict fit.
                bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
                last_cols_bounds = cols_bounds
                tls_in_bbox.extend(new_tls)
                for i in range(len(tls_search_space) - 1, -1, -1):
                    textline = tls_search_space[i]
                    if textline in new_tls:
                        del tls_search_space[i]

        if len(tls_in_bbox) >= MINIMUM_TEXTLINES_IN_TABLE:
            return bbox
        return None

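To make the per-direction expansion rule concrete, here is a minimal standalone restatement of the test used above (toy coordinates, hypothetical helper name): a neighboring textline is only merged when the empty space between it and the current bbox stays within the computed gap budget.

def within_gap(bbox, textline_bbox, direction, max_h_gap, max_v_gap):
    """Toy restatement of the per-direction check in search_table_body.

    Both boxes are (x0, y0, x1, y1) in PDF coordinates.
    """
    x0, y0, x1, y1 = textline_bbox
    if direction == "left":
        return bbox[0] - x1 <= max_h_gap
    if direction == "right":
        return x0 - bbox[2] <= max_h_gap
    if direction == "bottom":
        return bbox[1] - y1 <= max_v_gap
    return y0 - bbox[3] <= max_v_gap  # "top"

# A textline 12 points to the right of the box is merged when the
# horizontal budget is 35, but not when it is 10:
print(within_gap([100, 500, 200, 520], (212, 500, 260, 520), "right", 35, 20))  # True
print(within_gap([100, 500, 200, 520], (212, 500, 260, 520), "right", 10, 20))  # False
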
    def generate(self, textlines):
        """Generate the text edge dictionaries based on the
        input textlines.
        """
        self._register_all_text_lines(textlines)
        self._compute_alignment_counts()


class Network(TextBaseParser):
    """Network method of parsing looks for spaces between text
    to parse the table.

    If you want to specify columns when specifying multiple table
    areas, make sure that both lists have the same length.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: None)
        Tolerance parameter for extending textedges vertically.
        When None, a plausible value is computed from the detected
        gaps between textlines.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.

    """

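For orientation, this is how the new flavor would be exercised end to end — a sketch, assuming read_pdf dispatches on the flavor string registered in __init__ below the same way it does for "stream" and "lattice"; example.pdf is a placeholder file name:

import camelot

tables = camelot.read_pdf("example.pdf", flavor="network")  # placeholder file
print(tables[0].df)
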
    def __init__(
        self,
        table_regions=None,
        table_areas=None,
        columns=None,
        flag_size=False,
        split_text=False,
        strip_text="",
        edge_tol=None,
        row_tol=2,
        column_tol=0,
        debug=False,
        **kwargs):
        super().__init__(
            "network",
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )

    def _generate_table_bbox(self):
        user_provided_bboxes = None
        if self.table_areas is not None:
            # User gave us table areas already. We will use their coordinates
            # to find column anchors.
            user_provided_bboxes = []
            for area_str in self.table_areas:
                user_provided_bboxes.append(bbox_from_str(area_str))

        # Take all the textlines that are not just spaces
        all_textlines = [
            t for t in self.horizontal_text + self.vertical_text
            if len(t.get_text().strip()) > 0
        ]
        textlines = self._apply_regions_filter(all_textlines)

        textlines_processed = {}
        self.table_bbox_parses = {}
        if self.parse_details is not None:
            parse_details_network_searches = []
            self.parse_details["network_searches"] = \
                parse_details_network_searches
            parse_details_bbox_searches = []
            self.parse_details["bbox_searches"] = parse_details_bbox_searches
            self.parse_details["col_searches"] = []
        else:
            parse_details_network_searches = None
            parse_details_bbox_searches = None

        while True:
            # Find a bbox: either pulling from the user's or from the network
            # algorithm.

            # First look for the body of the table
            bbox_body = None
            if user_provided_bboxes is not None:
                if len(user_provided_bboxes) > 0:
                    bbox_body = user_provided_bboxes.pop()
            else:
                text_network = TextNetworks()
                text_network.generate(textlines)
                text_network.remove_unconnected_edges()
                gaps_hv = text_network.compute_plausible_gaps()
                if gaps_hv is None:
                    return None
                # edge_tol instructions override the calculated vertical gap
                edge_tol_hv = (
                    gaps_hv[0],
                    gaps_hv[1] if self.edge_tol is None else self.edge_tol
                )
                bbox_body = text_network.search_table_body(
                    edge_tol_hv,
                    parse_details=parse_details_bbox_searches
                )

                if parse_details_network_searches is not None:
                    # Preserve the current edge calculation for debugging
                    parse_details_network_searches.append(
                        copy.deepcopy(text_network)
                    )

            if bbox_body is None:
                break

            # Get all the textlines that overlap with the box, compute
            # columns
            tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
            cols_boundaries = find_columns_boundaries(tls_in_bbox)
            cols_anchors = boundaries_to_split_lines(cols_boundaries)

            # Unless the user gave us a strict bbox_body, try to find a header
            # above the body to build the full bbox.
            if user_provided_bboxes is not None:
                bbox_full = bbox_body
            else:
                # Expand the text box to fully contain the tls we found
                bbox_body = bbox_from_textlines(tls_in_bbox)

                # Apply a heuristic to salvage headers whose formatting might
                # be off compared to the rest of the table.
                bbox_full = search_header_from_body_bbox(
                    bbox_body,
                    textlines,
                    cols_anchors,
                    gaps_hv[1]
                )

            table_parse = {
                "bbox_body": bbox_body,
                "cols_boundaries": cols_boundaries,
                "cols_anchors": cols_anchors,
                "bbox_full": bbox_full
            }
            self.table_bbox_parses[bbox_full] = table_parse

            if self.parse_details is not None:
                self.parse_details["col_searches"].append(table_parse)

            # Remember what textlines we processed, and repeat
            for textline in tls_in_bbox:
                textlines_processed[textline] = None
            textlines = list(filter(
                lambda textline: textline not in textlines_processed,
                textlines
            ))

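For reference, one entry of self.table_bbox_parses produced above has the following shape; the coordinates are invented for illustration:

# {
#     (60.0, 120.5, 540.0, 700.0): {            # keyed by bbox_full
#         "bbox_body": (60.0, 120.5, 540.0, 660.0),
#         "cols_boundaries": [(60.0, 180.0), (210.0, 330.0), (350.0, 540.0)],
#         "cols_anchors": [60.0, 195.0, 340.0, 540.0],
#         "bbox_full": (60.0, 120.5, 540.0, 700.0),
#     }
# }
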
    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )

        all_tls = list(
            sorted(
                filter(
                    lambda textline: len(textline.get_text().strip()) > 0,
                    self.t_bbox["horizontal"] + self.t_bbox["vertical"]
                ),
                key=lambda textline: (-textline.y0, textline.x0)
            )
        )
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            all_tls
        )
        # FRHTODO:
        # This algorithm takes the horizontal textlines in the bbox, and groups
        # them into rows based on their bottom y0.
        # That's wrong: it misses the vertical items, and misses out on all
        # the alignment identification work we've done earlier.
        rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

        if user_cols is not None:
            cols = [text_x_min] + user_cols + [text_x_max]
            cols = [
                (cols[i], cols[i + 1])
                for i in range(0, len(cols) - 1)
            ]
        else:
            parse_details = self.table_bbox_parses[bbox]
            col_anchors = parse_details["cols_anchors"]
            cols = list(map(
                lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
                range(0, len(col_anchors) - 1)
            ))

        return cols, rows, None, None

@@ -1,21 +1,18 @@
 # -*- coding: utf-8 -*-
 
-import os
-import logging
 import warnings
 
-import numpy as np
-import pandas as pd
-
-from .base import BaseParser
-from ..core import TextEdges, Table
-from ..utils import text_in_bbox, get_table_index, compute_accuracy, compute_whitespace
-
-
-logger = logging.getLogger("camelot")
-
-
-class Stream(BaseParser):
+from .base import TextBaseParser
+from ..core import TextEdges
+from ..utils import (
+    bbox_from_str,
+    bbox_from_textlines,
+    text_in_bbox,
+    text_in_bbox_per_axis,
+)
+
+
+class Stream(TextBaseParser):
     """Stream method of parsing looks for spaces between text
     to parse the table.
 
@@ -55,218 +52,35 @@ class Stream(BaseParser):
     """
 
     def __init__(
         self,
         table_regions=None,
         table_areas=None,
         columns=None,
-        split_text=False,
         flag_size=False,
+        split_text=False,
         strip_text="",
         edge_tol=50,
         row_tol=2,
         column_tol=0,
-        **kwargs
-    ):
-        self.table_regions = table_regions
-        self.table_areas = table_areas
-        self.columns = columns
-        self._validate_columns()
-        self.split_text = split_text
-        self.flag_size = flag_size
-        self.strip_text = strip_text
-        self.edge_tol = edge_tol
-        self.row_tol = row_tol
-        self.column_tol = column_tol
-
-    @staticmethod
-    def _text_bbox(t_bbox):
-        """Returns bounding box for the text present on a page.
-
-        Parameters
-        ----------
-        t_bbox : dict
-            Dict with two keys 'horizontal' and 'vertical' with lists of
-            LTTextLineHorizontals and LTTextLineVerticals respectively.
-
-        Returns
-        -------
-        text_bbox : tuple
-            Tuple (x0, y0, x1, y1) in pdf coordinate space.
-
-        """
-        xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
-        ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
-        xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
-        ymax = max([t.y1 for direction in t_bbox for t in t_bbox[direction]])
-        text_bbox = (xmin, ymin, xmax, ymax)
-        return text_bbox
-
-    @staticmethod
-    def _group_rows(text, row_tol=2):
-        """Groups PDFMiner text objects into rows vertically
-        within a tolerance.
-
-        Parameters
-        ----------
-        text : list
-            List of PDFMiner text objects.
-        row_tol : int, optional (default: 2)
-
-        Returns
-        -------
-        rows : list
-            Two-dimensional list of text objects grouped into rows.
-
-        """
-        row_y = 0
-        rows = []
-        temp = []
-        for t in text:
-            # is checking for upright necessary?
-            # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-            # type(obj) is LTChar]):
-            if t.get_text().strip():
-                if not np.isclose(row_y, t.y0, atol=row_tol):
-                    rows.append(sorted(temp, key=lambda t: t.x0))
-                    temp = []
-                    row_y = t.y0
-                temp.append(t)
-        rows.append(sorted(temp, key=lambda t: t.x0))
-        __ = rows.pop(0)  # TODO: hacky
-        return rows
-
-    @staticmethod
-    def _merge_columns(l, column_tol=0):
-        """Merges column boundaries horizontally if they overlap
-        or lie within a tolerance.
-
-        Parameters
-        ----------
-        l : list
-            List of column x-coordinate tuples.
-        column_tol : int, optional (default: 0)
-
-        Returns
-        -------
-        merged : list
-            List of merged column x-coordinate tuples.
-
-        """
-        merged = []
-        for higher in l:
-            if not merged:
-                merged.append(higher)
-            else:
-                lower = merged[-1]
-                if column_tol >= 0:
-                    if higher[0] <= lower[1] or np.isclose(
-                        higher[0], lower[1], atol=column_tol
-                    ):
-                        upper_bound = max(lower[1], higher[1])
-                        lower_bound = min(lower[0], higher[0])
-                        merged[-1] = (lower_bound, upper_bound)
-                    else:
-                        merged.append(higher)
-                elif column_tol < 0:
-                    if higher[0] <= lower[1]:
-                        if np.isclose(higher[0], lower[1], atol=abs(column_tol)):
-                            merged.append(higher)
-                        else:
-                            upper_bound = max(lower[1], higher[1])
-                            lower_bound = min(lower[0], higher[0])
-                            merged[-1] = (lower_bound, upper_bound)
-                    else:
-                        merged.append(higher)
-        return merged
-
-    @staticmethod
-    def _join_rows(rows_grouped, text_y_max, text_y_min):
-        """Makes row coordinates continuous.
-
-        Parameters
-        ----------
-        rows_grouped : list
-            Two-dimensional list of text objects grouped into rows.
-        text_y_max : int
-        text_y_min : int
-
-        Returns
-        -------
-        rows : list
-            List of continuous row y-coordinate tuples.
-
-        """
-        row_mids = [
-            sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) if len(r) > 0 else 0
-            for r in rows_grouped
-        ]
-        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
-        rows.insert(0, text_y_max)
-        rows.append(text_y_min)
-        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
-        return rows
-
-    @staticmethod
-    def _add_columns(cols, text, row_tol):
-        """Adds columns to existing list by taking into account
-        the text that lies outside the current column x-coordinates.
-
-        Parameters
-        ----------
-        cols : list
-            List of column x-coordinate tuples.
-        text : list
-            List of PDFMiner text objects.
-        ytol : int
-
-        Returns
-        -------
-        cols : list
-            Updated list of column x-coordinate tuples.
-
-        """
-        if text:
-            text = Stream._group_rows(text, row_tol=row_tol)
-            elements = [len(r) for r in text]
-            new_cols = [
-                (t.x0, t.x1) for r in text if len(r) == max(elements) for t in r
-            ]
-            cols.extend(Stream._merge_columns(sorted(new_cols)))
-        return cols
-
-    @staticmethod
-    def _join_columns(cols, text_x_min, text_x_max):
-        """Makes column coordinates continuous.
-
-        Parameters
-        ----------
-        cols : list
-            List of column x-coordinate tuples.
-        text_x_min : int
-        text_y_max : int
-
-        Returns
-        -------
-        cols : list
-            Updated list of column x-coordinate tuples.
-
-        """
-        cols = sorted(cols)
-        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
-        cols.insert(0, text_x_min)
-        cols.append(text_x_max)
-        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
-        return cols
-
-    def _validate_columns(self):
-        if self.table_areas is not None and self.columns is not None:
-            if len(self.table_areas) != len(self.columns):
-                raise ValueError("Length of table_areas and columns" " should be equal")
+        **kwargs):
+        super().__init__(
+            "stream",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
+        )
+        self.textedges = []
 
     def _nurminen_table_detection(self, textlines):
         """A general implementation of the table detection algorithm
         described by Anssi Nurminen's master's thesis.
-        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
+        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3  # noqa
 
         Assumes that tables are situated relatively far apart
         vertically.
@@ -283,65 +97,59 @@ class Stream(BaseParser):
         # guess table areas using textlines and relevant edges
         table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
         # treat whole page as table area if no table areas found
-        if not len(table_bbox):
+        if not table_bbox:
             table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
 
         return table_bbox
 
+    def record_parse_metadata(self, table):
+        """Record data about the origin of the table
+        """
+        super().record_parse_metadata(table)
+        table._textedges = self.textedges
+
     def _generate_table_bbox(self):
-        self.textedges = []
         if self.table_areas is None:
             hor_text = self.horizontal_text
             if self.table_regions is not None:
                 # filter horizontal text
                 hor_text = []
-                for region in self.table_regions:
-                    x1, y1, x2, y2 = region.split(",")
-                    x1 = float(x1)
-                    y1 = float(y1)
-                    x2 = float(x2)
-                    y2 = float(y2)
-                    region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
+                for region_str in self.table_regions:
+                    region_text = text_in_bbox(
+                        bbox_from_str(region_str),
+                        self.horizontal_text)
                     hor_text.extend(region_text)
             # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(hor_text)
+            table_bbox_parses = self._nurminen_table_detection(hor_text)
         else:
-            table_bbox = {}
-            for area in self.table_areas:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = float(x1)
-                y1 = float(y1)
-                x2 = float(x2)
-                y2 = float(y2)
-                table_bbox[(x1, y2, x2, y1)] = None
-        self.table_bbox = table_bbox
+            table_bbox_parses = {}
+            for area_str in self.table_areas:
+                table_bbox_parses[bbox_from_str(area_str)] = None
+        self.table_bbox_parses = table_bbox_parses
 
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, user_cols):
         # select elements which lie within table_bbox
-        t_bbox = {}
-        t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
-        t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
-
-        t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
-        t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
-
-        self.t_bbox = t_bbox
-
-        text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox)
-        rows_grouped = self._group_rows(self.t_bbox["horizontal"], row_tol=self.row_tol)
+        self.t_bbox = text_in_bbox_per_axis(
+            bbox,
+            self.horizontal_text,
+            self.vertical_text
+        )
+
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
+            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
+        )
+
+        rows_grouped = self._group_rows(
+            self.t_bbox["horizontal"], row_tol=self.row_tol)
         rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
         elements = [len(r) for r in rows_grouped]
 
-        if self.columns is not None and self.columns[table_idx] != "":
-            # user has to input boundary columns too
-            # take (0, pdf_width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns[table_idx].split(",")
-            cols = [float(c) for c in cols]
-            cols.insert(0, text_x_min)
-            cols.append(text_x_max)
-            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        if user_cols is not None:
+            cols = [text_x_min] + user_cols + [text_x_max]
+            cols = [
+                (cols[i], cols[i + 1])
+                for i in range(0, len(cols) - 1)
+            ]
         else:
             # calculate mode of the list of number of elements in
             # each row to guess the number of columns
@@ -353,14 +161,22 @@ class Stream(BaseParser):
             # see if the list contains elements, if yes, then use
             # the mode after removing 1s
             elements = list(filter(lambda x: x != 1, elements))
-            if len(elements):
+            if elements:
                 ncols = max(set(elements), key=elements.count)
             else:
                 warnings.warn(
-                    f"No tables found in table area {table_idx + 1}"
+                    f"No tables found in table area {bbox}"
                 )
-            cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
-            cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
+            cols = [
+                (t.x0, t.x1)
+                for r in rows_grouped
+                if len(r) == ncols
+                for t in r
+            ]
+            cols = self._merge_columns(
+                sorted(cols),
+                column_tol=self.column_tol
+            )
             inner_text = []
             for i in range(1, len(cols)):
                 left = cols[i - 1][1]
@@ -383,80 +199,4 @@ class Stream(BaseParser):
             cols = self._add_columns(cols, inner_text, self.row_tol)
             cols = self._join_columns(cols, text_x_min, text_x_max)
 
-        return cols, rows
-
-    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        table = Table(cols, rows)
-        table = table.set_all_edges()
-
-        pos_errors = []
-        # TODO: have a single list in place of two directional ones?
-        # sorted on x-coordinate based on reading order i.e. LTR or RTL
-        for direction in ["vertical", "horizontal"]:
-            for t in self.t_bbox[direction]:
-                indices, error = get_table_index(
-                    table,
-                    t,
-                    direction,
-                    split_text=self.split_text,
-                    flag_size=self.flag_size,
-                    strip_text=self.strip_text,
-                )
-                if indices[:2] != (-1, -1):
-                    pos_errors.append(error)
-                    for r_idx, c_idx, text in indices:
-                        table.cells[r_idx][c_idx].text = text
-        accuracy = compute_accuracy([[100, pos_errors]])
-
-        data = table.data
-        table.df = pd.DataFrame(data)
-        table.shape = table.df.shape
-
-        whitespace = compute_whitespace(data)
-        table.flavor = "stream"
-        table.accuracy = accuracy
-        table.whitespace = whitespace
-        table.order = table_idx + 1
-        table.page = int(os.path.basename(self.rootname).replace("page-", ""))
-
-        # for plotting
-        _text = []
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
-        table._text = _text
-        table._image = None
-        table._segments = None
-        table._textedges = self.textedges
-
-        return table
-
-    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
-        self._generate_layout(filename, layout_kwargs)
-        base_filename = os.path.basename(self.rootname)
-
-        if not suppress_stdout:
-            logger.info(f"Processing {base_filename}")
-
-        if not self.horizontal_text:
-            if self.images:
-                warnings.warn(
-                    f"{base_filename} is image-based, camelot only works on"
-                    " text-based pages."
-                )
-            else:
-                warnings.warn(f"No tables found on {base_filename}")
-            return []
-
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, tk in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows = self._generate_columns_and_rows(table_idx, tk)
-            table = self._generate_table(table_idx, cols, rows)
-            table._bbox = tk
-            _tables.append(table)
-
-        return _tables
+        return cols, rows, None, None
@@ -8,9 +8,164 @@ except ImportError:
 else:
     _HAS_MPL = True
 
+from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
 
-class PlotMethods(object):
-    def __call__(self, table, kind="text", filename=None):
+from pdfminer.layout import (
+    LTTextLineVertical,
+)
+
+
+def extend_axe_lim(ax, bbox, margin=10):
+    """Ensure the ax limits include the input bbox
+    """
+    x0, x1 = ax.get_xlim()
+    y0, y1 = ax.get_ylim()
+    ax.set_xlim(min(x0, bbox[0] - margin), max(x1, bbox[2] + margin))
+    ax.set_ylim(min(y0, bbox[1] - margin), max(y1, bbox[3] + margin))
+
+
+def draw_labeled_bbox(
+    ax, bbox, text,
+    color="black", linewidth=3,
+    linestyle="solid",
+    label_pos="top,left",
+    fontsize=12,
+):
+    """Utility drawing function to draw a box with an associated text label
+    """
+    ax.add_patch(
+        patches.Rectangle(
+            (bbox[0], bbox[1]),
+            bbox[2] - bbox[0], bbox[3] - bbox[1],
+            color=color,
+            linewidth=linewidth, linestyle=linestyle,
+            fill=False
+        )
+    )
+
+    vlabel, hlabel = label_pos.split(",")
+    if vlabel == "top":
+        y = max(bbox[1], bbox[3])
+    elif vlabel == "bottom":
+        y = min(bbox[1], bbox[3])
+    else:
+        y = 0.5 * (bbox[1] + bbox[3])
+
+    # We want to draw the label outside the box (above or below)
+    label_align_swap = {
+        "top": "bottom",
+        "bottom": "top",
+        "center": "center"
+    }
+    vlabel_out_of_box = label_align_swap[vlabel]
+    if hlabel == "right":
+        x = max(bbox[0], bbox[2])
+    elif hlabel == "left":
+        x = min(bbox[0], bbox[2])
+    else:
+        x = 0.5 * (bbox[0] + bbox[2])
+    ax.text(
+        x, y,
+        text,
+        fontsize=fontsize, color="black",
+        verticalalignment=vlabel_out_of_box,
+        horizontalalignment=hlabel,
+        bbox=dict(facecolor=color, alpha=0.1)
+    )
+
+
+def draw_pdf(table, ax):
+    """Draw the content of the table's source pdf into the passed subplot
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    """
+    img = table.get_pdf_image()
+    ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
+
+
+def draw_parse_constraints(table, ax):
+    """Draw any user provided constraints (area, region, columns, etc)
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    """
+    if table.parse_details:
+        zone_constraints = {
+            "region": "table_regions",
+            "area": "table_areas",
+        }
+        for zone_name, zone_id in zone_constraints.items():
+            # Display a bbox per region / area
+            for zone_str in table.parse_details[zone_id] or []:
+                draw_labeled_bbox(
+                    ax, bbox_from_str(zone_str),
+                    "{zone_name}: ({zone_str})".format(
+                        zone_name=zone_name,
+                        zone_str=zone_str
+                    ),
+                    color="purple",
+                    linestyle="dotted",
+                    linewidth=1,
+                    label_pos="bottom,right"
+                )
+
+
+def draw_text(table, ax):
+    """Draw text, horizontal in blue, vertical in red
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    """
+    bbox = bbox_from_textlines(table.textlines)
+    for t in table.textlines:
+        color = "red" if isinstance(t, LTTextLineVertical) else "blue"
+        ax.add_patch(
+            patches.Rectangle(
+                (t.x0, t.y0),
+                t.x1 - t.x0,
+                t.y1 - t.y0,
+                color=color,
+                alpha=0.2
+            )
+        )
+    extend_axe_lim(ax, bbox)
+
+
+def prepare_plot(table, ax=None):
+    """Initialize plot and draw common components
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes (optional)
+
+    Returns
+    -------
+    ax : matplotlib.axes.Axes
+    """
+    if ax is None:
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect="equal")
+    draw_pdf(table, ax)
+    draw_parse_constraints(table, ax)
+    return ax
+
+
+class PlotMethods():
+    def __call__(self, table, kind="text", filename=None, ax=None):
         """Plot elements found on PDF page based on kind
         specified, useful for debugging and playing with different
         parameters to get the best output.
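The ax parameter threaded through prepare_plot and the draw_* helpers lets callers compose several diagnostic views on one figure. A sketch, assuming camelot.plot forwards to PlotMethods.__call__ as it does in the current package, with example.pdf as a placeholder file:

import camelot
import matplotlib.pyplot as plt

tables = camelot.read_pdf("example.pdf", flavor="network")  # placeholder
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
camelot.plot(tables[0], kind="text", ax=ax1)
camelot.plot(tables[0], kind="network_table_search", ax=ax2)
plt.show()
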
@@ -20,7 +175,8 @@ class PlotMethods(object):
         table: camelot.core.Table
             A Camelot Table.
         kind : str, optional (default: 'text')
-            {'text', 'grid', 'contour', 'joint', 'line'}
+            {'text', 'grid', 'contour', 'joint', 'line',
+             'network_table_search'}
             The element type for which a plot should be generated.
         filepath: str, optional (default: None)
             Absolute path for saving the generated plot.
@@ -37,53 +193,49 @@ class PlotMethods(object):
             raise NotImplementedError(
                 f"Lattice flavor does not support kind='{kind}'"
             )
-        elif table.flavor == "stream" and kind in ["joint", "line"]:
+        if table.flavor != "lattice" and kind in ["line"]:
             raise NotImplementedError(
-                f"Stream flavor does not support kind='{kind}'"
+                f"{table.flavor} flavor does not support kind='{kind}'"
             )
 
         plot_method = getattr(self, kind)
-        return plot_method(table)
+        return plot_method(table, ax)
 
-    def text(self, table):
+    @staticmethod
+    def text(table, ax=None):
         """Generates a plot for all text elements present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
-        xs, ys = [], []
-        for t in table._text:
-            xs.extend([t[0], t[2]])
-            ys.extend([t[1], t[3]])
-            ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
-        ax.set_xlim(min(xs) - 10, max(xs) + 10)
-        ax.set_ylim(min(ys) - 10, max(ys) + 10)
-        return fig
+        ax = prepare_plot(table, ax)
+        draw_text(table, ax)
+        return ax.get_figure()
 
-    def grid(self, table):
+    @staticmethod
+    def grid(table, ax=None):
         """Generates a plot for the detected table grids
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
+        ax = prepare_plot(table, ax)
         for row in table.cells:
             for cell in row:
                 if cell.left:
@@ -94,130 +246,247 @@ class PlotMethods(object):
                     ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
                 if cell.bottom:
                     ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
-        return fig
+        return ax.get_figure()
 
-    def contour(self, table):
+    @staticmethod
+    def contour(table, ax=None):
         """Generates a plot for all table boundaries present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        try:
-            img, table_bbox = table._image
-            _FOR_LATTICE = True
-        except TypeError:
-            img, table_bbox = (None, {table._bbox: None})
-            _FOR_LATTICE = False
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
-
-        xs, ys = [], []
+        _FOR_LATTICE = table.flavor == "lattice"
+        ax = prepare_plot(table, ax)
+
         if not _FOR_LATTICE:
-            for t in table._text:
-                xs.extend([t[0], t[2]])
-                ys.extend([t[1], t[3]])
-                ax.add_patch(
-                    patches.Rectangle(
-                        (t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue"
-                    )
-                )
-
-        for t in table_bbox.keys():
-            ax.add_patch(
-                patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
-                )
-            )
-            if not _FOR_LATTICE:
-                xs.extend([t[0], t[2]])
-                ys.extend([t[1], t[3]])
-                ax.set_xlim(min(xs) - 10, max(xs) + 10)
-                ax.set_ylim(min(ys) - 10, max(ys) + 10)
-
-        if _FOR_LATTICE:
-            ax.imshow(img)
-        return fig
+            draw_text(table, ax)
+
+        ax.add_patch(
+            patches.Rectangle(
+                (table._bbox[0], table._bbox[1]),
+                table._bbox[2] - table._bbox[0],
+                table._bbox[3] - table._bbox[1],
+                fill=False, color="red"
+            )
+        )
+        if not _FOR_LATTICE:
+            extend_axe_lim(ax, table._bbox)
+
+        return ax.get_figure()
 
-    def textedge(self, table):
+    @staticmethod
+    def textedge(table, ax=None):
         """Generates a plot for relevant textedges.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
-        xs, ys = [], []
-        for t in table._text:
-            xs.extend([t[0], t[2]])
-            ys.extend([t[1], t[3]])
-            ax.add_patch(
-                patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
-            )
-        ax.set_xlim(min(xs) - 10, max(xs) + 10)
-        ax.set_ylim(min(ys) - 10, max(ys) + 10)
-
-        for te in table._textedges:
-            ax.plot([te.x, te.x], [te.y0, te.y1])
-
-        return fig
+        ax = prepare_plot(table, ax)
+        draw_text(table, ax)
+
+        if table.flavor == "network":
+            for network in table.parse_details["network_searches"]:
+                most_connected_tl = network.most_connected_textline()
+
+                ax.add_patch(
+                    patches.Rectangle(
+                        (most_connected_tl.x0, most_connected_tl.y0),
+                        most_connected_tl.x1 - most_connected_tl.x0,
+                        most_connected_tl.y1 - most_connected_tl.y0,
+                        color="red",
+                        alpha=0.5
+                    )
+                )
+                for tl in sorted(
+                        network._textline_to_alignments.keys(),
+                        key=lambda textline: (-textline.y0, textline.x0)
+                ):
+                    alignments = network._textline_to_alignments[tl]
+                    coords = get_textline_coords(tl)
+                    alignment_id_h, tls_h = alignments.max_v()
+                    alignment_id_v, tls_v = alignments.max_h()
+                    xs = list(map(lambda tl: tl.x0, tls_v))
+                    ys = list(map(lambda tl: tl.y1, tls_h))
+                    top_h = max(ys)
+                    ax.text(
+                        coords[alignment_id_h],
+                        top_h + 5,
+                        "{max_h_count}".format(max_h_count=len(tls_h)),
+                        verticalalignment="bottom",
+                        horizontalalignment="center",
+                        fontsize=8,
+                        color="green"
+                    )
+                    ax.plot(
+                        [coords[alignment_id_h]] * len(ys), ys,
+                        color="green",
+                        linestyle="solid",
+                        linewidth=1,
+                        marker="o",
+                        markersize=3
+                    )
+                    left_v = min(map(lambda tl: tl.x0, tls_v))
+                    ax.text(
+                        left_v - 5,
+                        coords[alignment_id_v],
+                        "{max_v_count}".format(max_v_count=len(tls_v)),
+                        verticalalignment="center",
+                        horizontalalignment="right",
+                        fontsize=8,
+                        color="blue"
+                    )
+                    ax.plot(
+                        xs, [coords[alignment_id_v]] * len(xs),
+                        color="blue",
+                        linestyle="solid",
+                        linewidth=1,
+                        marker="o",
+                        markersize=3
+                    )
+        else:
+            for te in table._textedges:
+                ax.plot([te.coord, te.coord], [te.y0, te.y1])
+        return ax.get_figure()
 
-    def joint(self, table):
+    @staticmethod
+    def joint(table, ax=None):
         """Generates a plot for all line intersections present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        img, table_bbox = table._image
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
+        ax = prepare_plot(table, ax)
         x_coord = []
         y_coord = []
-        for k in table_bbox.keys():
-            for coord in table_bbox[k]:
-                x_coord.append(coord[0])
-                y_coord.append(coord[1])
+        for coord in table.parse["joints"]:
+            x_coord.append(coord[0])
+            y_coord.append(coord[1])
         ax.plot(x_coord, y_coord, "ro")
-        ax.imshow(img)
-        return fig
+        return ax.get_figure()
 
-    def line(self, table):
+    @staticmethod
+    def line(table, ax=None):
         """Generates a plot for all line segments present
         on the PDF page.
 
         Parameters
         ----------
         table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
 
         Returns
         -------
         fig : matplotlib.fig.Figure
 
         """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
+        ax = prepare_plot(table, ax)
         vertical, horizontal = table._segments
         for v in vertical:
             ax.plot([v[0], v[2]], [v[1], v[3]])
         for h in horizontal:
             ax.plot([h[0], h[2]], [h[1], h[3]])
-        return fig
+        return ax.get_figure()
+
+    @staticmethod
+    def network_table_search(table, ax=None):
+        """Generates a plot illustrating the steps of the network table search.
+
+        Parameters
+        ----------
+        table : camelot.core.Table
+        ax : matplotlib.axes.Axes (optional)
+
+        Returns
+        -------
+        fig : matplotlib.fig.Figure
+
+        """
+        ax = prepare_plot(table, ax)
+        if table.parse_details is None:
+            return ax.get_figure()
+        parse_details = table.parse_details
+        for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
+            max_h_gap = bbox_search["max_h_gap"]
+            max_v_gap = bbox_search["max_v_gap"]
+            iterations = bbox_search["iterations"]
+            for iteration, bbox in enumerate(iterations):
+                final = iteration == len(iterations) - 1
+
+                draw_labeled_bbox(
+                    ax, bbox,
+                    "t{box_id}/i{iteration}".format(
+                        box_id=box_id,
+                        iteration=iteration
+                    ),
+                    color="red",
+                    linewidth=5 if final else 2,
+                    fontsize=12 if final else 8,
+                    label_pos="bottom,left"
+                )
+
+                ax.add_patch(
+                    patches.Rectangle(
+                        (bbox[0] - max_h_gap, bbox[1] - max_v_gap),
+                        bbox[2] - bbox[0] + 2 * max_h_gap,
+                        bbox[3] - bbox[1] + 2 * max_v_gap,
+                        color="orange",
+                        fill=False
+                    )
+                )
+
+        for box_id, col_search in enumerate(parse_details["col_searches"]):
+            draw_labeled_bbox(
+                ax, col_search["bbox_full"],
+                "box body + header #{box_id}".format(
+                    box_id=box_id
+                ),
+                color="red",
+                linewidth=4,
+                label_pos="top,left"
+            )
+            draw_labeled_bbox(
+                ax, col_search["bbox_body"],
+                "box body #{box_id}".format(
+                    box_id=box_id
+                ),
+                color="orange",
+                linewidth=2,
+                label_pos="bottom,left"
+            )
+            for col_anchor in col_search["cols_anchors"]:
+                # Display a green line at the col boundary line throughout the
+                # table bbox.
+                ax.plot(
+                    [col_anchor, col_anchor],
+                    [
+                        col_search["bbox_body"][1] - 10,
+                        col_search["bbox_body"][3] + 10,
+                    ],
+                    color="green"
+                )
+
+        return ax.get_figure()
617
camelot/utils.py

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import os
+import atexit
 import re
 import random
 import shutil
@@ -9,8 +10,10 @@ import tempfile
 import warnings
 from itertools import groupby
 from operator import itemgetter
+from urllib.request import Request
 
 import numpy as np
+import pandas as pd
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -27,7 +30,9 @@ from pdfminer.layout import (
     LTImage,
 )
 
-from urllib.request import Request, urlopen
+from .ext.ghostscript import Ghostscript
 
+from urllib.request import urlopen
 from urllib.parse import urlparse as parse_url
 from urllib.parse import uses_relative, uses_netloc, uses_params
@@ -93,8 +98,21 @@ def download_url(url):
     return filepath
 
 
-stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
-lattice_kwargs = [
+common_kwargs = [
+    "flag_size",
+    "margins",
+    "split_text",
+    "strip_text",
+    "table_areas",
+    "table_regions"
+]
+text_kwargs = common_kwargs + [
+    "columns",
+    "edge_tol",
+    "row_tol",
+    "column_tol"
+]
+lattice_kwargs = common_kwargs + [
     "process_background",
     "line_scale",
     "copy_text",
@@ -106,42 +124,72 @@ lattice_kwargs = [
     "iterations",
     "resolution",
 ]
+flavor_to_kwargs = {
+    "stream": text_kwargs,
+    "network": text_kwargs,
+    "lattice": lattice_kwargs,
+    "hybrid": text_kwargs + lattice_kwargs,
+}
 
 
 def validate_input(kwargs, flavor="lattice"):
-    def check_intersection(parser_kwargs, input_kwargs):
-        isec = set(parser_kwargs).intersection(set(input_kwargs.keys()))
-        if isec:
-            raise ValueError(
-                f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
-            )
-
-    if flavor == "lattice":
-        check_intersection(stream_kwargs, kwargs)
-    else:
-        check_intersection(lattice_kwargs, kwargs)
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # s.difference(t): new set with elements in s but not in t
+    isec = set(kwargs.keys()).difference(set(parser_kwargs))
+    if isec:
+        raise ValueError(
+            f"{','.join(sorted(isec))} cannot be used with flavor='{flavor}'"
+        )
 
 
 def remove_extra(kwargs, flavor="lattice"):
-    if flavor == "lattice":
-        for key in kwargs.keys():
-            if key in stream_kwargs:
-                kwargs.pop(key)
-    else:
-        for key in kwargs.keys():
-            if key in lattice_kwargs:
-                kwargs.pop(key)
+    parser_kwargs = flavor_to_kwargs[flavor]
+    # Avoid "dictionary changed size during iteration"
+    kwargs_keys = list(kwargs.keys())
+    for key in kwargs_keys:
+        if key not in parser_kwargs:
+            kwargs.pop(key)
     return kwargs
 
 
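The table-driven rewrite makes the two functions symmetric: validate_input rejects any kwarg outside the flavor's allow-list, while remove_extra silently drops it. A toy run, using the functions as defined above (values made up):

kwargs = {"row_tol": 5, "line_scale": 40}

try:
    validate_input(kwargs, flavor="lattice")
except ValueError as err:
    print(err)  # "row_tol cannot be used with flavor='lattice'"

print(remove_extra(dict(kwargs), flavor="lattice"))  # {'line_scale': 40}
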
 # https://stackoverflow.com/a/22726782
-class TemporaryDirectory(object):
+# and https://stackoverflow.com/questions/10965479
+class TemporaryDirectory():
+    def __init__(self):
+        self.dir_path = None
+
     def __enter__(self):
-        self.name = tempfile.mkdtemp()
-        return self.name
+        self.dir_path = tempfile.mkdtemp()
+        # Only delete the temporary directory upon
+        # program exit.
+        atexit.register(shutil.rmtree, self.dir_path)
+        return self.dir_path
 
     def __exit__(self, exc_type, exc_value, traceback):
-        shutil.rmtree(self.name)
+        pass
+
+
+def build_file_path_in_temp_dir(filename, extension=None):
+    """Generates a new path within a temporary directory
+
+    Parameters
+    ----------
+    filename : str
+    extension : str
+
+    Returns
+    -------
+    file_path_in_temporary_dir : str
+
+    """
+    with TemporaryDirectory() as temp_dir:
+        if extension:
+            filename = filename + extension
+        path = os.path.join(
+            temp_dir,
+            filename
+        )
+        return path
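Because the rewritten TemporaryDirectory defers deletion to interpreter exit via atexit, a path handed out by build_file_path_in_temp_dir stays usable after the with-block closes. A sketch (the printed path is illustrative):

path = build_file_path_in_temp_dir("page-1", extension=".png")
print(path)  # e.g. /tmp/tmpq1w2e3/page-1.png - still writable here, removed at exit
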
 
 
 def translate(x1, x2):
@@ -247,8 +295,9 @@ def scale_image(tables, v_segments, h_segments, factors):
         j_x, j_y = zip(*tables[k])
         j_x = [scale(j, scaling_factor_x) for j in j_x]
         j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
-        joints = zip(j_x, j_y)
-        tables_new[(x1, y1, x2, y2)] = joints
+        tables_new[(x1, y1, x2, y2)] = {
+            "joints": list(zip(j_x, j_y))
+        }
 
     v_segments_new = []
     for v in v_segments:
@@ -296,9 +345,10 @@ def get_rotation(chars, horizontal_text, vertical_text):
     hlen = len([t for t in horizontal_text if t.get_text().strip()])
     vlen = len([t for t in vertical_text if t.get_text().strip()])
     if hlen < vlen:
-        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
-        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
-        rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
+        clockwise = sum(t.matrix[1] < 0 < t.matrix[2] for t in chars)
+        anticlockwise = sum(t.matrix[1] > 0 > t.matrix[2] for t in chars)
+        rotation = "anticlockwise" if clockwise < anticlockwise \
+            else "clockwise"
     return rotation
 
@@ -329,18 +379,98 @@ def segments_in_bbox(bbox, v_segments, h_segments):
     v_s = [
         v
         for v in v_segments
-        if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
+        if v[1] > lb[1] - 2 and
+        v[3] < rt[1] + 2 and
+        lb[0] - 2 <= v[0] <= rt[0] + 2
     ]
     h_s = [
         h
         for h in h_segments
-        if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
+        if h[0] > lb[0] - 2 and
+        h[2] < rt[0] + 2 and
+        lb[1] - 2 <= h[1] <= rt[1] + 2
     ]
     return v_s, h_s
+def get_textline_coords(textline):
+    """Calculate the coordinates of each alignment for a given textline."""
+    return {
+        "left": textline.x0,
+        "right": textline.x1,
+        "middle": (textline.x0 + textline.x1) / 2.0,
+        "bottom": textline.y0,
+        "top": textline.y1,
+        "center": (textline.y0 + textline.y1) / 2.0,
+    }
+
+
+def bbox_from_str(bbox_str):
+    """Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2).
+
+    Parameters
+    ----------
+    bbox_str : str
+        Serialized bbox with comma separated coordinates, "x1,y1,x2,y2".
+
+    Returns
+    -------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2).
+
+    """
+    x1, y1, x2, y2 = bbox_str.split(",")
+    x1 = float(x1)
+    y1 = float(y1)
+    x2 = float(x2)
+    y2 = float(y2)
+    return (
+        min(x1, x2),
+        min(y1, y2),
+        max(x1, x2),
+        max(y1, y2)
+    )
+
+
+def bboxes_overlap(bbox1, bbox2):
+    (left1, bottom1, right1, top1) = bbox1
+    (left2, bottom2, right2, top2) = bbox2
+    return (
+        (left1 < left2 < right1) or (left1 < right2 < right1)
+    ) and (
+        (bottom1 < bottom2 < top1) or (bottom1 < top2 < top1)
+    )
+
+
+def textlines_overlapping_bbox(bbox, textlines):
+    """Returns all text objects which overlap or are within a bounding box.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : list
+        List of PDFMiner text objects.
+
+    """
+    t_bbox = [
+        t
+        for t in textlines
+        if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
+    ]
+    return t_bbox
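Note that `bbox_from_str` normalizes the coordinate order via min/max, so a string whose corners arrive swapped still yields a well-formed lb/rt box, and `bboxes_overlap` uses strict inequalities, so boxes that merely touch do not count as overlapping. A quick illustration:

bbox_from_str("10,500,300,400")
# -> (10.0, 400.0, 300.0, 500.0): lb first, rt second, whatever the input order

bboxes_overlap((0, 0, 10, 10), (10, 0, 20, 10))   # False: edges only touch
bboxes_overlap((0, 0, 10, 10), (5, 5, 20, 20))    # True: corners interleave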
 def text_in_bbox(bbox, text):
-    """Returns all text objects present inside a bounding box.
+    """Returns all text objects which lie at least 50% inside a bounding box
+    across both dimensions.

     Parameters
     ----------

@@ -367,6 +497,214 @@ def text_in_bbox(bbox, text):
     return t_bbox


+def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
+    """Returns all text objects present inside a bounding box, split between
+    horizontal and vertical text.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    horizontal_text : List of PDFMiner text objects.
+    vertical_text : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : dict
+        Dict of lists of PDFMiner text objects that lie inside table, with one
+        key each for "horizontal" and "vertical"
+
+    """
+    t_bbox = {}
+    t_bbox["horizontal"] = text_in_bbox(bbox, horizontal_text)
+    t_bbox["vertical"] = text_in_bbox(bbox, vertical_text)
+
+    t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
+    t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
+    return t_bbox
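The sort keys put each orientation into natural reading order: horizontal lines top-to-bottom then left-to-right (PDF y grows upward, hence the negated `y0`), vertical lines left-to-right then top-to-bottom. A small self-contained check of the horizontal key, with a namedtuple standing in for a PDFMiner textline:

from collections import namedtuple

TL = namedtuple("TL", ["x0", "y0"])  # stand-in for a PDFMiner textline
lines = [TL(200, 700), TL(10, 650), TL(10, 700)]
lines.sort(key=lambda x: (-x.y0, x.x0))
# -> [TL(10, 700), TL(200, 700), TL(10, 650)]: top row first, left to right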
+def expand_bbox_with_textline(bbox, textline):
+    """Expand (if needed) a bbox so that it fits the parameter textline."""
+    return (
+        min(bbox[0], textline.x0),
+        min(bbox[1], textline.y0),
+        max(bbox[2], textline.x1),
+        max(bbox[3], textline.y1)
+    )
+
+
+def bbox_from_textlines(textlines):
+    """Returns the smallest bbox containing all the text objects passed as
+    a parameter.
+
+    Parameters
+    ----------
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+
+    """
+    if len(textlines) == 0:
+        return None
+    bbox = (
+        textlines[0].x0,
+        textlines[0].y0,
+        textlines[0].x1,
+        textlines[0].y1
+    )
+
+    for tl in textlines[1:]:
+        bbox = expand_bbox_with_textline(bbox, tl)
+    return bbox
+
+
+def find_columns_boundaries(tls, min_gap=1.0):
+    """Make a list of disjunct cols boundaries for a list of text objects
+
+    Parameters
+    ----------
+    tls : list of PDFMiner text objects.
+    min_gap : minimum distance between columns. Any elements closer than
+        this threshold are merged together. This is to prevent spaces
+        between words from being misinterpreted as column boundaries.
+
+    Returns
+    -------
+    boundaries : list
+        List of x-coordinates for cols.
+        [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
+
+    """
+    cols_bounds = []
+    tls.sort(key=lambda tl: tl.x0)
+    for tl in tls:
+        if (not cols_bounds) or cols_bounds[-1][1] + min_gap < tl.x0:
+            cols_bounds.append([tl.x0, tl.x1])
+        else:
+            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
+    return cols_bounds
+
+
+def find_rows_boundaries(tls, min_gap=1.0):
+    """Make a list of disjunct rows boundaries for a list of text objects
+
+    Parameters
+    ----------
+    tls : list of PDFMiner text objects.
+    min_gap : minimum distance between rows. Any elements closer than
+        this threshold are merged together.
+
+    Returns
+    -------
+    boundaries : list
+        List of y-coordinates for rows.
+        [(1st row bottom, 1st row top), (2nd row bottom, 2nd row top), ...]
+
+    """
+    rows_bounds = []
+    tls.sort(key=lambda tl: tl.y0)
+    for tl in tls:
+        if (not rows_bounds) or rows_bounds[-1][1] + min_gap < tl.y0:
+            rows_bounds.append([tl.y0, tl.y1])
+        else:
+            rows_bounds[-1][1] = max(rows_bounds[-1][1], tl.y1)
+    return rows_bounds
+
+
+def boundaries_to_split_lines(boundaries):
+    """Find split lines given a list of boundaries between rows or cols.
+
+    Boundaries: [ a ]  [b]  [  c  ]  [d]
+    Splits:     |    |    |        |    |
+
+    Parameters
+    ----------
+    boundaries : list
+        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
+        These are the (left, right most) or (bottom, top most) coordinates.
+
+    Returns
+    -------
+    anchors : list
+        List of coordinates representing the split points, each half way
+        between boundaries
+
+    """
+    # From the row boundaries, identify splits by getting the mid points
+    # between the boundaries.
+    anchors = list(map(
+        lambda idx: (boundaries[idx-1][1] + boundaries[idx][0]) / 2.0,
+        range(1, len(boundaries))
+    ))
+    anchors.insert(0, boundaries[0][0])
+    anchors.append(boundaries[-1][1])
+    return anchors
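A worked example of the mid-point computation, using the boundary notation from the docstring above:

boundaries = [(10, 40), (50, 90), (120, 200)]
# interior splits: (40 + 50) / 2 = 45.0 and (90 + 120) / 2 = 105.0
boundaries_to_split_lines(boundaries)
# -> [10, 45.0, 105.0, 200]: the outer edges plus one split per gap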
+def get_index_closest_point(point, sorted_list, fn=lambda x: x):
+    """Return the index of the closest point in the sorted list.
+
+    Parameters
+    ----------
+    point : the reference sortable element to search.
+    sorted_list : list
+    fn : optional accessor function
+
+    Returns
+    -------
+    index : int
+
+    """
+    n = len(sorted_list)
+    if n == 0:
+        return None
+    if n == 1:
+        return 0
+
+    left = 0
+    right = n - 1
+    mid = 0
+
+    if point >= fn(sorted_list[n - 1]):
+        return n - 1
+    if point <= fn(sorted_list[0]):
+        return 0
+
+    while left < right:
+        mid = (left + right) // 2  # find the mid
+        mid_val = fn(sorted_list[mid])
+        if point < mid_val:
+            right = mid
+        elif point > mid_val:
+            left = mid + 1
+        else:
+            return mid
+
+    if mid_val > point:
+        if mid > 0 and (
+                point - fn(sorted_list[mid-1]) <
+                mid_val - point):
+            return mid-1
+    elif mid_val < point:
+        if mid < n - 1 and (
+                fn(sorted_list[mid+1]) - point <
+                point - mid_val):
+            return mid+1
+    return mid
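Usage is a plain binary search over an ascending list, with the accessor allowing structured elements; for example, picking the nearest plain value or the row tuple whose first coordinate is nearest:

get_index_closest_point(42, [10, 20, 40, 80])
# -> 2: 40 is nearer to 42 than 80 is

rows = [(600, 620), (640, 660), (680, 700)]
get_index_closest_point(655, rows, fn=lambda row: row[0])
# -> 1: compares 655 against each row's first coordinate (600, 640, 680)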
 def merge_close_lines(ar, line_tol=2):
     """Merges lines which are within a tolerance by calculating a
     moving mean, based on their x or y axis projections.

@@ -452,10 +790,10 @@ def flag_font_size(textline, direction, strip_text=""):
         for t in textline
         if not isinstance(t, LTAnno)
     ]
-    l = [np.round(size, decimals=6) for text, size in d]
-    if len(set(l)) > 1:
+    text_sizes = [np.round(size, decimals=6) for text, size in d]
+    if len(set(text_sizes)) > 1:
         flist = []
-        min_size = min(l)
+        min_size = min(text_sizes)
         for key, chars in groupby(d, itemgetter(1)):
             if key == min_size:
                 fchars = [t[0] for t in chars]

@@ -469,12 +807,12 @@ def flag_font_size(textline, direction, strip_text=""):
             flist.append("".join(fchars))
         fstring = "".join(flist)
     else:
-        fstring = "".join([t.get_text() for t in textline])
+        fstring = "".join(t.get_text() for t in textline)
     return text_strip(fstring, strip_text)


 def split_textline(table, textline, direction, flag_size=False, strip_text=""):
-    """Splits PDFMiner LTTextLine into substrings if it spans across
+    """Split PDFMiner LTTextLine into substrings if it spans across
     multiple rows/columns.

     Parameters
@@ -499,7 +837,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
         of row/column and text is the an lttextline substring.

     """
-    idx = 0
     cut_text = []
     bbox = textline.bbox
     try:

@@ -516,7 +853,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             ]
             r = r_idx[0]
             x_cuts = [
-                (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
+                (c, table.cells[r][c].x2)
+                for c in x_overlap
+                if table.cells[r][c].right
             ]
             if not x_cuts:
                 x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]

@@ -530,10 +869,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
                     ):
                         cut_text.append((r, cut[0], obj))
                         break
-                    else:
-                        # TODO: add test
-                        if cut == x_cuts[-1]:
-                            cut_text.append((r, cut[0] + 1, obj))
+                    # TODO: add test
+                    if cut == x_cuts[-1]:
+                        cut_text.append((r, cut[0] + 1, obj))
                 elif isinstance(obj, LTAnno):
                     cut_text.append((r, cut[0], obj))
         elif direction == "vertical" and not textline.is_empty():

@@ -549,7 +887,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             ]
             c = c_idx[0]
             y_cuts = [
-                (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
+                (r, table.cells[r][c].y1)
+                for r in y_overlap
+                if table.cells[r][c].bottom
             ]
             if not y_cuts:
                 y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]

@@ -557,16 +897,13 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
             col = table.cols[c]
             for cut in y_cuts:
                 if isinstance(obj, LTChar):
-                    if (
-                        col[0] <= (obj.x0 + obj.x1) / 2 <= col[1]
-                        and (obj.y0 + obj.y1) / 2 >= cut[1]
-                    ):
+                    if col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] \
+                            and (obj.y0 + obj.y1) / 2 >= cut[1]:
                         cut_text.append((cut[0], c, obj))
                         break
-                    else:
-                        # TODO: add test
-                        if cut == y_cuts[-1]:
-                            cut_text.append((cut[0] - 1, c, obj))
+                    # TODO: add test
+                    if cut == y_cuts[-1]:
+                        cut_text.append((cut[0] - 1, c, obj))
                 elif isinstance(obj, LTAnno):
                     cut_text.append((cut[0], c, obj))
     except IndexError:
@@ -632,9 +969,8 @@ def get_table_index(
     """
     r_idx, c_idx = [-1] * 2
     for r in range(len(table.rows)):
-        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
-            r
-        ][1]:
+        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \
+                (t.y0 + t.y1) / 2.0 > table.rows[r][1]:
             lt_col_overlap = []
             for c in table.cols:
                 if c[0] <= t.x1 and c[1] >= t.x0:

@@ -648,7 +984,8 @@ def get_table_index(
                 text_range = (t.x0, t.x1)
                 col_range = (table.cols[0][0], table.cols[-1][1])
                 warnings.warn(
-                    f"{text} {text_range} does not lie in column range {col_range}"
+                    f"{text} {text_range} does not lie in column range "
+                    f"{col_range}"
                 )
             r_idx = r
             c_idx = lt_col_overlap.index(max(lt_col_overlap))

@@ -667,7 +1004,9 @@ def get_table_index(
     X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
     Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
     charea = X * Y
-    error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
+    error = (
+        (X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))
+    ) / charea

     if split_text:
         return (

@@ -676,20 +1015,21 @@ def get_table_index(
             ),
             error,
         )
-    else:
-        if flag_size:
-            return (
-                [
-                    (
-                        r_idx,
-                        c_idx,
-                        flag_font_size(t._objs, direction, strip_text=strip_text),
-                    )
-                ],
-                error,
-            )
-        else:
-            return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
+    if flag_size:
+        return (
+            [
+                (
+                    r_idx,
+                    c_idx,
+                    flag_font_size(t._objs,
+                                   direction,
+                                   strip_text=strip_text),
+                )
+            ],
+            error,
+        )
+    return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \
+        error


 def compute_accuracy(error_weights):
@@ -711,7 +1051,7 @@ def compute_accuracy(error_weights):
     SCORE_VAL = 100
     try:
         score = 0
-        if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
+        if sum(ew[0] for ew in error_weights) != SCORE_VAL:
             raise ValueError("Sum of weights should be equal to 100.")
         for ew in error_weights:
             weight = ew[0] / len(ew[1])

@@ -737,7 +1077,6 @@ def compute_whitespace(d):
     """
     whitespace = 0
-    r_nempty_cells, c_nempty_cells = [], []
     for i in d:
         for j in i:
             if j.strip() == "":
@@ -747,13 +1086,12 @@
 def get_page_layout(
     filename,
     char_margin=1.0,
     line_margin=0.5,
     word_margin=0.1,
     detect_vertical=True,
-    all_texts=True,
-):
+    all_texts=True):
     """Returns a PDFMiner LTPage object and page dimension of a single
     page pdf. See https://euske.github.io/pdfminer/ to get definitions
     of kwargs.

@@ -797,6 +1135,7 @@ def get_page_layout(
         width = layout.bbox[2]
         height = layout.bbox[3]
         dim = (width, height)
+        break  # we assume a single page pdf
     return layout, dim


@@ -838,3 +1177,117 @@ def get_text_objects(layout, ltype="char", t=None):
     except AttributeError:
         pass
     return t
+def export_pdf_as_png(pdf_path, destination_path, resolution=300):
+    """Generate an image from a pdf.
+
+    Parameters
+    ----------
+    pdf_path : str
+    destination_path : str
+    resolution : int
+    """
+    gs_call = "-q -sDEVICE=png16m -o " \
+              "{destination_path} -r{resolution} {pdf_path}" \
+        .format(
+            destination_path=destination_path,
+            resolution=resolution,
+            pdf_path=pdf_path
+        )
+    gs_call = gs_call.encode().split()
+    null = open(os.devnull, "wb")
+    Ghostscript(*gs_call, stdout=null)
+    null.close()
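A minimal usage sketch; the comment shows the Ghostscript argument list the helper assembles (the file names are placeholders):

export_pdf_as_png("page.pdf", "page.png")
# equivalent to running: gs -q -sDEVICE=png16m -o page.png -r300 page.pdf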
+def compare_tables(left, right):
+    """Compare two tables and display differences in a human readable form.
+
+    Parameters
+    ----------
+    left : data frame
+    right : data frame
+    """
+    diff_cols = right.shape[1]-left.shape[1]
+    diff_rows = right.shape[0]-left.shape[0]
+    differences = []
+    if diff_rows:
+        differences.append(
+            "{diff_rows} {more_fewer} rows".format(
+                diff_rows=abs(diff_rows),
+                more_fewer='more' if diff_rows > 0 else 'fewer'
+            )
+        )
+    if diff_cols:
+        differences.append(
+            "{diff_cols} {more_fewer} columns".format(
+                diff_cols=abs(diff_cols),
+                more_fewer='more' if diff_cols > 0 else 'fewer'
+            )
+        )
+    if differences:
+        differences_str = " and ".join(differences)
+        print(
+            "Right has {differences_str} than left "
+            "{shape_left} vs {shape_right}".format(
+                differences_str=differences_str,
+                shape_left=[left.shape[0], left.shape[1]],
+                shape_right=[right.shape[0], right.shape[1]],
+            )
+        )
+
+    table1, table2 = [left, right]
+    name_table1, name_table2 = ["left", "right"]
+    if not diff_cols:
+        # Same number of cols: compare rows since they're of the same length
+        if diff_rows > 0:
+            # Use the longest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for index, lrow in table1.iterrows():
+            if index < table2.shape[0]:
+                srow = table2.loc[index, :]
+                if not lrow.equals(srow):
+                    diff_df = pd.DataFrame()
+                    diff_df = diff_df.append(lrow, ignore_index=True)
+                    diff_df = diff_df.append(srow, ignore_index=True)
+                    diff_df.insert(0, 'Table', [name_table1, name_table2])
+                    print("Row {index} differs:".format(index=index))
+                    print(diff_df.values)
+                    break
+            else:
+                print("Row {index} unique to {name_table1}: {lrow}".format(
+                    index=index,
+                    name_table1=name_table1,
+                    lrow=lrow
+                ))
+                break
+    elif not diff_rows:
+        # Same number of rows: compare columns since they're the same length
+        if diff_cols > 0:
+            # Use the longest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for i, col in enumerate(table1.columns):
+            lcol = table1.iloc[:, i]
+            if col in table2:
+                scol = table2.iloc[:, i]
+                if not lcol.equals(scol):
+                    diff_df = pd.DataFrame()
+                    diff_df[name_table1] = scol
+                    diff_df[name_table2] = lcol
+                    diff_df["Match"] = lcol == scol
+                    print(
+                        "Column {i} different:\n"
+                        "{diff_df}".format(
+                            i=i,
+                            diff_df=diff_df
+                        )
+                    )
+                    break
+            else:
+                print("Column {i} unique to {name_table1}: {lcol}".format(
+                    i=i,
+                    name_table1=name_table1,
+                    lcol=lcol
+                ))
+                break
+    else:
+        print("Tables have different shapes")
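A usage sketch for a debugging session (the frames here are placeholders; `compare_tables` is importable from `camelot.utils` after this change, as the test module below does):

import pandas as pd
from camelot.utils import compare_tables

left = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
right = pd.DataFrame({"a": [1, 2], "b": [3, 5]})
compare_tables(left, right)
# prints the first differing row, with a 'Table' column naming the source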
@@ -13,7 +13,7 @@

     $ conda install -c conda-forge camelot-py

-.. note:: Camelot is available for Python 2.7, 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript which you can get from their `downloads page`_.
+.. note:: Camelot is available for Python 3.5, 3.6 and 3.7 on Linux, macOS and Windows. For Windows, you will need to install ghostscript, which you can get from their `downloads page`_.

 .. _conda: https://conda.io/docs/
 .. _Anaconda: http://docs.continuum.io/anaconda/
@@ -0,0 +1,351 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Hybrid Parser step-by-step\n",
+    "\n",
+    "This notebook describes the algorithms behind the hybrid parser, which blends the results of the network parser (text based) and the lattice parser (image based).\n",
+    "\n",
+    "You can modify the section below to point to a pdf of your choice to visualize how the algorithm analyzes it. By default, it points to one of the test .pdfs included with camelot.\n",
+    "\n",
+    "You can also use the `parser-comparison-notebook` notebook to compare the parsers' results side-by-side."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Bootstrap and common imports\n",
+    "import os, sys, time\n",
+    "sys.path.insert(0, os.path.abspath(''))  # Prefer the local version of camelot if available\n",
+    "import camelot\n",
+    "\n",
+    "print(f\"Using Camelot v{camelot.__version__} from file {camelot.__file__}.\")\n",
+    "\n",
+    "# Select a pdf to analyze.\n",
+    "kwargs = {}\n",
+    "data = None\n",
+    "# pdf_file = \"vertical_header.pdf\"  # test_network_vertical_header\n",
+    "# pdf_file, kwargs = \"background_lines_1.pdf\", {}  # {\"process_background\": True}  # test_lattice_process_background\n",
+    "\n",
+    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size  # test_network_flag_size\n",
+    "# pdf_file = \"health.pdf\"  # test_network\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"\n",
+    "# pdf_file = \"tabula/12s0324.pdf\"  # interesting because it contains two separate tables\n",
+    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]}  # test_network_table_regions\n",
+    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_areas\": [\"320,500,573,335\"]}  # test_network_table_areas\n",
+    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"}  # data_stream_strip_text\n",
+    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text  # data_stream_split_text\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"  # test_network_table_rotated / test_stream_table_rotated\n",
+    "pdf_file = \"vertical_header.pdf\"\n",
+    "\n",
+    "# pdf_file = \"twotables_2.pdf\"\n",
+    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
+    "\n",
+    "filename = os.path.join(\n",
+    "    os.path.dirname(os.path.abspath('.')),\n",
+    "    \"camelot/tests/files\",\n",
+    "    pdf_file\n",
+    ")\n",
+    "\n",
+    "# Set up plotting options\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "PLOT_HEIGHT = 12\n",
+    "def init_figure_and_axis(title):\n",
+    "    fig = plt.figure(figsize=(PLOT_HEIGHT * 2.5, PLOT_HEIGHT))\n",
+    "    ax = fig.add_subplot(111)\n",
+    "    ax.set_title(title)\n",
+    "    return fig, ax\n",
+    "\n",
+    "# Utility function to display tables\n",
+    "def display_parse_results(tables, parse_time, flavor):\n",
+    "    if not tables:\n",
+    "        return\n",
+    "    tables_dims = \", \".join(\n",
+    "        map(\n",
+    "            lambda table: \"{rows}x{cols}\".format(\n",
+    "                rows=table.shape[0],\n",
+    "                cols=table.shape[1],\n",
+    "            ), tables\n",
+    "        )\n",
+    "    )\n",
+    "    print(f\"The {flavor} parser found {len(tables)} table(s) ({tables_dims}) in {parse_time:.2f}s\")\n",
+    "    for table in tables:\n",
+    "        display(table.df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overall Algorithm\n",
+    "\n",
+    "The hybrid parser combines results from the network parser and the lattice parser to get the \"best of both worlds.\" Before we look at the combination itself, let's see how each of the two parsers works.\n",
+    "\n",
+    "### Network parser\n",
+    "\n",
+    "The network parser is text-based: it relies on the bounding boxes of the text elements encoded in the .pdf document to identify patterns indicative of a table.\n",
+    "\n",
+    "The plot below shows the bounding boxes of all the text elements on the parsed document, in light blue for horizontal elements and light red for vertical elements (rare in most documents)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse file\n",
+    "flavor = \"network\"\n",
+    "timer_before_parse = time.perf_counter()\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "timer_after_parse = time.perf_counter()\n",
+    "\n",
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Text elements in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"text\", ax=ax)\n",
+    "else:\n",
+    "    print(\"No table found for this document.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Network parser - step 1: Identify a network of connected alignments\n",
+    "\n",
+    "The network parser starts by identifying common horizontal (shown in green on the plot below) or vertical (in blue) coordinate alignments across these text elements. In other words, it looks for bounding box rectangles which either share the same top, center, or bottom coordinates (horizontal axis), or the same left, right, or middle coordinates (vertical axis). See the `generate` method.\n",
+    "\n",
+    "Once the parser has found these alignments, it performs some pruning to keep only text elements that are part of a network, with connections along both axes. The idea is that it's not enough for two elements to be aligned to belong to a table; for instance, the lines of text in this paragraph are all left-aligned, but they do not form a network. The pruning is done iteratively, see the `remove_unconnected_edges` method.\n",
+    "\n",
+    "Once the network is pruned, the parser keeps track of how many alignments each text element belongs to: that's the number on top (vertical alignments) or to the left of each alignment in the plot below. The text element with the most connections (in red on the plot) is the starting point, the *seed*, of the next step. Finally, the parser measures how far the alignments are from one another, to determine a plausible search zone around each cell for the next stage of growing the table. See the `compute_plausible_gaps` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Text edges in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"textedge\", ax=ax)\n",
+    "else:\n",
+    "    print(f\"No table found for document {pdf_file}.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Network parser - step 2: Detect table body iteratively from seed\n",
+    "\n",
+    "In the next step, the parser iteratively \"grows\" a table, starting from the seed identified in the previous step. The bounding box is initialized with the bounding box of the seed, then the parser iteratively searches for text elements that are close to the bounding box and grows the table to ingest them, until there are no more text elements to ingest. The two steps are:\n",
+    "* Search: create a search bounding box by expanding the current table bounding box in all directions, based on the plausible gap numbers determined above. Search bounding boxes are shown in orange on the graph below.\n",
+    "* Grow: if a networked text element is found in this search area, expand the table bounding box so that it includes this new element. Each successive table bounding box is shown in red in the plot below.\n",
+    "\n",
+    "Notice in the plot below how the search area and the table bounding box grow starting from the seed. See the method `search_table_body`.\n",
+    "\n",
+    "#### Network parser - step 3: Search for a header section\n",
+    "\n",
+    "Headers are often aligned differently from the rest of the table. To account for this, the network parser searches for text elements that are good candidates for a header section: these text elements are just above the bounding box of the body of the table, and they fit within the rows identified in the table body. See the method `search_header_from_body_bbox`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Growth steps for table in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"network_table_search\", ax=ax)\n",
+    "else:\n",
+    "    print(\"No table found for this document.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Network parser - step 4: Repeat\n",
+    "\n",
+    "There are sometimes multiple tables on one page. So once a first table is identified, all the text edges it contains are removed, and the algorithm is repeated until no new network is identified.\n",
+    "\n",
+    "The final parse for this .pdf is as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Lattice parser\n",
+    "\n",
+    "The lattice parser is based on an analysis of the image rendered from the .pdf, rather than its text content. It relies on the tables' borders being drawn as solid lines.\n",
+    "\n",
+    "#### Lattice parser - step 1: Identify solid lines within the document.\n",
+    "\n",
+    "The lattice parser relies on the OpenCV library (`getStructuringElement` function) to detect all solid vertical and horizontal lines within the document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse file\n",
+    "flavor = \"lattice\"\n",
+    "timer_before_parse = time.perf_counter()\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "timer_after_parse = time.perf_counter()\n",
+    "\n",
+    "if tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Line structure in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(tables[0], kind=\"line\", ax=ax)\n",
+    "else:\n",
+    "    print(\"No table found for this document.\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Lattice parser - step 2: Find the contours of the table(s) based on the solid lines.\n",
+    "\n",
+    "The lattice parser then uses OpenCV's `findContours` function to detect the overall bounding box of the table(s), since the solid lines might draw more than one table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for table in tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Contour structure in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(table, kind=\"contour\", ax=ax)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Lattice parser - step 3: Identify joints\n",
+    "\n",
+    "For each table bounding box (contour), the lattice parser then makes a list of all the intersections between vertical and horizontal lines: the joints."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for table in tables:\n",
+    "    fig, ax = init_figure_and_axis(f\"Joint structure in PDF\\n{pdf_file}\")\n",
+    "    camelot.plot(table, kind=\"joint\", ax=ax)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Lattice parser - step 4: Identify rows and columns\n",
+    "\n",
+    "In the final step, the algorithm sorts all the x coordinates of the joints to identify the position of the table's columns, and the y coordinates for the table's rows. See the method `_generate_columns_and_rows`.\n",
+    "\n",
+    "The resulting lattice parse for the .pdf is as follows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Combining results of Network and Lattice with the Hybrid parser\n",
+    "\n",
+    "The hybrid parser aims to combine the strengths of the Network parser (identifying cells based on text alignments) and of the Lattice parser (relying on solid lines to determine table row and column boundaries).\n",
+    "\n",
+    "#### Hybrid parser - step 1: Apply both parsers' table bounding box detection techniques to the document\n",
+    "\n",
+    "In this step, hybrid calls both parsers, to get a) the standard table parse, b) the coordinates of the row and column boundaries, and c) the table boundaries (or contour).\n",
+    "\n",
+    "#### Hybrid parser - step 2: Merge the results\n",
+    "\n",
+    "If there are areas in the document where both lattice and network found a table, the hybrid parser uses the results from network, but enhances them based on the row/column boundaries identified by lattice in the area. Because lattice uses the solid lines detected on the document, the coordinates for b) and c) detected by lattice are generally more precise. See the `_merge_bbox_analysis` method.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "flavor = \"hybrid\"\n",
+    "timer_before_parse = time.perf_counter()\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "timer_after_parse = time.perf_counter()\n",
+    "\n",
+    "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.7.7-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python37764bit8418972e58f441528b05b4b21a1f095d",
+   "display_name": "Python 3.7.7 64-bit"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
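Steps 1-3 of the network parser described above reduce to a fixed-point search/grow loop. A hedged pseudo-sketch of that loop, not camelot's actual API: the `grow_table_from_seed` name and the `plausible_gaps` tuple are illustrative, while the bbox helpers are the ones introduced earlier in this diff.

from collections import namedtuple
from camelot.utils import bboxes_overlap, expand_bbox_with_textline  # helpers added in this diff

TL = namedtuple("TL", ["x0", "y0", "x1", "y1"])  # stand-in for a PDFMiner textline

def grow_table_from_seed(seed_bbox, textlines, plausible_gaps):
    """Pseudo-sketch of the search/grow iteration described in the notebook."""
    bbox = seed_bbox
    remaining = set(textlines)
    while True:
        # Search: pad the current table bbox by the plausible gaps in x and y.
        search = (bbox[0] - plausible_gaps[0], bbox[1] - plausible_gaps[1],
                  bbox[2] + plausible_gaps[0], bbox[3] + plausible_gaps[1])
        found = [tl for tl in remaining
                 if bboxes_overlap(search, (tl.x0, tl.y0, tl.x1, tl.y1))]
        if not found:
            return bbox  # fixed point: nothing left to ingest
        # Grow: expand the table bbox to ingest every element found.
        for tl in found:
            bbox = expand_bbox_with_textline(bbox, tl)
            remaining.discard(tl)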
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Parser comparison\n",
+    "\n",
+    "This notebook lets you visualize side-by-side how each parser analyzes a document, and compare the resulting tables.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Bootstrap and common imports\n",
+    "import os, sys, time\n",
+    "sys.path.insert(0, os.path.abspath(''))  # Prefer the local version of camelot if available\n",
+    "import camelot\n",
+    "\n",
+    "print(f\"Using Camelot v{camelot.__version__} from file {camelot.__file__}.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Select a PDF file to review\n",
+    "\n",
+    "This is seeded with the unit test files for convenience."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kwargs = {}\n",
+    "data = None\n",
+    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size  # test_hybrid_flag_size\n",
+    "# pdf_file = \"health.pdf\"  # test_hybrid\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"\n",
+    "\n",
+    "# pdf_file = \"tabula/12s0324.pdf\"  # interesting because it contains two separate tables\n",
+    "\n",
+    "# pdf_file = \"clockwise_table_2.pdf\"  # test_hybrid_table_rotated / test_stream_table_rotated\n",
+    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]}  # test_hybrid_table_regions\n",
+    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"}  # data_stream_strip_text\n",
+    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text  # data_stream_split_text\n",
+    "pdf_file = \"vertical_header.pdf\"\n",
+    "\n",
+    "# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"2\"}\n",
+    "\n",
+    "# pdf_file, kwargs = \"PIR_Prospetto.dOfferta.pdf\", {\"pages\": \"6\"}\n",
+    "# pdf_file = \"twotables_2.pdf\"  # Lattice is better\n",
+    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
+    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
+    "# pdf_file, kwargs = \"tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf\", {\"pages\": \"2\"}  # test_lattice\n",
+    "# pdf_file, kwargs = \"background_lines_1.pdf\", {\"process_background\": True}  # test_lattice_process_background\n",
+    "\n",
+    "filename = os.path.join(\n",
+    "    os.path.dirname(os.path.abspath('.')),\n",
+    "    \"camelot/tests/files\",\n",
+    "    pdf_file\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "FLAVORS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
+    "tables_parsed = {}\n",
+    "parses = {}\n",
+    "max_tables = 0\n",
+    "for idx, flavor in enumerate(FLAVORS):\n",
+    "    timer_before_parse = time.perf_counter()\n",
+    "    error, tables = None, []\n",
+    "    try:\n",
+    "        tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "    except ValueError as value_error:\n",
+    "        error = f\"Invalid argument for parser {flavor}: {value_error}\"\n",
+    "        print(error)\n",
+    "    timer_after_parse = time.perf_counter()\n",
+    "    max_tables = max(max_tables, len(tables))\n",
+    "\n",
+    "    parses[flavor] = {\n",
+    "        \"tables\": tables,\n",
+    "        \"time\": timer_after_parse - timer_before_parse,\n",
+    "        \"error\": error\n",
+    "    }\n",
+    "\n",
+    "    print(f\"##### {flavor} ####\")\n",
+    "    print(f\"Found {len(tables)} table(s):\")\n",
+    "    for idx, table in enumerate(tables):\n",
+    "        flavors_matching = []\n",
+    "        for previous_flavor, previous_tables in tables_parsed.items():\n",
+    "            for prev_idx, previous_table in enumerate(previous_tables):\n",
+    "                if previous_table.df.equals(table.df):\n",
+    "                    flavors_matching.append(\n",
+    "                        f\"{previous_flavor} table {prev_idx}\")\n",
+    "        print(f\"## Table {idx} ##\")\n",
+    "        if flavors_matching:\n",
+    "            print(f\"Same as {', '.join(flavors_matching)}.\")\n",
+    "        else:\n",
+    "            display(table.df)\n",
+    "        print(\"\")\n",
+    "    tables_parsed[flavor] = tables\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Show tables layout within original document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Set up plotting options\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "PLOT_HEIGHT = 12\n",
+    "\n",
+    "row_count = max(max_tables, 1)\n",
+    "plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(FLAVORS), PLOT_HEIGHT * row_count]\n",
+    "fig, axes = plt.subplots(row_count, len(FLAVORS))\n",
+    "plt.subplots_adjust(wspace=0, hspace=0)  # Reduce margins to maximize the display zone\n",
+    "\n",
+    "fig.suptitle('Side-by-side flavor comparison', fontsize=24, fontweight='bold')\n",
+    "for idx, flavor in enumerate(FLAVORS):\n",
+    "    parse = parses[flavor]\n",
+    "    tables = parse[\"tables\"]\n",
+    "    top_ax = axes.flat[idx]\n",
+    "    title = f\"{flavor}\\n\" \\\n",
+    "            f\"Detected {len(tables)} table(s) in {parse['time']:.2f}s\"\n",
+    "    if parse['error']:\n",
+    "        title = title + f\"\\nError parsing: {parse['error']}\"\n",
+    "    top_ax.set_title(title, fontsize=12, fontweight='bold')\n",
+    "    for table_idx, table in enumerate(tables):\n",
+    "        if max_tables > 1:\n",
+    "            ax = axes[table_idx][idx]\n",
+    "        else:\n",
+    "            ax = axes[idx]\n",
+    "        fig = camelot.plot(table, kind='grid', ax=ax)\n",
+    "        ax.text(\n",
+    "            0.5, -0.1,\n",
+    "            \"{flavor} table {table_idx} - {rows}x{cols}\".format(\n",
+    "                flavor=flavor,\n",
+    "                table_idx=table_idx,\n",
+    "                rows=table.shape[0],\n",
+    "                cols=table.shape[1],\n",
+    "            ),\n",
+    "            size=14, ha=\"center\",\n",
+    "            transform=ax.transAxes\n",
+    "        )\n",
+    "    timer_after_plot = time.perf_counter()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.7.7-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python37764bit8418972e58f441528b05b4b21a1f095d",
+   "display_name": "Python 3.7.7 64-bit"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -5,6 +5,6 @@ numpy>=1.13.3
 opencv-python>=3.4.2.17
 openpyxl>=2.5.8
 pandas>=0.23.4
-pdfminer.six>=20170720
+pdfminer.six>=20200402
 PyPDF2>=1.26.0
 Sphinx>=1.7.9
@@ -3,4 +3,6 @@ test=pytest

 [tool:pytest]
 addopts = --verbose --cov-config .coveragerc --cov-report term --cov-report xml --cov=camelot --mpl
+# Switch to no-cov if you want to debug a test with breakpoints.
+# addopts = --verbose --mpl
 python_files = tests/test_*.py
16  setup.py

@@ -19,7 +19,7 @@ requires = [
     'numpy>=1.13.3',
     'openpyxl>=2.5.8',
     'pandas>=0.23.4',
-    'pdfminer.six>=20170720',
+    'pdfminer.six>=20200402',
     'PyPDF2>=1.26.0'
 ]

@@ -32,12 +32,12 @@ plot_requires = [
 ]

 dev_requires = [
-    'codecov>=2.0.15',
-    'pytest>=3.8.0',
-    'pytest-cov>=2.6.0',
-    'pytest-mpl>=0.10',
-    'pytest-runner>=4.2',
-    'Sphinx>=1.7.9'
+    'codecov>=2.1.3',
+    'pytest>=4.6',
+    'pytest-cov>=2.10.0',
+    'pytest-mpl>=0.11',
+    'pytest-runner>=5.2',
+    'Sphinx>=3.0.3'
 ]

 all_requires = cv_requires + plot_requires

@@ -69,7 +69,7 @@ def setup_package():
         },
         classifiers=[
             # Trove classifiers
-            # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+            # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers  # noqa
             'License :: OSI Approved :: MIT License',
             'Programming Language :: Python :: 3.6',
             'Programming Language :: Python :: 3.7',
1348  tests/data.py

Binary changes under tests/files: seven existing baseline plot images regenerated (e.g. 8.2 KiB -> 48 KiB) and seven new baseline images added (46-113 KiB), per the image diff summary.
@@ -19,10 +19,16 @@ def test_help_output():
     output = result.output

     assert prog_name == "camelot"
-    assert result.output.startswith("Usage: %(prog_name)s [OPTIONS] COMMAND" % locals())
+    assert result.output.startswith(
+        "Usage: %(prog_name)s [OPTIONS] COMMAND" %
+        locals()
+    )
     assert all(
         v in result.output
-        for v in ["Options:", "--version", "--help", "Commands:", "lattice", "stream"]
+        for v in [
+            "Options:", "--version", "--help", "Commands:", "lattice",
+            "stream"
+        ]
     )
@@ -66,6 +72,26 @@ def test_cli_stream():
         assert format_error in result.output


+def test_cli_network():
+    with TemporaryDirectory() as tempdir:
+        infile = os.path.join(testdir, "budget.pdf")
+        outfile = os.path.join(tempdir, "budget.csv")
+        runner = CliRunner()
+        result = runner.invoke(
+            cli, ["--format", "csv", "--output", outfile, "network", infile]
+        )
+        assert result.exit_code == 0
+        assert result.output == "Found 1 tables\n"
+
+        result = runner.invoke(cli, ["--format", "csv", "network", infile])
+        output_error = "Error: Please specify output file path using --output"
+        assert output_error in result.output
+
+        result = runner.invoke(cli, ["--output", outfile, "network", infile])
+        format_error = "Please specify output file format using --format"
+        assert format_error in result.output
+
+
 def test_cli_password():
     with TemporaryDirectory() as tempdir:
         infile = os.path.join(testdir, "health_protected.pdf")
@@ -121,7 +147,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.json")
         result = runner.invoke(
             cli,
-            ["--format", "json", "--output", outfile, "stream", infile],
+            ["--format", "json", "--output", outfile.format("json"), "stream",
+             infile],
         )
         assert result.exit_code == 0

@@ -129,7 +156,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.xlsx")
         result = runner.invoke(
             cli,
-            ["--format", "excel", "--output", outfile, "stream", infile],
+            ["--format", "excel", "--output", outfile.format("xlsx"), "stream",
+             infile],
         )
         assert result.exit_code == 0

@@ -137,7 +165,8 @@ def test_cli_output_format():
         outfile = os.path.join(tempdir, "health.html")
         result = runner.invoke(
             cli,
-            ["--format", "html", "--output", outfile, "stream", infile],
+            ["--format", "html", "--output", outfile.format("html"), "stream",
+             infile],
         )
         assert result.exit_code == 0

@@ -170,6 +199,10 @@ def test_cli_quiet():
         assert "No tables found on page-1" in result.output

         result = runner.invoke(
-            cli, ["--quiet", "--format", "csv", "--output", outfile, "stream", infile]
+            cli,
+            [
+                "--quiet", "--format", "csv", "--output", outfile, "stream",
+                infile
+            ]
         )
         assert "No tables found on page-1" not in result.output
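The --quiet flag exercised here silences the "No tables found" warning; its library-side counterpart, used by the suppression tests further down this diff, is read_pdf's suppress_stdout argument. A minimal sketch (empty.pdf is a placeholder name):

    import camelot

    # Suppresses "No tables found on page-1" warnings/logs, as --quiet does.
    tables = camelot.read_pdf("empty.pdf", flavor="stream", suppress_stdout=True)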
tests/test_common.py
@@ -8,15 +8,20 @@ from pandas.testing import assert_frame_equal
 import camelot
 from camelot.core import Table, TableList
 from camelot.__version__ import generate_version
+# compare_tables used in console mode while debugging
+from camelot.utils import compare_tables  # noqa

 from .data import *


 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")


 def test_parsing_report():
-    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
+    parsing_report = {
+        "accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
+    }

     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
@@ -28,9 +33,11 @@ def test_password():

     filename = os.path.join(testdir, "health_protected.pdf")
     tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
+    assert len(tables) == 1
     assert_frame_equal(df, tables[0].df)

     tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+    assert len(tables) == 1
     assert_frame_equal(df, tables[0].df)

@@ -143,6 +150,194 @@ def test_stream_layout_kwargs():
     assert_frame_equal(df, tables[0].df)


+def test_network():
+    df = pd.DataFrame(data_stream)
+
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_rotated():
+    df = pd.DataFrame(data_network_table_rotated)
+
+    filename = os.path.join(testdir, "clockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_two_tables_a():
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+# Reported as https://github.com/camelot-dev/camelot/issues/132
+def test_network_two_tables_b():
+    df1 = pd.DataFrame(data_network_two_tables_b_1)
+    df2 = pd.DataFrame(data_network_two_tables_b_2)
+
+    filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+def test_network_vertical_header():
+    """Tests a complex table with a vertically text header.
+    """
+    df = pd.DataFrame(data_network_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_regions():
+    df = pd.DataFrame(data_network_table_regions)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    # The "stream" test looks for a region in ["320,460,573,335"], which
+    # should exclude the header.
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_regions=["320,335,573,505"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_table_areas():
+    df = pd.DataFrame(data_stream_table_areas)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", table_areas=["320,500,573,335"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_columns():
+    df = pd.DataFrame(data_stream_columns)
+
+    filename = os.path.join(testdir, "mexican_towns.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_split_text():
+    df = pd.DataFrame(data_network_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="network",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_flag_size():
+    df = pd.DataFrame(data_network_flag_size)
+
+    filename = os.path.join(testdir, "superscript.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_strip_text():
+    df = pd.DataFrame(data_network_strip_text)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_edge_tol():
+    df = pd.DataFrame(data_network_edge_tol)
+
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_network_layout_kwargs():
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="network", layout_kwargs={"detect_vertical": False}
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+# Hybrid parser
+def test_hybrid():
+    df = pd.DataFrame(data_hybrid)
+
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+def test_hybrid_two_tables():
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)
+
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+def test_hybrid_vertical_header():
+    """Tests a complex table with a vertically text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_process_background():
+    df = pd.DataFrame(data_hybrid_process_background)
+
+    filename = os.path.join(testdir, "background_lines_1.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", process_background=True)
+    assert_frame_equal(df, tables[1].df)
+
+
+def test_hybrid_split_text():
+    df = pd.DataFrame(data_network_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="hybrid",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+# Lattice parser tests
 def test_lattice():
     df = pd.DataFrame(data_lattice)

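Taken together, these tests pin down the public surface of the two new flavors. A minimal usage sketch, built only from calls that appear in this diff (file names as in the tests):

    import camelot

    # Text-based "network" parser introduced by this PR.
    tables = camelot.read_pdf("health.pdf", flavor="network")

    # "hybrid" parser, also new; the tests drive it with both text-side options
    # (columns, split_text) and line-side options (process_background).
    tables = camelot.read_pdf("health.pdf", flavor="hybrid")
    print(tables[0].parsing_report)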
@@ -229,9 +424,9 @@ def test_repr():
     tables = camelot.read_pdf(filename)
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"


 def test_pages():
@@ -239,22 +434,23 @@ def test_pages():
     tables = camelot.read_pdf(url)
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

     tables = camelot.read_pdf(url, pages="1-end")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

     tables = camelot.read_pdf(url, pages="all")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )

@@ -264,7 +460,8 @@ def test_url():
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
     )

@@ -284,7 +481,12 @@ def test_table_order():
         return t

     table_list = TableList(
-        [_make_table(2, 1), _make_table(1, 1), _make_table(3, 4), _make_table(1, 2)]
+        [
+            _make_table(2, 1),
+            _make_table(1, 1),
+            _make_table(3, 4),
+            _make_table(1, 2)
+        ]
     )

     assert [(t.page, t.order) for t in sorted(table_list)] == [
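The sorted() call in the last context line relies on tables ordering by (page, order); camelot's Table appears to define __lt__ accordingly. A self-contained sketch of that contract (hypothetical stand-in class, not camelot source):

    # Stand-in with the same ordering contract the test exercises.
    class T:
        def __init__(self, page, order):
            self.page, self.order = page, order

        def __lt__(self, other):
            return (self.page, self.order) < (other.page, other.order)

    tables = [T(2, 1), T(1, 1), T(3, 4), T(1, 2)]
    assert [(t.page, t.order) for t in sorted(tables)] == [
        (1, 1), (1, 2), (2, 1), (3, 4)
    ]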
tests/test_errors.py
@@ -14,32 +14,33 @@ filename = os.path.join(testdir, "foo.pdf")


 def test_unknown_flavor():
-    message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+    message = ("Unknown flavor specified."
+               " Use either 'lattice', 'stream', or 'network'")
     with pytest.raises(NotImplementedError, match=message):
-        tables = camelot.read_pdf(filename, flavor="chocolate")
+        camelot.read_pdf(filename, flavor='chocolate')


 def test_input_kwargs():
     message = "columns cannot be used with flavor='lattice'"
     with pytest.raises(ValueError, match=message):
-        tables = camelot.read_pdf(filename, columns=["10,20,30,40"])
+        camelot.read_pdf(filename, columns=['10,20,30,40'])


 def test_unsupported_format():
     message = "File format not supported"
     filename = os.path.join(testdir, "foo.csv")
     with pytest.raises(NotImplementedError, match=message):
-        tables = camelot.read_pdf(filename)
+        camelot.read_pdf(filename)


 def test_stream_equal_length():
     message = "Length of table_areas and columns" " should be equal"
     with pytest.raises(ValueError, match=message):
-        tables = camelot.read_pdf(
+        camelot.read_pdf(
             filename,
-            flavor="stream",
-            table_areas=["10,20,30,40"],
-            columns=["10,20,30,40", "10,20,30,40"],
+            flavor='stream',
+            table_areas=['10,20,30,40'],
+            columns=['10,20,30,40', '10,20,30,40']
         )

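As the updated message shows, 'network' joins the accepted flavors. A sketch of a guard consistent with these tests (hypothetical helper; the real check lives inside camelot.read_pdf, and the tests elsewhere in this diff also accept 'hybrid' even though the message does not mention it):

    def validate_flavor(flavor):
        # Hypothetical mirror of the updated error message.
        if flavor not in ("lattice", "stream", "network", "hybrid"):
            raise NotImplementedError(
                "Unknown flavor specified."
                " Use either 'lattice', 'stream', or 'network'"
            )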
@@ -48,11 +49,9 @@ def test_image_warning():
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         with pytest.raises(UserWarning) as e:
-            tables = camelot.read_pdf(filename)
-        assert (
-            str(e.value)
-            == "page-1 is image-based, camelot only works on text-based pages."
-        )
+            camelot.read_pdf(filename)
+        assert str(e.value) == 'page-1 is image-based, camelot only works ' \
+            'on text-based pages.'


 def test_no_tables_found():
@@ -60,8 +59,8 @@ def test_no_tables_found():
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         with pytest.raises(UserWarning) as e:
-            tables = camelot.read_pdf(filename)
-        assert str(e.value) == "No tables found on page-1"
+            camelot.read_pdf(filename)
+        assert str(e.value) == 'No tables found on page-1'


 def test_no_tables_found_logs_suppressed():
@@ -70,7 +69,7 @@ def test_no_tables_found_logs_suppressed():
         # the test should fail if any warning is thrown
         warnings.simplefilter("error")
         try:
-            tables = camelot.read_pdf(filename, suppress_stdout=True)
+            camelot.read_pdf(filename, suppress_stdout=True)
         except Warning as e:
             warning_text = str(e)
             pytest.fail(f"Unexpected warning: {warning_text}")
@@ -82,7 +81,7 @@ def test_no_tables_found_warnings_suppressed():
         # the test should fail if any warning is thrown
         warnings.simplefilter("error")
         try:
-            tables = camelot.read_pdf(filename, suppress_stdout=True)
+            camelot.read_pdf(filename, suppress_stdout=True)
         except Warning as e:
             warning_text = str(e)
             pytest.fail(f"Unexpected warning: {warning_text}")
@@ -92,11 +91,11 @@ def test_no_password():
     filename = os.path.join(testdir, "health_protected.pdf")
     message = "file has not been decrypted"
     with pytest.raises(Exception, match=message):
-        tables = camelot.read_pdf(filename)
+        camelot.read_pdf(filename)


 def test_bad_password():
     filename = os.path.join(testdir, "health_protected.pdf")
     message = "file has not been decrypted"
     with pytest.raises(Exception, match=message):
-        tables = camelot.read_pdf(filename, password="wrongpass")
+        camelot.read_pdf(filename, password='wrongpass')

tests/test_plotting.py
@@ -3,58 +3,144 @@
 import os

 import pytest
+import matplotlib

 import camelot

+# The version of Matplotlib has an impact on some of the tests. Unfortunately,
+# we can't enforce usage of a recent version of MatplotLib without dropping
+# support for Python 3.6.
+# To check the version of matplotlib installed:
+#   pip freeze | grep matplotlib
+# To force upgrade:
+#   pip install --upgrade --force-reinstall matplotlib
+# To force usage of a Python 3.6 compatible version:
+#   pip install "matplotlib==3.0.3"
+# This condition can be removed in favor of a version requirement bump for
+# matplotlib once support for Python 3.5 is dropped.
+
+LEGACY_MATPLOTLIB = matplotlib.__version__ < "3.2.1"
+
+# Bump the default plot tolerance from 2 to account for cross-platform testing
+# via Travis, and resulting minor font changes.
+TOLERANCE = 4

 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+def unit_test_stable_plot(table, kind):
+    if not LEGACY_MATPLOTLIB:
+        # See https://matplotlib.org/3.2.1/users/whats_new.html#kerning-adjustments-now-use-correct-values  # noqa
+        matplotlib.rcParams["text.kerning_factor"] = 6
+    return camelot.plot(table, kind=kind)
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_text_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="text")
+    return unit_test_stable_plot(tables[0], 'text')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_grid_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="grid")
+    return unit_test_stable_plot(tables[0], 'grid')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_stream_grid_plot():
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="stream")
+    return unit_test_stable_plot(tables[0], 'grid')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_grid_plot():
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="network")
+    return unit_test_stable_plot(tables[0], 'grid')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_lattice_contour_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="contour")
+    return unit_test_stable_plot(tables[0], 'contour')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_stream_contour_plot():
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
-    return camelot.plot(tables[0], kind="contour")
+    tables = camelot.read_pdf(filename, flavor='stream')
+    return unit_test_stable_plot(tables[0], 'contour')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_contour_plot():
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor='network')
+    return unit_test_stable_plot(tables[0], 'contour')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_line_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="line")
+    return unit_test_stable_plot(tables[0], 'line')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
 def test_joint_plot():
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(filename)
-    return camelot.plot(tables[0], kind="joint")
+    return unit_test_stable_plot(tables[0], 'joint')


-@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
-def test_textedge_plot():
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_stream_textedge_plot():
     filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
-    return camelot.plot(tables[0], kind="textedge")
+    tables = camelot.read_pdf(filename, flavor='stream')
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_textedge_plot():
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, debug=True, flavor='network')
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_table_regions_textedge_plot():
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network",
+        table_regions=["320,505,573,330"]
+    )
+    return unit_test_stable_plot(tables[0], 'textedge')
+
+
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
+def test_network_table_areas_text_plot():
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, debug=True, flavor="network",
+        table_areas=["320,500,573,335"]
+    )
+    return unit_test_stable_plot(tables[0], 'text')
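For anyone re-running these comparisons, the file now follows the pytest-mpl baseline workflow: each test returns a figure, and the plugin diffs it against a stored image within the given RMS tolerance. A minimal sketch, assuming pytest-mpl is installed and the test is run with `pytest --mpl`:

    import os

    import pytest

    import camelot

    TOLERANCE = 4  # matches the relaxed tolerance chosen above


    @pytest.mark.mpl_image_compare(
        baseline_dir="files/baseline_plots", remove_text=True,
        tolerance=TOLERANCE)
    def test_example_plot():
        # camelot.plot returns a matplotlib figure for pytest-mpl to compare.
        filename = os.path.join("tests", "files", "foo.pdf")
        tables = camelot.read_pdf(filename)
        return camelot.plot(tables[0], kind="grid")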