Moved duplicated common code to base objects

* Move table initialization common areas to BaseParser
* Stop relying on intermediate file name for source page index
* Create table comparison utility function to help in debugging
* Render the PDF page as an image in stream mode plots
* Fix pylint errors
2020-04-10 16:02:00 -07:00 · 2020-04-10 16:02:00 -07:00 · 467c4a3de0
parent dff9f5cd82
commit 467c4a3de0
17 changed files with 402 additions and 153 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -10,6 +10,11 @@ from operator import itemgetter
 import numpy as np
 import pandas as pd

+from .utils import (
+    compute_accuracy,
+    compute_whitespace,
+)
+

 # minimum number of vertical textline intersections for a textedge
 # to be considered valid
@ -479,6 +484,9 @@ class Table(object):
        self.whitespace = 0
        self.order = None
        self.page = None
+        self.flavor = None      # Flavor of the parser that generated the table
+        self.pdf_size = None    # Dimensions of the original PDF page
+        self.debug_info = None  # Field holding debug data

    def __repr__(self):
        return "<{} shape={}>".format(self.__class__.__name__, self.shape)
@ -513,6 +521,17 @@ class Table(object):
        }
        return report

+    def fill_data(self, parser):
+        """Fill in the table fields derived from its data and its parser.
+
+        Sets ``flavor``, ``debug_info``, ``df``, ``shape``, ``whitespace``
+        and ``pdf_size``.  ``accuracy`` is not touched here; the parsers
+        assign it themselves after calling this method.
+
+        Parameters
+        ----------
+        parser : object
+            Parser that produced this table.  Its ``id``, ``debug_info``,
+            ``pdf_width`` and ``pdf_height`` attributes are read.
+
+        """
+        self.flavor, self.debug_info = parser.id, parser.debug_info
+
+        # Read the cell data once and derive everything else from it.
+        cell_data = self.data
+        frame = pd.DataFrame(cell_data)
+        self.df = frame
+        self.shape = frame.shape
+        self.whitespace = compute_whitespace(cell_data)
+        self.pdf_size = (parser.pdf_width, parser.pdf_height)
+
+
    def set_all_edges(self):
        """Sets all table edges to True.
        """
@ -747,6 +766,7 @@ class Table(object):
            "encoding": "utf-8",
        }
        kw.update(kwargs)
+        # pylint: disable=abstract-class-instantiated
        writer = pd.ExcelWriter(path)
        self.df.to_excel(writer, **kw)
        writer.save()
@ -874,6 +894,7 @@ class TableList(object):
                self._compress_dir(**kwargs)
        elif f == "excel":
            filepath = os.path.join(dirname, basename)
+            # pylint: disable=abstract-class-instantiated
            writer = pd.ExcelWriter(filepath)
            for table in self._tables:
                sheet_name = "page-{}-table-{}".format(table.page, table.order)
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -101,26 +101,32 @@ class PDFHandler(object):
        temp : str
            Tmp directory.

+        Returns
+        -------
+        fpath : str
+            The path of the single page PDF created.
+
        """
+        fpath = os.path.join(temp, "page-{0}.pdf".format(page))
        with open(filepath, "rb") as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
-            fpath = os.path.join(temp, "page-{0}.pdf".format(page))
            froot, fext = os.path.splitext(fpath)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, "wb") as f:
                outfile.write(f)
-            layout, dim = get_page_layout(fpath)
+            layout, __ = get_page_layout(fpath)
            # fix rotated PDF
            chars = get_text_objects(layout, ltype="char")
            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
            vertical_text = get_text_objects(layout, ltype="vertical_text")
            rotation = get_rotation(chars, horizontal_text, vertical_text)
            if rotation != "":
-                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+                fpath_new = "".join(
+                    [froot.replace("page", "p"), "_rotated", fext])
                os.rename(fpath, fpath_new)
                infile = PdfFileReader(open(fpath_new, "rb"), strict=False)
                if infile.isEncrypted:
@ -134,9 +140,11 @@ class PDFHandler(object):
                outfile.addPage(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)
+        return fpath

    def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+        self, flavor="lattice", suppress_stdout=False,
+        layout_kwargs={}, **kwargs
    ):
        """Extracts tables by calling parser.get_tables on all single
        page PDFs.
@ -149,7 +157,7 @@ class PDFHandler(object):
        suppress_stdout : str (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: {})
-            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.  # noqa
        kwargs : dict
            See camelot.read_pdf kwargs.

@ -161,15 +169,22 @@ class PDFHandler(object):
        """
        tables = []
        with TemporaryDirectory() as tempdir:
-            for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
-            pages = [
-                os.path.join(tempdir, "page-{0}.pdf".format(p)) for p in self.pages
-            ]
-            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
+            parser = \
+                Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
+
+            # For each of the pages we need to parse, generate a single page
+            # .pdf in a temporary folder.
+            for page_idx in self.pages:
+                single_page_pdf_file = self._save_page(
+                    self.filepath,
+                    page_idx,
+                    tempdir
+                )
                t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
+                    single_page_pdf_file,
+                    page_idx,
+                    suppress_stdout=suppress_stdout,
+                    layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -2,11 +2,13 @@

 from __future__ import division

-import cv2
+from cv2 import cv2
 import numpy as np


-def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
+def adaptive_threshold(
+    imagename, process_background=False, blocksize=15, c=-2
+):
    """Thresholds an image using OpenCV's adaptiveThreshold.

    Parameters
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
    c : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa

    Returns
    -------
@ -39,7 +41,9 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):

    if process_background:
        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
+            gray, 255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, blocksize, c
        )
    else:
        threshold = cv2.adaptiveThreshold(
@ -54,7 +58,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):


 def find_lines(
-    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None, direction="horizontal",
+    line_scale=15, iterations=0
 ):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.
@ -78,7 +83,7 @@ def find_lines(
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa

    Returns
    -------
@ -100,13 +105,14 @@ def find_lines(
        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
-        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
+        raise ValueError("Specify direction as either 'vertical' "
+                         "or 'horizontal'")

    if regions is not None:
        region_mask = np.zeros(threshold.shape)
        for region in regions:
            x, y, w, h = region
-            region_mask[y : y + h, x : x + w] = 1
+            region_mask[y: y + h, x: x + w] = 1
        threshold = np.multiply(threshold, region_mask)

    threshold = cv2.erode(threshold, el)
@ -115,12 +121,16 @@ def find_lines(

    try:
        _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8),
+            cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE
        )
    except ValueError:
        # for opencv backward compatibility
        contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8),
+            cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE
        )

    for c in contours:
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
    tables = {}
    for c in contours:
        x, y, w, h = c
-        roi = joints[y : y + h, x : x + w]
+        roi = joints[y: y + h, x: x + w]
        try:
            __, jc, __ = cv2.findContours(
                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -2,19 +2,94 @@

 import os

-from ..utils import get_page_layout, get_text_objects
+from ..utils import (
+    get_page_layout,
+    get_text_objects
+)
+from ..core import Table
+
+from ..image_processing import (
+    adaptive_threshold,
+    find_lines,
+    find_contours,
+    find_joints
+)
+
+# Pylint can't detect contents of cv2
+from cv2 import imread  # pylint: disable=no-name-in-module


 class BaseParser(object):
    """Defines a base parser.
    """
+    def __init__(self, parser_id):
+        """Set up the state shared by the parser flavors.
+
+        Parameters
+        ----------
+        parser_id : str
+            Identifier of the parser flavor, stored as ``self.id``.
+            The subclasses in this change pass "lattice" or "stream".
+
+        """
+        # Path of the rendered page image; None until it is generated.
+        self.imagename = None
+        # Rendered page image; None until it is loaded.
+        self.pdf_image = None
+        self.id = parser_id

-    def _generate_layout(self, filename, layout_kwargs):
+        # For plotting details of parsing algorithms
+        self.debug_info = {}
+
+    def _generate_layout(self, filename, page_idx, layout_kwargs):
        self.filename = filename
        self.layout_kwargs = layout_kwargs
-        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
+        self.layout, self.dimensions = get_page_layout(
+            filename,
+            **layout_kwargs
+        )
        self.images = get_text_objects(self.layout, ltype="image")
-        self.horizontal_text = get_text_objects(self.layout, ltype="horizontal_text")
-        self.vertical_text = get_text_objects(self.layout, ltype="vertical_text")
+        self.horizontal_text = get_text_objects(
+            self.layout,
+            ltype="horizontal_text"
+        )
+        self.vertical_text = get_text_objects(
+            self.layout,
+            ltype="vertical_text"
+        )
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname, __ = os.path.splitext(self.filename)
+
+        self.page = page_idx
+
+    def generate_image(self):
+        """Make sure ``self.pdf_image`` holds the image of the current page.
+
+        The page is rendered to a PNG by ``_generate_image_file`` and then
+        read with ``imread``.  A parser instance is reused for every page
+        of a document, so the cached image is only kept while it matches
+        the page set up by the last ``_generate_layout`` call.
+        """
+        expected_imagename = "".join([self.rootname, ".png"])
+        if self.pdf_image is None or self.imagename != expected_imagename:
+            self._generate_image_file()
+            self.pdf_image = imread(self.imagename)
+
+    def _generate_image_file(self):
+        """Render the current single page PDF to ``<rootname>.png``.
+
+        Does nothing when the image of the current page was already
+        rendered.  On success ``self.imagename`` holds the PNG path.
+        """
+        imagename = "".join([self.rootname, ".png"])
+        # Compare against the expected path instead of testing truthiness:
+        # the same parser processes several pages, each with its own file.
+        if self.imagename == imagename:
+            return
+        from ..ext.ghostscript import Ghostscript
+
+        # Build the argument list directly so that paths containing
+        # whitespace are not split into several arguments.
+        gs_call = [
+            arg.encode() for arg in (
+                "-q", "-sDEVICE=png16m", "-o", imagename,
+                "-r300", self.filename,
+            )
+        ]
+        # The context manager closes the null device even if the
+        # Ghostscript call raises.
+        with open(os.devnull, "wb") as null:
+            # NOTE(review): Lattice previously used Ghostscript as a
+            # context manager; confirm no explicit cleanup is needed.
+            Ghostscript(*gs_call, stdout=null)
+        # Only record the path once the file has actually been rendered.
+        self.imagename = imagename
+
+    def _initialize_new_table(self, table_idx, cols, rows):
+        """Initialize new table object, ready to be populated
+
+        Parameters
+        ----------
+        table_idx : int
+            Index of this table within the pdf page analyzed
+        cols : list
+            list of coordinate boundaries tuples (left, right)
+        rows : list
+            list of coordinate boundaries tuples (bottom, top)
+
+        Returns
+        -------
+        table : camelot.core.Table
+            New table with ``page`` set to the page being parsed and
+            ``order`` set to ``table_idx + 1``.
+
+        """
+        table = Table(cols, rows)
+        table.page = self.page
+        # Table order is 1-based, table_idx is 0-based.
+        table.order = table_idx + 1
+        return table
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -13,7 +13,6 @@ import numpy as np
 import pandas as pd

 from .base import BaseParser
-from ..core import Table
 from ..utils import (
    scale_image,
    scale_pdf,
@ -22,7 +21,6 @@ from ..utils import (
    merge_close_lines,
    get_table_index,
    compute_accuracy,
-    compute_whitespace,
 )
 from ..image_processing import (
    adaptive_threshold,
@ -80,7 +78,7 @@ class Lattice(BaseParser):
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
@ -114,6 +112,7 @@ class Lattice(BaseParser):
        resolution=300,
        **kwargs
    ):
+        super().__init__("lattice")
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.process_background = process_background
@ -208,19 +207,6 @@ class Lattice(BaseParser):
                                t.cells[i][j].text = t.cells[i - 1][j].text
        return t

-    def _generate_image(self):
-        from ..ext.ghostscript import Ghostscript
-
-        self.imagename = "".join([self.rootname, ".png"])
-        gs_call = "-q -sDEVICE=png16m -o {} -r300 {}".format(
-            self.imagename, self.filename
-        )
-        gs_call = gs_call.encode().split()
-        null = open(os.devnull, "wb")
-        with Ghostscript(*gs_call, stdout=null) as gs:
-            pass
-        null.close()
-
    def _generate_table_bbox(self):
        def scale_areas(areas):
            scaled_areas = []
@ -234,20 +220,21 @@ class Lattice(BaseParser):
                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            return scaled_areas

-        self.image, self.threshold = adaptive_threshold(
+        self.pdf_image, self.threshold = adaptive_threshold(
            self.imagename,
            process_background=self.process_background,
            blocksize=self.threshold_blocksize,
            c=self.threshold_constant,
        )

-        image_width = self.image.shape[1]
-        image_height = self.image.shape[0]
+        image_width = self.pdf_image.shape[1]
+        image_height = self.pdf_image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
        image_height_scaler = image_height / float(self.pdf_height)
        pdf_width_scaler = self.pdf_width / float(image_width)
        pdf_height_scaler = self.pdf_height / float(image_height)
-        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
+        image_scalers = (image_width_scaler,
+                         image_height_scaler, self.pdf_height)
        pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

        if self.table_areas is None:
@ -291,7 +278,11 @@ class Lattice(BaseParser):

        self.table_bbox_unscaled = copy.deepcopy(table_bbox)

-        self.table_bbox, self.vertical_segments, self.horizontal_segments = scale_image(
+        [
+            self.table_bbox,
+            self.vertical_segments,
+            self.horizontal_segments
+        ] = scale_image(
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers
        )

@ -315,7 +306,10 @@ class Lattice(BaseParser):
        rows.extend([tk[1], tk[3]])
        # sort horizontal and vertical segments
        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
-        rows = merge_close_lines(sorted(rows, reverse=True), line_tol=self.line_tol)
+        rows = merge_close_lines(
+            sorted(rows, reverse=True),
+            line_tol=self.line_tol
+        )
        # make grid using x and y coord of shortlisted rows and cols
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
@ -328,7 +322,7 @@ class Lattice(BaseParser):
        if v_s is None or h_s is None:
            raise ValueError("No segments found on {}".format(self.rootname))

-        table = Table(cols, rows)
+        table = self._initialize_new_table(table_idx, cols, rows)
        # set table edges to True using ver+hor lines
        table = table.set_edges(v_s, h_s, joint_tol=self.joint_tol)
        # set table border edges to True
@ -359,48 +353,44 @@ class Lattice(BaseParser):
        accuracy = compute_accuracy([[100, pos_errors]])

        if self.copy_text is not None:
-            table = Lattice._copy_spanning_text(table, copy_text=self.copy_text)
+            table = Lattice._copy_spanning_text(
+                table,
+                copy_text=self.copy_text
+            )

-        data = table.data
-        table.df = pd.DataFrame(data)
-        table.shape = table.df.shape
-
-        whitespace = compute_whitespace(data)
-        table.flavor = "lattice"
+        table.fill_data(self)
        table.accuracy = accuracy
-        table.whitespace = whitespace
-        table.order = table_idx + 1
-        table.page = int(os.path.basename(self.rootname).replace("page-", ""))

        # for plotting
        _text = []
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
-        table._image = (self.image, self.table_bbox_unscaled)
+        table._image = (self.pdf_image, self.table_bbox_unscaled)
        table._segments = (self.vertical_segments, self.horizontal_segments)
        table._textedges = None

        return table

-    def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
-        self._generate_layout(filename, layout_kwargs)
+    def extract_tables(self, filename,  page_idx=1, suppress_stdout=False,
+                       layout_kwargs={}):
+        self._generate_layout(filename, page_idx, layout_kwargs)
        if not suppress_stdout:
-            logger.info("Processing {}".format(os.path.basename(self.rootname)))
+            logger.info(f"Processing {os.path.basename(self.rootname)}")

        if not self.horizontal_text:
            if self.images:
                warnings.warn(
-                    "{} is image-based, camelot only works on"
-                    " text-based pages.".format(os.path.basename(self.rootname))
+                    f"{os.path.basename(self.rootname)} is image-based, "
+                    "camelot only works on text-based pages."
                )
            else:
                warnings.warn(
-                    "No tables found on {}".format(os.path.basename(self.rootname))
+                    f"No tables found on {os.path.basename(self.rootname)}"
                )
            return []

-        self._generate_image()
+        self._generate_image_file()
        self._generate_table_bbox()

        _tables = []
@ -408,8 +398,10 @@ class Lattice(BaseParser):
        for table_idx, tk in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
-            cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
-            table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
+            cols, rows, v_s, h_s = self._generate_columns_and_rows(
+                table_idx, tk)
+            table = self._generate_table(
+                table_idx, cols, rows, v_s=v_s, h_s=h_s)
            table._bbox = tk
            _tables.append(table)

--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -9,7 +9,7 @@ import numpy as np
 import pandas as pd

 from .base import BaseParser
-from ..core import TextEdges, Table
+from ..core import TextEdges
 from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
                     compute_whitespace)

@ -69,6 +69,7 @@ class Stream(BaseParser):
        column_tol=0,
        **kwargs
    ):
+        super().__init__("stream")
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.columns = columns
@ -120,21 +121,26 @@ class Stream(BaseParser):
            Two-dimensional list of text objects grouped into rows.

        """
-        row_y = 0
+        row_y = None
        rows = []
        temp = []
-        for t in text:
+        non_empty_text = [t for t in text if t.get_text().strip()]
+        for t in non_empty_text:
            # is checking for upright necessary?
-            # if t.get_text().strip() and all([obj.upright for obj in t._objs
+            # if t.get_text().strip() and all([obj.upright \
+            #   for obj in t._objs
            # if type(obj) is LTChar]):
-            if t.get_text().strip():
-                if not np.isclose(row_y, t.y0, atol=row_tol):
+            if row_y is not None and \
+              not np.isclose(row_y, t.y0, atol=row_tol) and \
+              0.5 * (t.y1 + t.y0) < row_y:
                rows.append(sorted(temp, key=lambda t: t.x0))
                temp = []
+            # We update the row's bottom as we go, to be forgiving if there
+            # is a gradual change across multiple columns.
            row_y = t.y0
+
            temp.append(t)
        rows.append(sorted(temp, key=lambda t: t.x0))
-        __ = rows.pop(0)  # TODO: hacky
        return rows

    @staticmethod
@ -278,7 +284,7 @@ class Stream(BaseParser):
    def _nurminen_table_detection(self, textlines):
        """A general implementation of the table detection algorithm
        described by Anssi Nurminen's master's thesis.
-        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
+        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa

        Assumes that tables are situated relatively far apart
        vertically.
@ -378,12 +384,29 @@ class Stream(BaseParser):
                        "No tables found in table area {}"
                        .format(table_idx + 1)
                    )
-            cols = [
-                (t.x0, t.x1) for r in rows_grouped if len(r) == ncols
-                for t in r
+
+            # Identify rows which contain the mode of the number of columns
+            full_rows = list(filter(
+                lambda row: len(row) == ncols,
+                rows_grouped))
+            cells_on_full_rows_xrange = [
+                (t.x0, t.x1) for r in full_rows for t in r
            ]
-            cols = self._merge_columns(sorted(cols),
+            # TODO: fixme / make a decision on this
+            # plausible_rows = list(filter(
+            #     lambda row: len(row) <= ncols*1.2 and len(row) >= ncols*.8,
+            #     rows_grouped))
+            # plausible_cells_xrange = [
+            #     (t.x0, t.x1) for r in plausible_rows for t in r
+            # ]
+            # self.debug_info['plausible_rows'] = plausible_rows
+
+            # Identify column boundaries based on the contents of these rows
+            cols = self._merge_columns(sorted(cells_on_full_rows_xrange),
                                       column_tol=self.column_tol)
+            # cols = self._merge_columns(sorted(plausible_cells_xrange),
+            #                            column_tol=self.column_tol)
+
            inner_text = []
            for i in range(1, len(cols)):
                left = cols[i - 1][1]
@ -409,7 +432,7 @@ class Stream(BaseParser):
        return cols, rows

    def _generate_table(self, table_idx, cols, rows, **kwargs):
-        table = Table(cols, rows)
+        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()

        pos_errors = []
@ -431,31 +454,25 @@ class Stream(BaseParser):
                        table.cells[r_idx][c_idx].text = text
        accuracy = compute_accuracy([[100, pos_errors]])

-        data = table.data
-        table.df = pd.DataFrame(data)
-        table.shape = table.df.shape
+        table.fill_data(self)

-        whitespace = compute_whitespace(data)
-        table.flavor = "stream"
        table.accuracy = accuracy
-        table.whitespace = whitespace
-        table.order = table_idx + 1
-        table.page = int(os.path.basename(self.rootname).replace("page-", ""))

        # for plotting
        _text = []
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
-        table._image = None
+        self.generate_image()
+        table._image = (self.pdf_image, self.table_bbox)
        table._segments = None
        table._textedges = self.textedges

        return table

-    def extract_tables(self, filename, suppress_stdout=False,
+    def extract_tables(self, filename, page_idx=1, suppress_stdout=False,
                       layout_kwargs={}):
-        self._generate_layout(filename, layout_kwargs)
+        self._generate_layout(filename, page_idx, layout_kwargs)
        if not suppress_stdout:
            logger.info("Processing {}".format(
                os.path.basename(self.rootname)))
@ -474,6 +491,8 @@ class Stream(BaseParser):
                )
            return []

+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
        self._generate_table_bbox()

        _tables = []
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -37,7 +37,7 @@ class PlotMethods(object):
            raise NotImplementedError(
                "Lattice flavor does not support kind='{}'".format(kind)
            )
-        elif table.flavor == "stream" and kind in ["joint", "line"]:
+        elif table.flavor == "stream" and kind in ["line"]:
            raise NotImplementedError(
                "Stream flavor does not support kind='{}'".format(kind)
            )
@ -64,9 +64,18 @@ class PlotMethods(object):
        for t in table._text:
            xs.extend([t[0], t[2]])
            ys.extend([t[1], t[3]])
-            ax.add_patch(patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1]))
+            ax.add_patch(
+                patches.Rectangle(
+                        (t[0], t[1]),
+                        t[2] - t[0],
+                        t[3] - t[1],
+                        alpha=0.5
+                    )
+                )
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        img, __ = table._image
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig

    def grid(self, table):
@ -94,6 +103,9 @@ class PlotMethods(object):
                    ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
                if cell.bottom:
                    ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
+
+        img, __ = table._image
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig

    def contour(self, table):
@ -109,12 +121,8 @@ class PlotMethods(object):
        fig : matplotlib.fig.Figure

        """
-        try:
        img, table_bbox = table._image
-            _FOR_LATTICE = True
-        except TypeError:
-            img, table_bbox = (None, {table._bbox: None})
-            _FOR_LATTICE = False
+        _FOR_LATTICE = table.flavor == "lattice"
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")

@ -132,7 +140,8 @@ class PlotMethods(object):
        for t in table_bbox.keys():
            ax.add_patch(
                patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1], fill=False, color="red"
+                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
+                    fill=False, color="red"
                )
            )
            if not _FOR_LATTICE:
@ -143,6 +152,8 @@ class PlotMethods(object):

        if _FOR_LATTICE:
            ax.imshow(img)
+        else:
+            ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig

    def textedge(self, table):
@ -164,7 +175,11 @@ class PlotMethods(object):
            xs.extend([t[0], t[2]])
            ys.extend([t[1], t[3]])
            ax.add_patch(
-                patches.Rectangle((t[0], t[1]), t[2] - t[0], t[3] - t[1], color="blue")
+                patches.Rectangle(
+                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
+                    color="blue",
+                    alpha=0.5
+                )
            )
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
@ -172,6 +187,8 @@ class PlotMethods(object):
        for te in table._textedges:
            ax.plot([te.x, te.x], [te.y0, te.y1])

+        img, __ = table._image
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig

    def joint(self, table):
@ -220,4 +237,8 @@ class PlotMethods(object):
            ax.plot([v[0], v[2]], [v[1], v[3]])
        for h in horizontal:
            ax.plot([h[0], h[2]], [h[1], h[3]])
+
+        img, __ = table._image
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
+
        return fig
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -13,6 +13,7 @@ from itertools import groupby
 from operator import itemgetter

 import numpy as np
+import pandas as pd
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@ -30,6 +31,9 @@ from pdfminer.layout import (
 )


+# pylint: disable=import-error
+# PyLint will evaluate both branches, and will necessarily complain about one
+# of them.
 PY3 = sys.version_info[0] >= 3
 if PY3:
    from urllib.request import urlopen
@ -310,7 +314,8 @@ def get_rotation(chars, horizontal_text, vertical_text):
    if hlen < vlen:
        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
-        rotation = "anticlockwise" if clockwise < anticlockwise else "clockwise"
+        rotation = "anticlockwise" if clockwise < anticlockwise \
+            else "clockwise"
    return rotation


@ -341,12 +346,16 @@ def segments_in_bbox(bbox, v_segments, h_segments):
    v_s = [
        v
        for v in v_segments
-        if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2
+        if v[1] > lb[1] - 2 and
+        v[3] < rt[1] + 2 and
+        lb[0] - 2 <= v[0] <= rt[0] + 2
    ]
    h_s = [
        h
        for h in h_segments
-        if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2
+        if h[0] > lb[0] - 2 and
+        h[2] < rt[0] + 2 and
+        lb[1] - 2 <= h[1] <= rt[1] + 2
    ]
    return v_s, h_s

@ -464,10 +473,10 @@ def flag_font_size(textline, direction, strip_text=""):
            for t in textline
            if not isinstance(t, LTAnno)
        ]
-    l = [np.round(size, decimals=6) for text, size in d]
-    if len(set(l)) > 1:
+    text_sizes = [np.round(size, decimals=6) for text, size in d]
+    if len(set(text_sizes)) > 1:
        flist = []
-        min_size = min(l)
+        min_size = min(text_sizes)
        for key, chars in groupby(d, itemgetter(1)):
            if key == min_size:
                fchars = [t[0] for t in chars]
@ -511,7 +520,6 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
        of row/column and text is the an lttextline substring.

    """
-    idx = 0
    cut_text = []
    bbox = textline.bbox
    try:
@ -528,7 +536,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
            ]
            r = r_idx[0]
            x_cuts = [
-                (c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right
+                (c, table.cells[r][c].x2)
+                for c in x_overlap
+                if table.cells[r][c].right
            ]
            if not x_cuts:
                x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
@ -561,7 +571,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
            ]
            c = c_idx[0]
            y_cuts = [
-                (r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom
+                (r, table.cells[r][c].y1)
+                for r in y_overlap
+                if table.cells[r][c].bottom
            ]
            if not y_cuts:
                y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
@ -644,9 +656,8 @@ def get_table_index(
    """
    r_idx, c_idx = [-1] * 2
    for r in range(len(table.rows)):
-        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and (t.y0 + t.y1) / 2.0 > table.rows[
-            r
-        ][1]:
+        if (t.y0 + t.y1) / 2.0 < table.rows[r][0] and \
+           (t.y0 + t.y1) / 2.0 > table.rows[r][1]:
            lt_col_overlap = []
            for c in table.cols:
                if c[0] <= t.x1 and c[1] >= t.x0:
@ -681,7 +692,9 @@ def get_table_index(
    X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
    Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
    charea = X * Y
-    error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
+    error = (
+        (X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))
+    ) / charea

    if split_text:
        return (
@ -697,13 +710,16 @@ def get_table_index(
                    (
                        r_idx,
                        c_idx,
-                        flag_font_size(t._objs, direction, strip_text=strip_text),
+                        flag_font_size(t._objs,
+                                       direction,
+                                       strip_text=strip_text),
                    )
                ],
                error,
            )
        else:
-            return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
+            return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], \
+                error


 def compute_accuracy(error_weights):
@ -751,7 +767,6 @@ def compute_whitespace(d):

    """
    whitespace = 0
-    r_nempty_cells, c_nempty_cells = [], []
    for i in d:
        for j in i:
            if j.strip() == "":
@ -852,3 +867,78 @@ def get_text_objects(layout, ltype="char", t=None):
    except AttributeError:
        pass
    return t
+
+
+def compare_tables(left, right):
+    """Compare two tables and print their differences in a readable form.
+
+    Shape differences are reported first. Tables with the same number of
+    rows are then compared column by column; otherwise, tables with the
+    same number of columns are compared row by row. Columns and rows are
+    matched by position, and only the first mismatch is printed.
+
+    Parameters
+    ----------
+    left : pandas.DataFrame
+    right : pandas.DataFrame
+    """
+    diff_cols = right.shape[1] - left.shape[1]
+    diff_rows = right.shape[0] - left.shape[0]
+    differences = []
+    if diff_rows:
+        differences.append(
+            f"{abs(diff_rows)} "
+            f"{'more' if diff_rows > 0 else 'fewer'} rows"
+        )
+    if diff_cols:
+        differences.append(
+            f"{abs(diff_cols)} "
+            f"{'more' if diff_cols > 0 else 'fewer'} columns"
+        )
+    if differences:
+        differences_str = " and ".join(differences)
+        print(f"Right has {differences_str} than left "
+              f"[{right.shape[0]},{right.shape[1]}] vs "
+              f"[{left.shape[0]},{left.shape[1]}]")
+
+    table1, table2 = left, right
+    name_table1, name_table2 = "left", "right"
+    if not diff_rows:
+        # Same number of rows: compare columns since they're of the same length
+        if diff_cols > 0:
+            # Use the widest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for i in range(table1.shape[1]):
+            lcol = table1.iloc[:, i]
+            if i >= table2.shape[1]:
+                print(f"Column {i} unique to {name_table1}: {lcol}")
+                break
+            scol = table2.iloc[:, i]
+            if not lcol.equals(scol):
+                # Label each series with the table it actually comes from
+                diff_df = pd.DataFrame()
+                diff_df[name_table1] = lcol
+                diff_df[name_table2] = scol
+                diff_df["Match"] = lcol == scol
+                print(f"Column {i} different:\n{diff_df}")
+                break
+    elif not diff_cols:
+        # Same number of cols: compare rows since they're of the same length
+        if diff_rows > 0:
+            # Use the longest table as a reference
+            table1, table2 = table2, table1
+            name_table1, name_table2 = name_table2, name_table1
+        for i in range(table1.shape[0]):
+            lrow = table1.iloc[i, :]
+            if i >= table2.shape[0]:
+                print(f"Row {i} unique to {name_table1}: {lrow}")
+                break
+            srow = table2.iloc[i, :]
+            if not lrow.equals(srow):
+                # DataFrame.append() no longer exists in pandas 2.0
+                diff_df = pd.DataFrame([lrow, srow]).reset_index(drop=True)
+                diff_df.insert(0, 'Table', [name_table1, name_table2])
+                print(f"Row {i} differs:")
+                print(diff_df.values)
+                break
--- a/tests/data.py
+++ b/tests/data.py
@ -838,7 +838,7 @@ data_stream_two_tables_1 = [
        "2,330 .9",
    ],
    [
-        "Violent crime   .  .  .  .  .  .  .  .\n .  .\n .  .\n .  .\n" \
+        "Violent crime   .  .  .  .  .  .  .  .\n .  .\n .  .\n .  .\n"
        " .  .\n .  .",
        "467 .9",
        "69 .1",
@ -1503,15 +1503,8 @@ data_stream_table_areas = [
 ]

 data_stream_columns = [
-    [
-        "Clave",
-        "Nombre Entidad",
-        "Clave",
-        "Nombre Municipio",
-        "Clave",
-        "Nombre Localidad",
-    ],
-    ["Entidad", "", "Municipio", "", "Localidad", ""],
+    ["Clave \nEntidad", "Nombre Entidad", "Clave \nMunicipio",
+     "Nombre Municipio", "Clave \nLocalidad", "Nombre Localidad"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0094", "Granja Adelita"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0096", "Agua Azul"],
    ["01", "Aguascalientes", "001", "Aguascalientes", "0100", "Rancho Alegre"],
@ -2732,11 +2725,9 @@ data_stream_vertical_headers = [
    ['', '', '', '', '', '', '', '', '', '', '', 'Congress-',
        'Senator 36th', 'Rep106th', '', 'Reg. of', '', 'Road', '', '',
        'Distri', 'Dist', '', '', 'Dist'],
-    ['', '', '', '', '', '', '', '', '', '', '1st Dist', '', 'Dist.',
-        'Dist.', '', 'Deeds', '', 'Commission', '', 'District #1',
-        'ct #2', '#3', 'Dist #4', '', '#5'],
-    ['', '', '', '', '', 'Governor', '', '', 'U.S. Senator', '', '',
-        '', '', '', '', '', '', '', '', '', '', '', '', '', ''],
+    ['', '', '', '', '', 'Governor', '', '', 'U.S. Senator', '',
+        '1st Dist', '', 'Dist.', 'Dist.', '', 'Deeds', '', 'Commission',
+        '', 'District #1', 'ct #2', '#3', 'Dist #4', '', '#5'],
    ['', 'Number of Registered voters', 'Poll Book Totals',
        'Brian Calley', 'Patrick Colbeck', 'Jim Hines', 'Bill Schuette',
        'John James', 'Sandy Pensler', '', 'Jack Bergman', '',
--- a/tests/files/baseline_plots/test_grid_plot.png
+++ b/tests/files/baseline_plots/test_grid_plot.png
--- a/tests/files/baseline_plots/test_line_plot.png
+++ b/tests/files/baseline_plots/test_line_plot.png
--- a/tests/files/baseline_plots/test_stream_contour_plot.png
+++ b/tests/files/baseline_plots/test_stream_contour_plot.png
--- a/tests/files/baseline_plots/test_stream_grid_plot.png
+++ b/tests/files/baseline_plots/test_stream_grid_plot.png
--- a/tests/files/baseline_plots/test_text_plot.png
+++ b/tests/files/baseline_plots/test_text_plot.png
--- a/tests/files/baseline_plots/test_textedge_plot.png
+++ b/tests/files/baseline_plots/test_textedge_plot.png
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -9,10 +9,12 @@ from pandas.testing import assert_frame_equal

 import camelot
 from camelot.core import Table, TableList
+from camelot.utils import compare_tables
 from camelot.__version__ import generate_version

 from .data import *

+
 import pdfminer

 # The version of PDFMiner has an impact on some of the tests.  Unfortunately,
@ -48,9 +50,11 @@ def test_password():

    filename = os.path.join(testdir, "health_protected.pdf")
    tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
+    assert len(tables) == 1
    assert_frame_equal(df, tables[0].df)

    tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+    assert len(tables) == 1
    assert_frame_equal(df, tables[0].df)


@ -59,6 +63,7 @@ def test_stream():

    filename = os.path.join(testdir, "health.pdf")
    tables = camelot.read_pdf(filename, flavor="stream")
+    assert len(tables) == 1
    assert_frame_equal(df, tables[0].df)


@ -79,6 +84,7 @@ def test_stream_table_rotated():

    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
    tables = camelot.read_pdf(filename, flavor="stream")
+    assert len(tables) == 1
    result_without_first_row = pd.DataFrame(
        tables[0].df.drop(tables[0].df.columns[0], axis=1).values)
    assert_frame_equal(df, result_without_first_row)
@ -275,9 +281,9 @@ def test_repr():
    tables = camelot.read_pdf(filename)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"


 def test_pages():
@ -285,22 +291,23 @@ def test_pages():
    tables = camelot.read_pdf(url)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

    tables = camelot.read_pdf(url, pages="1-end")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
-    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
-    )
+    assert \
+        repr(tables[0].cells[0][0]) == \
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"

    tables = camelot.read_pdf(url, pages="all")
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )


@ -310,7 +317,8 @@ def test_url():
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
-        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+        repr(tables[0].cells[0][0]) ==
+        "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )


--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@ -43,6 +43,13 @@ def test_grid_plot():
    tables = camelot.read_pdf(filename)
    return camelot.plot(tables[0], kind='grid')

+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True)
+def test_stream_grid_plot():
+    """Grid plot of the first table the stream flavor finds in foo.pdf."""
+    pdf = os.path.join(testdir, "foo.pdf")
+    return camelot.plot(camelot.read_pdf(pdf, flavor="stream")[0], kind='grid')
+

@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)