WIP: Introduce actual hybrid parser

Create a hybrid parser that leverages both lattice and network techniques. Simplify plotting of the PDF in the lattice case. Rename "parser.table_bbox" to "parser.table_bbox_parses", since it represents not a bbox but a dict mapping each bbox to its corresponding parsing data. Still missing: more unit tests, plotting of steps.
2020-05-04 16:27:01 -07:00 · 2020-05-04 16:27:01 -07:00 · 77d289bd86
parent 6711f877bf
commit 77d289bd86
17 changed files with 1011 additions and 217 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -396,7 +396,8 @@ def network(c, *args, **kwargs):
                "Please specify output file format using --format")
    tables = read_pdf(
-        filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="network",
        suppress_stdout=quiet, **kwargs
    )
    click.echo("Found {} tables".format(tables.n))
    if plot_type is not None:
--- a/camelot/core.py
+++ b/camelot/core.py
@ -454,7 +454,9 @@ class Table():
        self.page = None
        self.flavor = None         # Flavor of the parser used
        self.pdf_size = None       # Dimensions of the original PDF page
-        self.parse_details = None  # Field holding debug data
+        self._bbox = None          # Bounding box in original document
        self.parse = None          # Parse information
        self.parse_details = None  # Field holding extra debug data
        self._image = None
        self._image_path = None  # Temporary file to hold an image of the pdf
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -7,7 +7,7 @@ import logging
 from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
-from .parsers import Stream, Lattice, Network
+from .parsers import Stream, Lattice, Network, Hybrid
 from .utils import (
    build_file_path_in_temp_dir,
    get_page_layout,
@ -23,6 +23,7 @@ PARSERS = {
    "lattice": Lattice,
    "stream": Stream,
    "network": Network,
    "hybrid": Hybrid,
 }
@ -177,7 +178,8 @@ class PDFHandler():
        Parameters
        ----------
        flavor : str (default: 'lattice')
-            The parsing method to use ('lattice', 'stream', or 'network').
+            The parsing method to use ('lattice', 'stream', 'network',
            or 'hybrid').
            Lattice is used by default.
        suppress_stdout : str (default: False)
            Suppress logs and warnings.
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -6,7 +6,9 @@ import cv2
 import numpy as np
-def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
+def adaptive_threshold(
        imagename, process_background=False,
        blocksize=15, c=-2):
    """Thresholds an image using OpenCV's adaptiveThreshold.
    Parameters
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
    c : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
    Returns
    -------
@ -39,7 +41,10 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    if process_background:
        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
+            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, blocksize, c
        )
    else:
        threshold = cv2.adaptiveThreshold(
@ -54,7 +59,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
 def find_lines(
-    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None,
    direction="horizontal", line_scale=15, iterations=0
 ):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.
@ -78,7 +84,7 @@ def find_lines(
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.
-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa
    Returns
    -------
@ -100,13 +106,15 @@ def find_lines(
        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
-        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
+        raise ValueError(
            "Specify direction as either 'vertical' or 'horizontal'"
        )
    if regions is not None:
        region_mask = np.zeros(threshold.shape)
        for region in regions:
            x, y, w, h = region
-            region_mask[y : y + h, x : x + w] = 1
+            region_mask[y:y + h, x:x + w] = 1
        threshold = np.multiply(threshold, region_mask)
    threshold = cv2.erode(threshold, el)
@ -115,12 +123,14 @@ def find_lines(
    try:
        _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
            cv2.CHAIN_APPROX_SIMPLE
        )
    except ValueError:
        # for opencv backward compatibility
        contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
            cv2.CHAIN_APPROX_SIMPLE
        )
    for c in contours:
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
    tables = {}
    for c in contours:
        x, y, w, h = c
-        roi = joints[y : y + h, x : x + w]
+        roi = joints[y:y + h, x:x + w]
        try:
            __, jc, __ = cv2.findContours(
                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
--- a/camelot/io.py
+++ b/camelot/io.py
@ -99,7 +99,7 @@ def read_pdf(
    """
    layout_kwargs = layout_kwargs or {}
-    if flavor not in ["lattice", "stream", "network"]:
+    if flavor not in ["lattice", "stream", "network", "hybrid"]:
        raise NotImplementedError(
            "Unknown flavor specified."
            " Use either 'lattice', 'stream', or 'network'"
--- a/camelot/parsers/init.py
+++ b/camelot/parsers/init.py
@ -3,3 +3,4 @@
 from .stream import Stream
 from .lattice import Lattice
 from .network import Network
 from .hybrid import Hybrid
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -34,8 +34,9 @@ class BaseParser():
        self.id = parser_id
        self.table_regions = table_regions
        self.table_areas = table_areas
-        self.table_bbox = {}
+        self.table_bbox_parses = {}
        self.columns = None
        self.copy_text = copy_text
        self.split_text = split_text
        self.strip_text = strip_text
@ -47,10 +48,18 @@ class BaseParser():
        self.t_bbox = None
        # For plotting details of parsing algorithms
        self.resolution = 300  # default plotting resolution of the PDF.
        self.parse_details = {}
        if not debug:
            self.parse_details = None
    def table_bboxes(self):
        return sorted(
            self.table_bbox_parses.keys(),
            key=lambda x: x[1],
            reverse=True
        )
    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        self.filename = filename
@ -142,6 +151,7 @@ class BaseParser():
        table = Table(cols, rows)
        table.page = self.page
        table.order = table_idx + 1
        table._bbox = self.table_bboxes()[table_idx]
        return table
    @staticmethod
@ -177,7 +187,7 @@ class BaseParser():
                        table.cells[r_idx][c_idx].text = text
        return pos_errors
-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
        # Pure virtual, must be defined by the derived parser
        raise NotImplementedError()
@ -199,20 +209,23 @@ class BaseParser():
        _tables = []
        # sort tables based on y-coord
-        for table_idx, bbox in enumerate(
+        for table_idx, bbox in enumerate(self.table_bboxes()):
-                sorted(
+            if self.columns is not None and self.columns[table_idx] != "":
-                        self.table_bbox.keys(),
+                # user has to input boundary columns too
-                        key=lambda x: x[1],
+                # take (0, pdf_width) by default
-                        reverse=True
+                # similar to else condition
-                    )
+                # len can't be 1
-                ):
+                user_cols = self.columns[table_idx].split(",")
                user_cols = [float(c) for c in user_cols]
            else:
                user_cols = None
            cols, rows, v_s, h_s = self._generate_columns_and_rows(
                bbox,
-                table_idx
+                user_cols
            )
            table = self._generate_table(
                table_idx, cols, rows, v_s=v_s, h_s=h_s)
            table._bbox = bbox
            _tables.append(table)
        return _tables
@ -222,6 +235,7 @@ class BaseParser():
        """
        table.flavor = self.id
        table.filename = self.filename
        table.parse = self.table_bbox_parses[table._bbox]
        table.parse_details = self.parse_details
        pos_errors = self.compute_parse_errors(table)
        table.accuracy = compute_accuracy([[100, pos_errors]])
@ -453,17 +467,16 @@ class TextBaseParser(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")
    def record_parse_metadata(self, table):
        """Record data about the origin of the table
        """
        super().record_parse_metadata(table)
        # for plotting
        table._bbox = self.table_bbox
        table._segments = None
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        self.record_parse_metadata(table)
        return table
    def record_parse_metadata(self, table):
        """Record data about the origin of the table
        """
        super().record_parse_metadata(table)
        # for plotting
        table._segments = None
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -0,0 +1,221 @@
 # -*- coding: utf-8 -*-
 from ..utils import (
    bboxes_overlap,
    boundaries_to_split_lines,
 )
 from .base import BaseParser
 from .network import Network
 from .lattice import Lattice
 class Hybrid(BaseParser):
    """Defines a hybrid parser, leveraging both network and lattice parsers.
    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: None)
        Tolerance parameter for extending textedges vertically.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.
    """
    def __init__(
            self,
            table_regions=None,
            table_areas=None,
            columns=None,
            flag_size=False,
            split_text=False,
            strip_text="",
            edge_tol=None,
            row_tol=2,
            column_tol=0,
            debug=False,
            **kwargs):
        """Create the hybrid parser and the two sub-parsers it relies on.

        The region/area/text options and ``debug`` are given to the base
        class and to both sub-parsers.  The tolerances are given to both
        sub-parsers, while ``columns`` is only given to the network parser.
        Any extra keyword argument is accepted and not used.
        """
        # Options common to the base class and to both sub-parsers.
        text_options = {
            "table_regions": table_regions,
            "table_areas": table_areas,
            "flag_size": flag_size,
            "split_text": split_text,
            "strip_text": strip_text,
        }
        # Options only understood by the sub-parsers.
        tolerances = {
            "edge_tol": edge_tol,
            "row_tol": row_tol,
            "column_tol": column_tol,
        }
        super().__init__("hybrid", debug=debug, **text_options)
        self.network_parser = Network(
            columns=columns, debug=debug, **text_options, **tolerances)
        self.lattice_parser = Lattice(
            debug=debug, **text_options, **tolerances)
    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        """Forward the page parse preparation to every parser involved.

        The same arguments are passed, in this order, to the base class
        implementation, to the network parser and to the lattice parser.
        """
        page_args = (filename, layout, dimensions, page_idx, layout_kwargs)
        super().prepare_page_parse(*page_args)
        for sub_parser in (self.network_parser, self.lattice_parser):
            sub_parser.prepare_page_parse(*page_args)
    def _generate_columns_and_rows(self, bbox, table_idx):
        parser = self.table_bbox_parses[bbox]
        return parser._generate_columns_and_rows(bbox, table_idx)
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        bbox = self.table_bboxes()[table_idx]
        parser = self.table_bbox_parses[bbox]
        return parser._generate_table(table_idx, cols, rows, **kwargs)
    @staticmethod
    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
        """ Augment existing boundaries using provided hard splits.
        Boundaries:   |---|    |-| |---------|
        Splits:     |       |     |       |
        Augmented:  |-------|-----|-------|--|
        """
        idx_boundaries = len(boundaries) - 1
        idx_splits = len(splits) - 1
        previous_boundary = None
        while True:
            if idx_splits < 0:
                # No more splits to incorporate, we're done
                break
            split = splits[idx_splits]
            if idx_boundaries < 0:
                # Need to insert remaining splits
                new_boundary = [split, boundaries[0][0]]
                boundaries.insert(0, new_boundary)
                idx_splits = idx_splits - 1
            else:
                boundary = \
                    boundaries[idx_boundaries]
                if boundary[1] < \
                        split + tolerance:
                    # The lattice column is further to the right of our
                    # col boundary.  We move our left boundary to match.
                    boundary[1] = split
                    # And if there was another segment after, we make its
                    # right boundary match as well so that there's no gap
                    if previous_boundary is not None:
                        previous_boundary[0] = split
                    idx_splits = idx_splits - 1
                elif boundary[0] > \
                        split - tolerance:
                    # Our boundary is fully after the split, move on
                    idx_boundaries = idx_boundaries - 1
                    previous_boundary = boundary
                else:
                    # The split is inside our boundary: split it
                    new_boundary = [split, boundary[1]]
                    boundaries.insert(idx_boundaries + 1, new_boundary)
                    boundary[1] = split
                    previous_boundary = new_boundary
                    idx_splits = idx_splits - 1
        return boundaries
    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
        """ Identify splits that were only detected by lattice or by network
        """
        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
        lattice_cols, lattice_rows = \
            lattice_parse["col_anchors"], lattice_parse["row_anchors"]
        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
        network_cols_boundaries = network_bbox_data["cols_boundaries"]
        # Favor hybrid, but complete or adjust its columns based on the
        # splits identified by lattice.
        if network_cols_boundaries is None:
            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
        else:
            network_cols_boundaries = self._augment_boundaries_with_splits(
                network_cols_boundaries, lattice_cols)  # self.column_tol???
            augmented_bbox = (
                network_cols_boundaries[0][0], network_bbox[1],
                network_cols_boundaries[-1][1], network_bbox[3],
            )
            network_bbox_data["cols_anchors"] = \
                boundaries_to_split_lines(network_cols_boundaries)
            del self.network_parser.table_bbox_parses[network_bbox]
            self.network_parser.table_bbox_parses[augmented_bbox] = \
                network_bbox_data
            self.table_bbox_parses[augmented_bbox] = self.network_parser
    def _generate_table_bbox(self):
        # Collect bboxes from both parsers
        self.lattice_parser._generate_table_bbox()
        _lattice_bboxes = sorted(
                self.lattice_parser.table_bbox_parses,
                key=lambda bbox: (bbox[0], -bbox[1]))
        self.network_parser._generate_table_bbox()
        _network_bboxes = sorted(
                self.network_parser.table_bbox_parses,
                key=lambda bbox: (bbox[0], -bbox[1]))
        # Merge the data from both processes
        for lattice_bbox in _lattice_bboxes:
            merged = False
            for idx in range(len(_network_bboxes)-1, -1, -1):
                network_bbox = _network_bboxes[idx]
                if not bboxes_overlap(lattice_bbox, network_bbox):
                    continue
                self._merge_bbox_analysis(lattice_bbox, network_bbox)
                # network_bbox_data["cols_boundaries"]
                del _network_bboxes[idx]
                merged = True
            if not merged:
                self.table_bbox_parses[lattice_bbox] = self.lattice_parser
        # Add the bboxes from network that haven't been merged
        for network_bbox in _network_bboxes:
            self.table_bbox_parses[network_bbox] = self.network_parser
    def record_parse_metadata(self, table):
        """Record data about the origin of the table.

        Defers entirely to the parent class implementation.
        """
        super().record_parse_metadata(table)
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -2,8 +2,6 @@
 from __future__ import division
 import os
 import copy
 from .base import BaseParser
 from ..utils import (
@ -173,7 +171,6 @@ class Lattice(BaseParser):
        super().record_parse_metadata(table)
        # for plotting
        table._image = self.pdf_image  # Reuse the image used for calc
        table._bbox_unscaled = self.table_bbox_unscaled
        table._segments = (self.vertical_segments, self.horizontal_segments)
    def _generate_table_bbox(self):
@ -193,7 +190,7 @@ class Lattice(BaseParser):
            os.path.basename(self.filename),
            ".png"
        )
-        export_pdf_as_png(self.filename, self.image_path)
+        export_pdf_as_png(self.filename, self.image_path, self.resolution)
        self.pdf_image, self.threshold = adaptive_threshold(
            self.image_path,
            process_background=self.process_background,
@ -250,17 +247,59 @@ class Lattice(BaseParser):
            areas = scale_areas(self.table_areas)
            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
        self.table_bbox_unscaled = copy.deepcopy(table_bbox)
        [
-            self.table_bbox,
+            self.table_bbox_parses,
            self.vertical_segments,
            self.horizontal_segments
        ] = scale_image(
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers
        )
-    def _generate_columns_and_rows(self, bbox, table_idx):
+        for bbox, parse in self.table_bbox_parses.items():
            joints = parse["joints"]
            # Merge x coordinates that are close together
            line_tol = self.line_tol
            # Sort the joints, make them a list of lists (instead of sets)
            joints_normalized = list(
                map(
                    lambda x: list(x),
                    sorted(joints, key=lambda j: - j[0])
                )
            )
            for idx in range(1, len(joints_normalized)):
                x_left, x_right = \
                    joints_normalized[idx-1][0], joints_normalized[idx][0]
                if x_left - line_tol <= x_right <= x_left + line_tol:
                    joints_normalized[idx][0] = x_left
            # Merge y coordinates that are close together
            joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
            for idx in range(1, len(joints_normalized)):
                y_bottom, y_top = \
                    joints_normalized[idx-1][1], joints_normalized[idx][1]
                if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
                    joints_normalized[idx][1] = y_bottom
            # FRHTODO: check this is useful, otherwise get rid of the code
            # above
            parse["joints_normalized"] = joints_normalized
            cols = list(map(lambda coords: coords[0], joints))
            cols.extend([bbox[0], bbox[2]])
            rows = list(map(lambda coords: coords[1], joints))
            rows.extend([bbox[1], bbox[3]])
            # sort horizontal and vertical segments
            cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
            rows = merge_close_lines(
                sorted(rows, reverse=True),
                line_tol=self.line_tol
            )
            parse["col_anchors"] = cols
            parse["row_anchors"] = rows
    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        v_s, h_s = segments_in_bbox(
            bbox, self.vertical_segments, self.horizontal_segments
@ -270,21 +309,17 @@ class Lattice(BaseParser):
            self.horizontal_text,
            self.vertical_text
            )
        parse = self.table_bbox_parses[bbox]
        cols, rows = zip(*self.table_bbox[bbox])
        cols, rows = list(cols), list(rows)
        cols.extend([bbox[0], bbox[2]])
        rows.extend([bbox[1], bbox[3]])
        # sort horizontal and vertical segments
        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
        rows = merge_close_lines(
            sorted(rows, reverse=True),
            line_tol=self.line_tol
        )
        # make grid using x and y coord of shortlisted rows and cols
-        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        cols = [
-        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
+            (parse["col_anchors"][i], parse["col_anchors"][i + 1])
-
+            for i in range(0, len(parse["col_anchors"]) - 1)
        ]
        rows = [
            (parse["row_anchors"][i], parse["row_anchors"][i + 1])
            for i in range(0, len(parse["row_anchors"]) - 1)
        ]
        return cols, rows, v_s, h_s
    def _generate_table(self, table_idx, cols, rows, **kwargs):
--- a/camelot/parsers/network.py
+++ b/camelot/parsers/network.py
@ -19,7 +19,8 @@ from ..utils import (
    text_in_bbox,
    textlines_overlapping_bbox,
    bbox_from_textlines,
-    find_columns_coordinates,
+    find_columns_boundaries,
    boundaries_to_split_lines,
    text_in_bbox_per_axis,
 )
@ -438,7 +439,7 @@ class TextNetworks(TextAlignments):
        tls_search_space.remove(most_aligned_tl)
        tls_in_bbox = [most_aligned_tl]
        last_bbox = None
-        last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
+        last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
        while last_bbox != bbox:
            if parse_details_search is not None:
                # Store debug info
@ -479,9 +480,9 @@ class TextNetworks(TextAlignments):
                # of the new row won't reduce the number of columns.
                # This happens when text covers multiple rows - that's only
                # allowed in the header, treated separately.
-                cols_cand = find_columns_coordinates(tls_in_new_box)
+                cols_bounds = find_columns_boundaries(tls_in_new_box)
                if direction in ["bottom", "top"] and \
-                        len(cols_cand) < len(last_cols_cand):
+                        len(cols_bounds) < len(last_cols_bounds):
                    continue
                # We have an expansion candidate: register it, update the
@ -489,7 +490,7 @@ class TextNetworks(TextAlignments):
                # We use bbox_from_textlines instead of cand_bbox in case some
                # overlapping textlines require a large bbox for strict fit.
                bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
-                last_cols_cand = cols_cand
+                last_cols_bounds = cols_bounds
                tls_in_bbox.extend(new_tls)
                for i in range(len(tls_search_space) - 1, -1, -1):
                    textline = tls_search_space[i]
@ -591,7 +592,7 @@ class Network(TextBaseParser):
        textlines = self._apply_regions_filter(all_textlines)
        textlines_processed = {}
-        self.table_bbox = {}
+        self.table_bbox_parses = {}
        if self.parse_details is not None:
            parse_details_network_searches = []
            self.parse_details["network_searches"] = \
@ -641,7 +642,8 @@ class Network(TextBaseParser):
            # Get all the textlines that overlap with the box, compute
            # columns
            tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
-            cols_anchors = find_columns_coordinates(tls_in_bbox)
+            cols_boundaries = find_columns_boundaries(tls_in_bbox)
            cols_anchors = boundaries_to_split_lines(cols_boundaries)
            # Unless the user gave us strict bbox_body, try to find a header
            # above the body to build the full bbox.
@ -662,10 +664,11 @@ class Network(TextBaseParser):
            table_parse = {
                "bbox_body": bbox_body,
                "cols_boundaries": cols_boundaries,
                "cols_anchors": cols_anchors,
                "bbox_full": bbox_full
            }
-            self.table_bbox[bbox_full] = table_parse
+            self.table_bbox_parses[bbox_full] = table_parse
            if self.parse_details is not None:
                self.parse_details["col_searches"].append(table_parse)
@ -678,7 +681,7 @@ class Network(TextBaseParser):
                textlines
            ))
-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
@ -706,18 +709,14 @@ class Network(TextBaseParser):
        rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
-        if self.columns is not None and self.columns[table_idx] != "":
+        if user_cols is not None:
-            # user has to input boundary columns too
+            cols = [text_x_min] + user_cols + [text_x_max]
-            # take (0, pdf_width) by default
+            cols = [
-            # similar to else condition
+                (cols[i], cols[i + 1])
-            # len can't be 1
+                for i in range(0, len(cols) - 1)
-            cols = self.columns[table_idx].split(",")
+            ]
            cols = [float(c) for c in cols]
            cols.insert(0, text_x_min)
            cols.append(text_x_max)
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        else:
-            parse_details = self.table_bbox[bbox]
+            parse_details = self.table_bbox_parses[bbox]
            col_anchors = parse_details["cols_anchors"]
            cols = list(map(
                lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -122,14 +122,14 @@ class Stream(TextBaseParser):
                        self.horizontal_text)
                    hor_text.extend(region_text)
            # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(hor_text)
+            table_bbox_parses = self._nurminen_table_detection(hor_text)
        else:
-            table_bbox = {}
+            table_bbox_parses = {}
            for area_str in self.table_areas:
-                table_bbox[bbox_from_str(area_str)] = None
+                table_bbox_parses[bbox_from_str(area_str)] = None
-        self.table_bbox = table_bbox
+        self.table_bbox_parses = table_bbox_parses
-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
@ -140,26 +140,18 @@ class Stream(TextBaseParser):
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
        )
-        # FRHTODO:
+
        # This algorithm takes the horizontal textlines in the bbox, and groups
        # them into rows based on their bottom y0.
        # That's wrong: it misses the vertical items, and misses out on all
        # the alignment identification work we've done earlier.
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        elements = [len(r) for r in rows_grouped]
-        if self.columns is not None and self.columns[table_idx] != "":
+        if user_cols is not None:
-            # user has to input boundary columns too
+            cols = [text_x_min] + user_cols + [text_x_max]
-            # take (0, pdf_width) by default
+            cols = [
-            # similar to else condition
+                (cols[i], cols[i + 1])
-            # len can't be 1
+                for i in range(0, len(cols) - 1)
-            cols = self.columns[table_idx].split(",")
+            ]
            cols = [float(c) for c in cols]
            cols.insert(0, text_x_min)
            cols.append(text_x_max)
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        else:
            # calculate mode of the list of number of elements in
            # each row to guess the number of columns
@ -175,8 +167,8 @@ class Stream(TextBaseParser):
                    ncols = max(set(elements), key=elements.count)
                else:
                    warnings.warn(
-                        "No tables found in table area {}"
+                        "No tables found in table area {bbox}".format(
-                        .format(table_idx + 1)
+                            bbox=bbox)
                    )
            cols = [
                (t.x0, t.x1)
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -74,7 +74,7 @@ def draw_labeled_bbox(
    )
-def draw_pdf(table, ax, to_pdf_scale=True):
+def draw_pdf(table, ax):
    """Draw the content of the table's source pdf into the passed subplot
    Parameters
@ -83,14 +83,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
    ax : matplotlib.axes.Axes (optional)
    to_pdf_scale : bool (optional)
    """
    img = table.get_pdf_image()
    if to_pdf_scale:
    ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
    else:
        ax.imshow(img)
 def draw_parse_constraints(table, ax):
@ -132,8 +127,6 @@ def draw_text(table, ax):
    table : camelot.core.Table
    ax : matplotlib.axes.Axes (optional)
    ax : matplotlib.axes.Axes
    """
    bbox = bbox_from_textlines(table.textlines)
    for t in table.textlines:
@ -150,18 +143,14 @@ def draw_text(table, ax):
    extend_axe_lim(ax, bbox)
-def prepare_plot(table, ax=None, to_pdf_scale=True):
+def prepare_plot(table, ax=None):
    """Initialize plot and draw common components
    Parameters
    ----------
    table : camelot.core.Table
    ax : matplotlib.axes.Axes (optional)
    to_pdf_scale :
    ax : matplotlib.axes.Axes
    to_pdf_scale : bool (optional)
    Returns
    -------
@ -170,7 +159,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
-    draw_pdf(table, ax, to_pdf_scale)
+    draw_pdf(table, ax)
    draw_parse_constraints(table, ax)
    return ax
@ -186,7 +175,8 @@ class PlotMethods():
        table: camelot.core.Table
            A Camelot Table.
        kind : str, optional (default: 'text')
-            {'text', 'grid', 'contour', 'joint', 'line'}
+            {'text', 'grid', 'contour', 'joint', 'line',
                'network_table_search'}
            The element type for which a plot should be generated.
        filepath: str, optional (default: None)
            Absolute path for saving the generated plot.
@ -203,9 +193,12 @@ class PlotMethods():
            raise NotImplementedError(
                "Lattice flavor does not support kind='{}'".format(kind)
            )
-        if table.flavor in ["stream", "network"] and kind in ["line"]:
+        if table.flavor != "lattice" and kind in ["line"]:
            raise NotImplementedError(
-                "Stream flavor does not support kind='{}'".format(kind)
+                "{flavor} flavor does not support kind='{kind}'".format(
                    flavor=table.flavor,
                    kind=kind
                )
            )
        plot_method = getattr(self, kind)
@ -274,25 +267,21 @@ class PlotMethods():
        """
        _FOR_LATTICE = table.flavor == "lattice"
-        ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
+        ax = prepare_plot(table, ax)
        if _FOR_LATTICE:
            table_bbox = table._bbox_unscaled
        else:
            table_bbox = {table._bbox: None}
        if not _FOR_LATTICE:
            draw_text(table, ax)
        for t in table_bbox.keys():
        ax.add_patch(
            patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
+                (table._bbox[0], table._bbox[1]),
                table._bbox[2] - table._bbox[0],
                table._bbox[3] - table._bbox[1],
                fill=False, color="red"
            )
        )
        if not _FOR_LATTICE:
-                extend_axe_lim(ax, t)
+            extend_axe_lim(ax, table._bbox)
        return ax.get_figure()
@ -393,12 +382,10 @@ class PlotMethods():
        fig : matplotlib.fig.Figure
        """
-        ax = prepare_plot(table, ax, to_pdf_scale=False)
+        ax = prepare_plot(table, ax)
        table_bbox = table._bbox_unscaled
        x_coord = []
        y_coord = []
-        for k in table_bbox.keys():
+        for coord in table.parse["joints"]:
            for coord in table_bbox[k]:
            x_coord.append(coord[0])
            y_coord.append(coord[1])
        ax.plot(x_coord, y_coord, "ro")
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -297,8 +297,9 @@ def scale_image(tables, v_segments, h_segments, factors):
        j_x, j_y = zip(*tables[k])
        j_x = [scale(j, scaling_factor_x) for j in j_x]
        j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
-        joints = zip(j_x, j_y)
+        tables_new[(x1, y1, x2, y2)] = {
-        tables_new[(x1, y1, x2, y2)] = joints
+            "joints": list(zip(j_x, j_y))
        }
    v_segments_new = []
    for v in v_segments:
@ -434,6 +435,16 @@ def bbox_from_str(bbox_str):
    )
 def bboxes_overlap(bbox1, bbox2):
    """Tell whether two bounding boxes overlap.

    Two boxes overlap when their projections intersect on both the x axis
    and the y axis.  This covers partial overlaps, one box fully containing
    the other, identical boxes and boxes crossing each other.  Boxes that
    only share an edge or a corner are not considered overlapping.

    Parameters
    ----------
    bbox1 : tuple
        (left, bottom, right, top) coordinates of the first box.
    bbox2 : tuple
        (left, bottom, right, top) coordinates of the second box.

    Returns
    -------
    overlap : bool
        True if the interiors of the two boxes intersect.

    """
    (left1, bottom1, right1, top1) = bbox1
    (left2, bottom2, right2, top2) = bbox2
    # Checking only whether an edge of bbox2 lies inside bbox1 misses the
    # cases where bbox2 contains bbox1 or where the boxes cross, so compare
    # the intervals on each axis instead.
    # Assumes left <= right and bottom <= top for both boxes.
    return (
        left1 < right2 and left2 < right1
        and bottom1 < top2 and bottom2 < top1
    )
 def textlines_overlapping_bbox(bbox, textlines):
    """Returns all text objects which overlap or are within a bounding box.
@ -451,12 +462,10 @@ def textlines_overlapping_bbox(bbox, textlines):
        List of PDFMiner text objects.
    """
    (left, bottom, right, top) = bbox
    t_bbox = [
        t
        for t in textlines
-        if ((left < t.x0 < right) or (left < t.x1 < right))
+        if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
        and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
    ]
    return t_bbox
@ -560,27 +569,25 @@ def bbox_from_textlines(textlines):
    return bbox
-def find_columns_coordinates(tls, min_gap=1.0):
+def find_columns_boundaries(tls, min_gap=1.0):
-    """Given a list of text objects, guess columns boundaries and returns a
+    """Make a list of disjunct cols boundaries for a list of text objects
    list of x-coordinates for split points between columns.
    Parameters
    ----------
    tls : list of PDFMiner text object.
-    min_gap : minimum distance between columns. Any elements closer than this
+    min_gap : minimum distance between columns. Any elements closer than
-        threshold are merged together.  This is to prevent spaces between words
+        this threshold are merged together.  This is to prevent spaces between
-        to be misinterpreted as column boundaries.
+        words to be misinterpreted as boundaries.
    Returns
    -------
-    cols_anchors : list
+    boundaries : list
-        List of x-coordinates for columns.
+        List x-coordinates for cols.
         [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
    """
    # Make a list of disjunct cols boundaries across the textlines
    # that comprise the table.
    # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
    cols_bounds = []
    tls.sort(key=lambda tl: tl.x0)
    for tl in tls:
@ -588,18 +595,64 @@ def find_columns_coordinates(tls, min_gap=1.0):
            cols_bounds.append([tl.x0, tl.x1])
        else:
            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
    return cols_bounds
 def find_rows_boundaries(tls, min_gap=1.0):
    """Group text objects into disjunct vertical bands (rows).

    The text objects are sorted in place by their bottom coordinate, then
    swept bottom-up: an object starts a new row when it sits more than
    ``min_gap`` above the top of the current row, otherwise it is merged
    into the current row.

    Parameters
    ----------
    tls : list of PDFMiner text object.
        Sorted in place by ``y0`` as a side effect.
    min_gap : minimum distance between rows. Any elements closer than
        this threshold are merged together.

    Returns
    -------
    boundaries : list
        List y-coordinates for rows.
         [(1st row bottom, 1st row top), (2nd row bottom, 2nd row top), ...]

    """
    tls.sort(key=lambda textline: textline.y0)
    bands = []
    for textline in tls:
        if bands:
            current = bands[-1]
            if not (current[1] + min_gap < textline.y0):
                # Close enough to the current row: extend its top if needed.
                current[1] = max(current[1], textline.y1)
                continue
        # First element, or a gap wide enough to open a new row.
        bands.append([textline.y0, textline.y1])
    return bands
 def boundaries_to_split_lines(boundaries):
    """Find split lines given a list of boundaries between rows or cols.
    Boundaries:     [ a ]         [b]     [   c   ]  [d]
    Splits:         |        |         |            |  |
    Parameters
    ----------
    boundaries : list
        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
        These are the (left, right most) or (bottom, top most) coordinates.
    Returns
    -------
    anchors : list
        List of coordinates representing the split points, each half way
        between boundaries
    """
    # From the row boundaries, identify splits by getting the mid points
    # between the boundaries.
-    # Row boundaries: [ a ]        [b]    [   c   ]
+    anchors = list(map(
-    # Splits:         |        |        |         |
+        lambda idx: (boundaries[idx-1][1] + boundaries[idx][0]) / 2.0,
-    cols_anchors = list(map(
+        range(1, len(boundaries))
        lambda idx: (cols_bounds[idx-1][1] + cols_bounds[idx][0]) / 2.0,
        range(1, len(cols_bounds))
    ))
-    cols_anchors.insert(0, cols_bounds[0][0])
+    anchors.insert(0, boundaries[0][0])
-    cols_anchors.append(cols_bounds[-1][1])
+    anchors.append(boundaries[-1][1])
-    return cols_anchors
+    return anchors
 def get_index_closest_point(point, sorted_list, fn=lambda x: x):
@ -1129,17 +1182,20 @@ def get_text_objects(layout, ltype="char", t=None):
    return t
-def export_pdf_as_png(pdf_path, destination_path):
+def export_pdf_as_png(pdf_path, destination_path, resolution=300):
    """Generate an image from a pdf.
    Parameters
    ----------
    pdf_path : str
    destination_path : str
    resolution : int
    """
-    gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
+    gs_call = "-q -sDEVICE=png16m -o " \
        "{destination_path} -r{resolution} {pdf_path}" \
        .format(
            destination_path=destination_path,
            resolution=resolution,
            pdf_path=pdf_path
        )
    gs_call = gs_call.encode().split()
--- a/parser-comparison-notebook.ipynb
+++ b/parser-comparison-notebook.ipynb
--- a/tests/data.py
+++ b/tests/data.py
@ -2074,6 +2074,502 @@ data_network_vertical_headers = [
    ],
 ]
 # Expected output of the hybrid parser for vertical_header.pdf.
 # Compared to the network parser, hybrid detects additional sparse columns.
 data_hybrid_vertical_headers = [
    [
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "Congress-",
        "",
        "",
        "Senator 36th",
        "",
        "Rep106th",
        "",
        "Reg. of",
        "",
        "Road",
        "",
        "",
        "Distri",
        "Dist",
        "",
        "",
        "Dist",
    ],
    [
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "1st Dist",
        "Dist.",
        "",
        "",
        "Dist.",
        "Deeds",
        "",
        "Commission",
        "",
        "District #1",
        "",
        "ct #2",
        "#3",
        "Dist #4",
        "",
        "#5",
    ],
    [
        "",
        "",
        "",
        "",
        "",
        "Governor",
        "",
        "",
        "U.S. Senator",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
    ],
    [
        "",
        "Number of Registered voters",
        "Poll Book Totals",
        "Brian Calley",
        "Patrick Colbeck",
        "Jim Hines",
        "Bill Schuette",
        "John James",
        "Sandy Pensler",
        "",
        "Jack Bergman",
        "",
        "Jim Stamas",
        "",
        "Sue Allor",
        "",
        "Melissa A. Cordes",
        "",
        "Al Scully",
        "",
        "Daniel G. Gauthier",
        "Craig M. Clemens",
        "Craig Johnston",
        "Carolyn Brummund",
        "Adam Brege",
        "David Bielusiak",
        "",
    ],
    [
        "Alcona",
        "963",
        "439",
        "55",
        "26",
        "47",
        "164",
        "173",
        "111",
        "",
        "268",
        "",
        "272",
        "",
        "275",
        "",
        "269",
        "",
        "271",
        "",
        "224",
        "76",
        "",
        "",
        "",
        "",
        "",
    ],
    [
        "Caledonia",
        "923",
        "393",
        "40",
        "23",
        "45",
        "158",
        "150",
        "103",
        "",
        "244",
        "",
        "247",
        "",
        "254",
        "",
        "255",
        "",
        "244",
        "",
        "139",
        "143",
        "",
        "",
        "",
        "",
        "",
    ],
    [
        "Curtis",
        "1026",
        "349",
        "30",
        "30",
        "25",
        "102",
        "95",
        "84",
        "",
        "159",
        "",
        "164",
        "",
        "162",
        "",
        "161",
        "",
        "157",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
    ],
    [
        "Greenbush",
        "1212",
        "423",
        "56",
        "26",
        "40",
        "126",
        "104",
        "131",
        "",
        "208",
        "",
        "213",
        "",
        "214",
        "",
        "215",
        "",
        "208",
        "",
        "",
        "",
        "",
        "208",
        "",
        "",
        "",
    ],
    [
        "Gustin",
        "611",
        "180",
        "22",
        "35",
        "17",
        "55",
        "73",
        "45",
        "",
        "108",
        "",
        "104",
        "",
        "111",
        "",
        "111",
        "",
        "109",
        "",
        "",
        "",
        "",
        "",
        "81",
        "42",
        "",
    ],
    [
        "Harrisville",
        "1142",
        "430",
        "45",
        "90",
        "29",
        "101",
        "155",
        "94",
        "",
        "226",
        "",
        "226",
        "",
        "232",
        "",
        "244",
        "",
        "226",
        "",
        "",
        "",
        "232",
        "",
        "",
        "",
        "",
    ],
    [
        "Hawes",
        "884",
        "293",
        "38",
        "36",
        "27",
        "109",
        "121",
        "84",
        "",
        "192",
        "",
        "195",
        "",
        "195",
        "",
        "193",
        "",
        "184",
        "",
        "",
        "",
        "",
        "",
        "118",
        "87",
        "",
    ],
    [
        "Haynes",
        "626",
        "275",
        "31",
        "20",
        "32",
        "104",
        "121",
        "53",
        "",
        "163",
        "",
        "163",
        "",
        "173",
        "",
        "161",
        "",
        "152",
        "",
        "",
        "",
        "76",
        "",
        "69",
        "31",
        "",
    ],
    [
        "Mikado",
        "781",
        "208",
        "19",
        "39",
        "17",
        "81",
        "90",
        "63",
        "",
        "149",
        "",
        "149",
        "",
        "145",
        "",
        "147",
        "",
        "143",
        "",
        "",
        "",
        "",
        "113",
        "",
        "",
        "",
    ],
    [
        "Millen",
        "353",
        "139",
        "7",
        "16",
        "13",
        "38",
        "49",
        "19",
        "",
        "62",
        "",
        "66",
        "",
        "67",
        "",
        "66",
        "",
        "62",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
    ],
    [
        "Mitchell",
        "327",
        "96",
        "12",
        "17",
        "7",
        "29",
        "41",
        "17",
        "",
        "57",
        "",
        "55",
        "",
        "57",
        "",
        "60",
        "",
        "56",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
        "",
    ],
    [
        "City Harrisville",
        "389",
        "171",
        "16",
        "15",
        "18",
        "35",
        "49",
        "31",
        "",
        "78",
        "",
        "80",
        "",
        "82",
        "",
        "81",
        "",
        "77",
        "",
        "",
        "",
        "73",
        "",
        "",
        "",
        "",
    ],
    [
        "Totals",
        "9237",
        "3396",
        "371",
        "373",
        "317",
        "1102",
        "1221",
        "835",
        "0",
        "1914",
        "0",
        "1934",
        "",
        "1967",
        "",
        "1963",
        "0",
        "1889",
        "0",
        "363",
        "219",
        "381",
        "321",
        "268",
        "160",
        "0",
    ],
 ]
 data_stream_table_areas = [
--- a/tests/files/baseline_plots/test_joint_plot.png
+++ b/tests/files/baseline_plots/test_joint_plot.png
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -285,6 +285,19 @@ def test_network_layout_kwargs():
    assert_frame_equal(df, tables[0].df)
 # Hybrid parser
 def test_hybrid_vertical_header():
    """Parse a complex table with a vertical text header using hybrid."""
    pdf_path = os.path.join(testdir, "vertical_header.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")
    # The whole page is expected to be detected as one single table.
    assert len(parsed) == 1
    expected = pd.DataFrame(data_hybrid_vertical_headers)
    assert_frame_equal(expected, parsed[0].df)
 # Lattice parser tests
 def test_lattice():
    df = pd.DataFrame(data_lattice)