WIP: Introduce actual hybrid parser

Create a hybrid parser that leverages both lattice and network techniques. Simplify plotting of the PDF in the lattice case. Rename "parser.table_bbox" to "parser.table_bbox_parses", since it represents not a bbox but a dict mapping each bbox to its corresponding parsing data. Still missing: more unit tests, plotting of steps.
2020-05-04 16:27:01 -07:00 · 2020-05-04 16:27:01 -07:00 · 4a761611bf
parent edad1efd1b
commit 4a761611bf
17 changed files with 1011 additions and 217 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -396,7 +396,8 @@ def network(c, *args, **kwargs):
                "Please specify output file format using --format")

    tables = read_pdf(
-        filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="network",
+        suppress_stdout=quiet, **kwargs
    )
    click.echo("Found {} tables".format(tables.n))
    if plot_type is not None:
--- a/camelot/core.py
+++ b/camelot/core.py
@ -454,7 +454,9 @@ class Table():
        self.page = None
        self.flavor = None         # Flavor of the parser used
        self.pdf_size = None       # Dimensions of the original PDF page
-        self.parse_details = None  # Field holding debug data
+        self._bbox = None          # Bounding box in original document
+        self.parse = None          # Parse information
+        self.parse_details = None  # Field holding extra debug data

        self._image = None
        self._image_path = None  # Temporary file to hold an image of the pdf
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -7,7 +7,7 @@ import logging
 from PyPDF2 import PdfFileReader, PdfFileWriter

 from .core import TableList
-from .parsers import Stream, Lattice, Network
+from .parsers import Stream, Lattice, Network, Hybrid
 from .utils import (
    build_file_path_in_temp_dir,
    get_page_layout,
@ -23,6 +23,7 @@ PARSERS = {
    "lattice": Lattice,
    "stream": Stream,
    "network": Network,
+    "hybrid": Hybrid,
 }


@ -177,7 +178,8 @@ class PDFHandler():
        Parameters
        ----------
        flavor : str (default: 'lattice')
-            The parsing method to use ('lattice', 'stream', or 'network').
+            The parsing method to use ('lattice', 'stream', 'network',
+            or 'hybrid').
            Lattice is used by default.
        suppress_stdout : str (default: False)
            Suppress logs and warnings.
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -6,7 +6,9 @@ import cv2
 import numpy as np


-def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
+def adaptive_threshold(
+        imagename, process_background=False,
+        blocksize=15, c=-2):
    """Thresholds an image using OpenCV's adaptiveThreshold.

    Parameters
@ -19,12 +21,12 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
    c : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa

    Returns
    -------
@ -39,7 +41,10 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):

    if process_background:
        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
+            gray,
+            255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, blocksize, c
        )
    else:
        threshold = cv2.adaptiveThreshold(
@ -54,7 +59,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):


 def find_lines(
-    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None,
+    direction="horizontal", line_scale=15, iterations=0
 ):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.
@ -78,7 +84,7 @@ def find_lines(
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa

    Returns
    -------
@ -100,13 +106,15 @@ def find_lines(
        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
-        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
+        raise ValueError(
+            "Specify direction as either 'vertical' or 'horizontal'"
+        )

    if regions is not None:
        region_mask = np.zeros(threshold.shape)
        for region in regions:
            x, y, w, h = region
-            region_mask[y : y + h, x : x + w] = 1
+            region_mask[y:y + h, x:x + w] = 1
        threshold = np.multiply(threshold, region_mask)

    threshold = cv2.erode(threshold, el)
@ -115,12 +123,14 @@ def find_lines(

    try:
        _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE
        )
    except ValueError:
        # for opencv backward compatibility
        contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE
        )

    for c in contours:
@ -202,7 +212,7 @@ def find_joints(contours, vertical, horizontal):
    tables = {}
    for c in contours:
        x, y, w, h = c
-        roi = joints[y : y + h, x : x + w]
+        roi = joints[y:y + h, x:x + w]
        try:
            __, jc, __ = cv2.findContours(
                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
--- a/camelot/io.py
+++ b/camelot/io.py
@ -99,7 +99,7 @@ def read_pdf(

    """
    layout_kwargs = layout_kwargs or {}
-    if flavor not in ["lattice", "stream", "network"]:
+    if flavor not in ["lattice", "stream", "network", "hybrid"]:
        raise NotImplementedError(
            "Unknown flavor specified."
            " Use either 'lattice', 'stream', or 'network'"
--- a/camelot/parsers/init.py
+++ b/camelot/parsers/init.py
@ -3,3 +3,4 @@
 from .stream import Stream
 from .lattice import Lattice
 from .network import Network
+from .hybrid import Hybrid
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -34,8 +34,9 @@ class BaseParser():
        self.id = parser_id
        self.table_regions = table_regions
        self.table_areas = table_areas
-        self.table_bbox = {}
+        self.table_bbox_parses = {}

+        self.columns = None
        self.copy_text = copy_text
        self.split_text = split_text
        self.strip_text = strip_text
@ -47,10 +48,18 @@ class BaseParser():
        self.t_bbox = None

        # For plotting details of parsing algorithms
+        self.resolution = 300  # default plotting resolution of the PDF.
        self.parse_details = {}
        if not debug:
            self.parse_details = None

+    def table_bboxes(self):
+        return sorted(
+            self.table_bbox_parses.keys(),
+            key=lambda x: x[1],
+            reverse=True
+        )
+
    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
        self.filename = filename
@ -142,6 +151,7 @@ class BaseParser():
        table = Table(cols, rows)
        table.page = self.page
        table.order = table_idx + 1
+        table._bbox = self.table_bboxes()[table_idx]
        return table

    @staticmethod
@ -177,7 +187,7 @@ class BaseParser():
                        table.cells[r_idx][c_idx].text = text
        return pos_errors

-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
        # Pure virtual, must be defined by the derived parser
        raise NotImplementedError()

@ -199,20 +209,23 @@ class BaseParser():

        _tables = []
        # sort tables based on y-coord
-        for table_idx, bbox in enumerate(
-                sorted(
-                        self.table_bbox.keys(),
-                        key=lambda x: x[1],
-                        reverse=True
-                    )
-                ):
+        for table_idx, bbox in enumerate(self.table_bboxes()):
+            if self.columns is not None and self.columns[table_idx] != "":
+                # user has to input boundary columns too
+                # take (0, pdf_width) by default
+                # similar to else condition
+                # len can't be 1
+                user_cols = self.columns[table_idx].split(",")
+                user_cols = [float(c) for c in user_cols]
+            else:
+                user_cols = None
+
            cols, rows, v_s, h_s = self._generate_columns_and_rows(
                bbox,
-                table_idx
+                user_cols
            )
            table = self._generate_table(
                table_idx, cols, rows, v_s=v_s, h_s=h_s)
-            table._bbox = bbox
            _tables.append(table)

        return _tables
@ -222,6 +235,7 @@ class BaseParser():
        """
        table.flavor = self.id
        table.filename = self.filename
+        table.parse = self.table_bbox_parses[table._bbox]
        table.parse_details = self.parse_details
        pos_errors = self.compute_parse_errors(table)
        table.accuracy = compute_accuracy([[100, pos_errors]])
@ -453,17 +467,16 @@ class TextBaseParser(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

-    def record_parse_metadata(self, table):
-        """Record data about the origin of the table
-        """
-        super().record_parse_metadata(table)
-        # for plotting
-        table._bbox = self.table_bbox
-        table._segments = None
-
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        self.record_parse_metadata(table)

        return table
+
+    def record_parse_metadata(self, table):
+        """Record data about the origin of the table
+        """
+        super().record_parse_metadata(table)
+        # for plotting
+        table._segments = None
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+from ..utils import (
+    bboxes_overlap,
+    boundaries_to_split_lines,
+)
+
+from .base import BaseParser
+from .network import Network
+from .lattice import Lattice
+
+
+class Hybrid(BaseParser):
+    """Defines a hybrid parser, leveraging both network and lattice parsers.
+
+    Parameters
+    ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    table_areas : list, optional (default: None)
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    columns : list, optional (default: None)
+        List of column x-coordinates strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Split text that spans across multiple cells.
+    flag_size : bool, optional (default: False)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    edge_tol : int, optional (default: 50)
+        Tolerance parameter for extending textedges vertically.
+    row_tol : int, optional (default: 2)
+        Tolerance parameter used to combine text vertically,
+        to generate rows.
+    column_tol : int, optional (default: 0)
+        Tolerance parameter used to combine text horizontally,
+        to generate columns.
+
+    """
+
+    def __init__(
+            self,
+            table_regions=None,
+            table_areas=None,
+            columns=None,
+            flag_size=False,
+            split_text=False,
+            strip_text="",
+            edge_tol=None,
+            row_tol=2,
+            column_tol=0,
+            debug=False,
+            **kwargs):
+        super().__init__(
+            "hybrid",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            debug=debug,
+        )
+        self.network_parser = Network(
+            table_regions=table_regions,
+            table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
+            debug=debug,
+        )
+        self.lattice_parser = Lattice(
+            table_regions=table_regions,
+            table_areas=table_areas,
+            flag_size=flag_size,
+            split_text=split_text,
+            strip_text=strip_text,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
+            debug=debug,
+        )
+
+    def prepare_page_parse(self, filename, layout, dimensions,
+                           page_idx, layout_kwargs):
+        super().prepare_page_parse(filename, layout, dimensions,
+                                   page_idx, layout_kwargs)
+        self.network_parser.prepare_page_parse(
+            filename, layout, dimensions, page_idx, layout_kwargs)
+        self.lattice_parser.prepare_page_parse(
+            filename, layout, dimensions, page_idx, layout_kwargs)
+
+    def _generate_columns_and_rows(self, bbox, table_idx):
+        parser = self.table_bbox_parses[bbox]
+        return parser._generate_columns_and_rows(bbox, table_idx)
+
+    def _generate_table(self, table_idx, cols, rows, **kwargs):
+        bbox = self.table_bboxes()[table_idx]
+        parser = self.table_bbox_parses[bbox]
+        return parser._generate_table(table_idx, cols, rows, **kwargs)
+
+    @staticmethod
+    def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
+        """ Augment existing boundaries using provided hard splits.
+
+        Boundaries:   |---|    |-| |---------|
+        Splits:     |       |     |       |
+        Augmented:  |-------|-----|-------|--|
+        """
+        idx_boundaries = len(boundaries) - 1
+        idx_splits = len(splits) - 1
+        previous_boundary = None
+        while True:
+            if idx_splits < 0:
+                # No more splits to incorporate, we're done
+                break
+            split = splits[idx_splits]
+
+            if idx_boundaries < 0:
+                # Need to insert remaining splits
+                new_boundary = [split, boundaries[0][0]]
+                boundaries.insert(0, new_boundary)
+                idx_splits = idx_splits - 1
+            else:
+                boundary = \
+                    boundaries[idx_boundaries]
+                if boundary[1] < \
+                        split + tolerance:
+                    # The lattice column is further to the right of our
+                    # col boundary.  We move our right boundary to match.
+                    boundary[1] = split
+                    # And if there was another segment after, we make its
+                    # left boundary match as well so that there's no gap
+                    if previous_boundary is not None:
+                        previous_boundary[0] = split
+                    idx_splits = idx_splits - 1
+                elif boundary[0] > \
+                        split - tolerance:
+                    # Our boundary is fully after the split, move on
+                    idx_boundaries = idx_boundaries - 1
+                    previous_boundary = boundary
+                else:
+                    # The split is inside our boundary: split it
+                    new_boundary = [split, boundary[1]]
+                    boundaries.insert(idx_boundaries + 1, new_boundary)
+                    boundary[1] = split
+                    previous_boundary = new_boundary
+                    idx_splits = idx_splits - 1
+        return boundaries
+
+    def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
+        """ Identify splits that were only detected by lattice or by network
+        """
+        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
+        lattice_cols, lattice_rows = \
+            lattice_parse["col_anchors"], lattice_parse["row_anchors"]
+
+        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
+        network_cols_boundaries = network_bbox_data["cols_boundaries"]
+
+        # Favor network, but complete or adjust its columns based on the
+        # splits identified by lattice.
+        if network_cols_boundaries is None:
+            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
+        else:
+            network_cols_boundaries = self._augment_boundaries_with_splits(
+                network_cols_boundaries, lattice_cols)  # self.column_tol???
+            augmented_bbox = (
+                network_cols_boundaries[0][0], network_bbox[1],
+                network_cols_boundaries[-1][1], network_bbox[3],
+            )
+            network_bbox_data["cols_anchors"] = \
+                boundaries_to_split_lines(network_cols_boundaries)
+
+            del self.network_parser.table_bbox_parses[network_bbox]
+            self.network_parser.table_bbox_parses[augmented_bbox] = \
+                network_bbox_data
+            self.table_bbox_parses[augmented_bbox] = self.network_parser
+
+    def _generate_table_bbox(self):
+        # Collect bboxes from both parsers
+        self.lattice_parser._generate_table_bbox()
+        _lattice_bboxes = sorted(
+                self.lattice_parser.table_bbox_parses,
+                key=lambda bbox: (bbox[0], -bbox[1]))
+        self.network_parser._generate_table_bbox()
+        _network_bboxes = sorted(
+                self.network_parser.table_bbox_parses,
+                key=lambda bbox: (bbox[0], -bbox[1]))
+
+        # Merge the data from both processes
+        for lattice_bbox in _lattice_bboxes:
+            merged = False
+
+            for idx in range(len(_network_bboxes)-1, -1, -1):
+                network_bbox = _network_bboxes[idx]
+                if not bboxes_overlap(lattice_bbox, network_bbox):
+                    continue
+                self._merge_bbox_analysis(lattice_bbox, network_bbox)
+                # network_bbox_data["cols_boundaries"]
+                del _network_bboxes[idx]
+                merged = True
+            if not merged:
+                self.table_bbox_parses[lattice_bbox] = self.lattice_parser
+
+        # Add the bboxes from network that haven't been merged
+        for network_bbox in _network_bboxes:
+            self.table_bbox_parses[network_bbox] = self.network_parser
+
+    def record_parse_metadata(self, table):
+        super().record_parse_metadata(table)
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -2,8 +2,6 @@

 from __future__ import division
 import os
-import copy
-

 from .base import BaseParser
 from ..utils import (
@ -173,7 +171,6 @@ class Lattice(BaseParser):
        super().record_parse_metadata(table)
        # for plotting
        table._image = self.pdf_image  # Reuse the image used for calc
-        table._bbox_unscaled = self.table_bbox_unscaled
        table._segments = (self.vertical_segments, self.horizontal_segments)

    def _generate_table_bbox(self):
@ -193,7 +190,7 @@ class Lattice(BaseParser):
            os.path.basename(self.filename),
            ".png"
        )
-        export_pdf_as_png(self.filename, self.image_path)
+        export_pdf_as_png(self.filename, self.image_path, self.resolution)
        self.pdf_image, self.threshold = adaptive_threshold(
            self.image_path,
            process_background=self.process_background,
@ -250,17 +247,59 @@ class Lattice(BaseParser):
            areas = scale_areas(self.table_areas)
            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

-        self.table_bbox_unscaled = copy.deepcopy(table_bbox)
-
        [
-            self.table_bbox,
+            self.table_bbox_parses,
            self.vertical_segments,
            self.horizontal_segments
        ] = scale_image(
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers
        )

-    def _generate_columns_and_rows(self, bbox, table_idx):
+        for bbox, parse in self.table_bbox_parses.items():
+            joints = parse["joints"]
+
+            # Merge x coordinates that are close together
+            line_tol = self.line_tol
+            # Sort the joints, make them a list of lists (instead of sets)
+            joints_normalized = list(
+                map(
+                    lambda x: list(x),
+                    sorted(joints, key=lambda j: - j[0])
+                )
+            )
+            for idx in range(1, len(joints_normalized)):
+                x_left, x_right = \
+                    joints_normalized[idx-1][0], joints_normalized[idx][0]
+                if x_left - line_tol <= x_right <= x_left + line_tol:
+                    joints_normalized[idx][0] = x_left
+
+            # Merge y coordinates that are close together
+            joints_normalized = sorted(joints_normalized, key=lambda j: -j[1])
+            for idx in range(1, len(joints_normalized)):
+                y_bottom, y_top = \
+                    joints_normalized[idx-1][1], joints_normalized[idx][1]
+                if y_bottom - line_tol <= y_top <= y_bottom + line_tol:
+                    joints_normalized[idx][1] = y_bottom
+
+            # FRHTODO: check this is useful, otherwise get rid of the code
+            # above
+            parse["joints_normalized"] = joints_normalized
+
+            cols = list(map(lambda coords: coords[0], joints))
+            cols.extend([bbox[0], bbox[2]])
+            rows = list(map(lambda coords: coords[1], joints))
+            rows.extend([bbox[1], bbox[3]])
+
+            # sort horizontal and vertical segments
+            cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
+            rows = merge_close_lines(
+                sorted(rows, reverse=True),
+                line_tol=self.line_tol
+            )
+            parse["col_anchors"] = cols
+            parse["row_anchors"] = rows
+
+    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        v_s, h_s = segments_in_bbox(
            bbox, self.vertical_segments, self.horizontal_segments
@ -270,21 +309,17 @@ class Lattice(BaseParser):
            self.horizontal_text,
            self.vertical_text
            )
+        parse = self.table_bbox_parses[bbox]

-        cols, rows = zip(*self.table_bbox[bbox])
-        cols, rows = list(cols), list(rows)
-        cols.extend([bbox[0], bbox[2]])
-        rows.extend([bbox[1], bbox[3]])
-        # sort horizontal and vertical segments
-        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
-        rows = merge_close_lines(
-            sorted(rows, reverse=True),
-            line_tol=self.line_tol
-        )
        # make grid using x and y coord of shortlisted rows and cols
-        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
-        rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
-
+        cols = [
+            (parse["col_anchors"][i], parse["col_anchors"][i + 1])
+            for i in range(0, len(parse["col_anchors"]) - 1)
+        ]
+        rows = [
+            (parse["row_anchors"][i], parse["row_anchors"][i + 1])
+            for i in range(0, len(parse["row_anchors"]) - 1)
+        ]
        return cols, rows, v_s, h_s

    def _generate_table(self, table_idx, cols, rows, **kwargs):
--- a/camelot/parsers/network.py
+++ b/camelot/parsers/network.py
@ -19,7 +19,8 @@ from ..utils import (
    text_in_bbox,
    textlines_overlapping_bbox,
    bbox_from_textlines,
-    find_columns_coordinates,
+    find_columns_boundaries,
+    boundaries_to_split_lines,
    text_in_bbox_per_axis,
 )

@ -438,7 +439,7 @@ class TextNetworks(TextAlignments):
        tls_search_space.remove(most_aligned_tl)
        tls_in_bbox = [most_aligned_tl]
        last_bbox = None
-        last_cols_cand = [most_aligned_tl.x0, most_aligned_tl.x1]
+        last_cols_bounds = [(most_aligned_tl.x0, most_aligned_tl.x1)]
        while last_bbox != bbox:
            if parse_details_search is not None:
                # Store debug info
@ -479,9 +480,9 @@ class TextNetworks(TextAlignments):
                # of the new row won't reduce the number of columns.
                # This happens when text covers multiple rows - that's only
                # allowed in the header, treated separately.
-                cols_cand = find_columns_coordinates(tls_in_new_box)
+                cols_bounds = find_columns_boundaries(tls_in_new_box)
                if direction in ["bottom", "top"] and \
-                        len(cols_cand) < len(last_cols_cand):
+                        len(cols_bounds) < len(last_cols_bounds):
                    continue

                # We have an expansion candidate: register it, update the
@ -489,7 +490,7 @@ class TextNetworks(TextAlignments):
                # We use bbox_from_textlines instead of cand_bbox in case some
                # overlapping textlines require a large bbox for strict fit.
                bbox = cand_bbox = list(bbox_from_textlines(tls_in_new_box))
-                last_cols_cand = cols_cand
+                last_cols_bounds = cols_bounds
                tls_in_bbox.extend(new_tls)
                for i in range(len(tls_search_space) - 1, -1, -1):
                    textline = tls_search_space[i]
@ -591,7 +592,7 @@ class Network(TextBaseParser):
        textlines = self._apply_regions_filter(all_textlines)

        textlines_processed = {}
-        self.table_bbox = {}
+        self.table_bbox_parses = {}
        if self.parse_details is not None:
            parse_details_network_searches = []
            self.parse_details["network_searches"] = \
@ -641,7 +642,8 @@ class Network(TextBaseParser):
            # Get all the textlines that overlap with the box, compute
            # columns
            tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
-            cols_anchors = find_columns_coordinates(tls_in_bbox)
+            cols_boundaries = find_columns_boundaries(tls_in_bbox)
+            cols_anchors = boundaries_to_split_lines(cols_boundaries)

            # Unless the user gave us strict bbox_body, try to find a header
            # above the body to build the full bbox.
@ -662,10 +664,11 @@ class Network(TextBaseParser):

            table_parse = {
                "bbox_body": bbox_body,
+                "cols_boundaries": cols_boundaries,
                "cols_anchors": cols_anchors,
                "bbox_full": bbox_full
            }
-            self.table_bbox[bbox_full] = table_parse
+            self.table_bbox_parses[bbox_full] = table_parse

            if self.parse_details is not None:
                self.parse_details["col_searches"].append(table_parse)
@ -678,7 +681,7 @@ class Network(TextBaseParser):
                textlines
            ))

-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
@ -706,18 +709,14 @@ class Network(TextBaseParser):
        rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

-        if self.columns is not None and self.columns[table_idx] != "":
-            # user has to input boundary columns too
-            # take (0, pdf_width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns[table_idx].split(",")
-            cols = [float(c) for c in cols]
-            cols.insert(0, text_x_min)
-            cols.append(text_x_max)
-            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        if user_cols is not None:
+            cols = [text_x_min] + user_cols + [text_x_max]
+            cols = [
+                (cols[i], cols[i + 1])
+                for i in range(0, len(cols) - 1)
+            ]
        else:
-            parse_details = self.table_bbox[bbox]
+            parse_details = self.table_bbox_parses[bbox]
            col_anchors = parse_details["cols_anchors"]
            cols = list(map(
                lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -122,14 +122,14 @@ class Stream(TextBaseParser):
                        self.horizontal_text)
                    hor_text.extend(region_text)
            # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(hor_text)
+            table_bbox_parses = self._nurminen_table_detection(hor_text)
        else:
-            table_bbox = {}
+            table_bbox_parses = {}
            for area_str in self.table_areas:
-                table_bbox[bbox_from_str(area_str)] = None
-        self.table_bbox = table_bbox
+                table_bbox_parses[bbox_from_str(area_str)] = None
+        self.table_bbox_parses = table_bbox_parses

-    def _generate_columns_and_rows(self, bbox, table_idx):
+    def _generate_columns_and_rows(self, bbox, user_cols):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
@ -140,26 +140,18 @@ class Stream(TextBaseParser):
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
        )
-        # FRHTODO:
-        # This algorithm takes the horizontal textlines in the bbox, and groups
-        # them into rows based on their bottom y0.
-        # That's wrong: it misses the vertical items, and misses out on all
-        # the alignment identification work we've done earlier.
+
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        elements = [len(r) for r in rows_grouped]

-        if self.columns is not None and self.columns[table_idx] != "":
-            # user has to input boundary columns too
-            # take (0, pdf_width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns[table_idx].split(",")
-            cols = [float(c) for c in cols]
-            cols.insert(0, text_x_min)
-            cols.append(text_x_max)
-            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        if user_cols is not None:
+            cols = [text_x_min] + user_cols + [text_x_max]
+            cols = [
+                (cols[i], cols[i + 1])
+                for i in range(0, len(cols) - 1)
+            ]
        else:
            # calculate mode of the list of number of elements in
            # each row to guess the number of columns
@ -175,8 +167,8 @@ class Stream(TextBaseParser):
                    ncols = max(set(elements), key=elements.count)
                else:
                    warnings.warn(
-                        "No tables found in table area {}"
-                        .format(table_idx + 1)
+                        "No tables found in table area {bbox}".format(
+                            bbox=bbox)
                    )
            cols = [
                (t.x0, t.x1)
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -74,7 +74,7 @@ def draw_labeled_bbox(
    )


-def draw_pdf(table, ax, to_pdf_scale=True):
+def draw_pdf(table, ax):
    """Draw the content of the table's source pdf into the passed subplot

    Parameters
@ -83,14 +83,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):

    ax : matplotlib.axes.Axes (optional)

-    to_pdf_scale : bool (optional)
-
    """
    img = table.get_pdf_image()
-    if to_pdf_scale:
-        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
-    else:
-        ax.imshow(img)
+    ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))


 def draw_parse_constraints(table, ax):
@ -132,8 +127,6 @@ def draw_text(table, ax):
    table : camelot.core.Table
    ax : matplotlib.axes.Axes (optional)

-    ax : matplotlib.axes.Axes
-
    """
    bbox = bbox_from_textlines(table.textlines)
    for t in table.textlines:
@ -150,18 +143,14 @@ def draw_text(table, ax):
    extend_axe_lim(ax, bbox)


-def prepare_plot(table, ax=None, to_pdf_scale=True):
+def prepare_plot(table, ax=None):
    """Initialize plot and draw common components

    Parameters
    ----------
    table : camelot.core.Table
+
    ax : matplotlib.axes.Axes (optional)
-    to_pdf_scale :
-
-    ax : matplotlib.axes.Axes
-
-    to_pdf_scale : bool (optional)

    Returns
    -------
@ -170,7 +159,7 @@ def prepare_plot(table, ax=None, to_pdf_scale=True):
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
-    draw_pdf(table, ax, to_pdf_scale)
+    draw_pdf(table, ax)
    draw_parse_constraints(table, ax)
    return ax

@ -186,7 +175,8 @@ class PlotMethods():
        table: camelot.core.Table
            A Camelot Table.
        kind : str, optional (default: 'text')
-            {'text', 'grid', 'contour', 'joint', 'line'}
+            {'text', 'grid', 'contour', 'joint', 'line',
+                'network_table_search'}
            The element type for which a plot should be generated.
        filepath: str, optional (default: None)
            Absolute path for saving the generated plot.
@ -203,9 +193,12 @@ class PlotMethods():
            raise NotImplementedError(
                "Lattice flavor does not support kind='{}'".format(kind)
            )
-        if table.flavor in ["stream", "network"] and kind in ["line"]:
+        if table.flavor != "lattice" and kind in ["line"]:
            raise NotImplementedError(
-                "Stream flavor does not support kind='{}'".format(kind)
+                "{flavor} flavor does not support kind='{kind}'".format(
+                    flavor=table.flavor,
+                    kind=kind
+                )
            )

        plot_method = getattr(self, kind)
@ -274,25 +267,21 @@ class PlotMethods():

        """
        _FOR_LATTICE = table.flavor == "lattice"
-        ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
-
-        if _FOR_LATTICE:
-            table_bbox = table._bbox_unscaled
-        else:
-            table_bbox = {table._bbox: None}
+        ax = prepare_plot(table, ax)

        if not _FOR_LATTICE:
            draw_text(table, ax)

-        for t in table_bbox.keys():
-            ax.add_patch(
-                patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
-                    fill=False, color="red"
-                )
+        ax.add_patch(
+            patches.Rectangle(
+                (table._bbox[0], table._bbox[1]),
+                table._bbox[2] - table._bbox[0],
+                table._bbox[3] - table._bbox[1],
+                fill=False, color="red"
            )
-            if not _FOR_LATTICE:
-                extend_axe_lim(ax, t)
+        )
+        if not _FOR_LATTICE:
+            extend_axe_lim(ax, table._bbox)

        return ax.get_figure()

@ -393,14 +382,12 @@ class PlotMethods():
        fig : matplotlib.fig.Figure

        """
-        ax = prepare_plot(table, ax, to_pdf_scale=False)
-        table_bbox = table._bbox_unscaled
+        ax = prepare_plot(table, ax)
        x_coord = []
        y_coord = []
-        for k in table_bbox.keys():
-            for coord in table_bbox[k]:
-                x_coord.append(coord[0])
-                y_coord.append(coord[1])
+        for coord in table.parse["joints"]:
+            x_coord.append(coord[0])
+            y_coord.append(coord[1])
        ax.plot(x_coord, y_coord, "ro")
        return ax.get_figure()

--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -297,8 +297,9 @@ def scale_image(tables, v_segments, h_segments, factors):
        j_x, j_y = zip(*tables[k])
        j_x = [scale(j, scaling_factor_x) for j in j_x]
        j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
-        joints = zip(j_x, j_y)
-        tables_new[(x1, y1, x2, y2)] = joints
+        tables_new[(x1, y1, x2, y2)] = {
+            "joints": list(zip(j_x, j_y))
+        }

    v_segments_new = []
    for v in v_segments:
@ -434,6 +435,16 @@ def bbox_from_str(bbox_str):
    )


+def bboxes_overlap(bbox1, bbox2):
+    """Check whether two bounding boxes overlap.
+
+    Two boxes overlap when their extents intersect on both the x and the
+    y axis.  This includes the cases where one box fully contains the
+    other, where both boxes are identical, and where the boxes cross
+    without any edge of one lying inside the other.  Boxes that merely
+    touch along an edge do not overlap.
+
+    Parameters
+    ----------
+    bbox1 : tuple
+        (left, bottom, right, top) coordinates of the first box.
+    bbox2 : tuple
+        (left, bottom, right, top) coordinates of the second box.
+
+    Returns
+    -------
+    overlap : bool
+        True if the two boxes overlap, False otherwise.
+
+    """
+    (left1, bottom1, right1, top1) = bbox1
+    (left2, bottom2, right2, top2) = bbox2
+    # Interval intersection test on each axis.  Only checking whether an
+    # edge of bbox2 falls strictly inside bbox1 would miss a bbox2 that
+    # spans bbox1 entirely along an axis.
+    return (
+        left1 < right2 and left2 < right1
+        and bottom1 < top2 and bottom2 < top1
+    )
+
+
 def textlines_overlapping_bbox(bbox, textlines):
    """Returns all text objects which overlap or are within a bounding box.

@ -451,12 +462,10 @@ def textlines_overlapping_bbox(bbox, textlines):
        List of PDFMiner text objects.

    """
-    (left, bottom, right, top) = bbox
    t_bbox = [
        t
        for t in textlines
-        if ((left < t.x0 < right) or (left < t.x1 < right))
-        and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
+        if bboxes_overlap(bbox, (t.x0, t.y0, t.x1, t.y1))
    ]
    return t_bbox

@ -560,27 +569,25 @@ def bbox_from_textlines(textlines):
    return bbox


-def find_columns_coordinates(tls, min_gap=1.0):
-    """Given a list of text objects, guess columns boundaries and returns a
-    list of x-coordinates for split points between columns.
+def find_columns_boundaries(tls, min_gap=1.0):
+    """Make a list of disjunct cols boundaries for a list of text objects

    Parameters
    ----------
    tls : list of PDFMiner text object.

-    min_gap : minimum distance between columns. Any elements closer than this
-        threshold are merged together.  This is to prevent spaces between words
-        to be misinterpreted as column boundaries.
+    min_gap : minimum distance between columns. Any elements closer than
+        this threshold are merged together.  This is to prevent spaces between
+        words to be misinterpreted as boundaries.

    Returns
    -------
-    cols_anchors : list
-        List of x-coordinates for columns.
+    boundaries : list
+        List x-coordinates for cols.
+         [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
+

    """
-    # Make a list of disjunct cols boundaries across the textlines
-    # that comprise the table.
-    # [(1st col left, 1st col right), (2nd col left, 2nd col right), ...]
    cols_bounds = []
    tls.sort(key=lambda tl: tl.x0)
    for tl in tls:
@ -588,18 +595,64 @@ def find_columns_coordinates(tls, min_gap=1.0):
            cols_bounds.append([tl.x0, tl.x1])
        else:
            cols_bounds[-1][1] = max(cols_bounds[-1][1], tl.x1)
+    return cols_bounds

+
+def find_rows_boundaries(tls, min_gap=1.0):
+    """Merge text objects into a list of disjoint vertical row extents.
+
+    The text objects are sorted by their y0 coordinate and then swept in
+    that order: an object starts a new row when its y0 lies more than
+    `min_gap` beyond the y1 reached by the current row, otherwise it is
+    merged into the current row.
+
+    Parameters
+    ----------
+    tls : list of PDFMiner text object.
+        Note that this list is sorted in place (by y0).
+
+    min_gap : minimum distance between rows. Any elements closer than
+        this threshold are merged together.
+
+    Returns
+    -------
+    boundaries : list
+        List of y-coordinates for rows.
+         [[1st row bottom, 1st row top], [2nd row bottom, 2nd row top], ...]
+
+    """
+    tls.sort(key=lambda textline: textline.y0)
+    merged = []
+    for textline in tls:
+        starts_new_row = (
+            not merged or merged[-1][1] + min_gap < textline.y0
+        )
+        if starts_new_row:
+            merged.append([textline.y0, textline.y1])
+            continue
+        # Close enough to the current row: grow its upper bound if needed.
+        merged[-1][1] = max(merged[-1][1], textline.y1)
+    return merged
+
+
+def boundaries_to_split_lines(boundaries):
+    """Convert a list of row or column boundaries into split coordinates.
+
+    Boundaries:     [ a ]         [b]     [   c   ]  [d]
+    Splits:         |        |         |            |  |
+
+    Parameters
+    ----------
+    boundaries : list
+        List of tuples of x- (for columns) or y- (for rows) coord boundaries.
+        These are the (left, right most) or (bottom, top most) coordinates.
+        Must not be empty: the first and last entries are indexed directly,
+        so an empty list raises IndexError.
+        Presumably ordered along the axis and non-overlapping, since only
+        consecutive entries are compared -- to be confirmed with callers.
+
+    Returns
+    -------
+    anchors : list
+        List of len(boundaries) + 1 coordinates representing the split
+        points.  The first one is the start of the first boundary, the last
+        one is the end of the last boundary, and each one in between lies
+        half way between the end of a boundary and the start of the next.
+
+    """
    # From the row boundaries, identify splits by getting the mid points
    # between the boundaries.
-    # Row boundaries: [ a ]        [b]    [   c   ]
-    # Splits:         |        |        |         |
-    cols_anchors = list(map(
-        lambda idx: (cols_bounds[idx-1][1] + cols_bounds[idx][0]) / 2.0,
-        range(1, len(cols_bounds))
+    anchors = list(map(
+        lambda idx: (boundaries[idx-1][1] + boundaries[idx][0]) / 2.0,
+        range(1, len(boundaries))
    ))
-    cols_anchors.insert(0, cols_bounds[0][0])
-    cols_anchors.append(cols_bounds[-1][1])
-    return cols_anchors
+    # Close the list with the outer edges of the first and last boundaries.
+    anchors.insert(0, boundaries[0][0])
+    anchors.append(boundaries[-1][1])
+    return anchors


 def get_index_closest_point(point, sorted_list, fn=lambda x: x):
@ -1129,17 +1182,20 @@ def get_text_objects(layout, ltype="char", t=None):
    return t


-def export_pdf_as_png(pdf_path, destination_path):
+def export_pdf_as_png(pdf_path, destination_path, resolution=300):
    """Generate an image from a pdf.

    Parameters
    ----------
    pdf_path : str
    destination_path : str
+    resolution : int
    """
-    gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
+    gs_call = "-q -sDEVICE=png16m -o " \
+        "{destination_path} -r{resolution} {pdf_path}" \
        .format(
            destination_path=destination_path,
+            resolution=resolution,
            pdf_path=pdf_path
        )
    gs_call = gs_call.encode().split()
--- a/parser-comparison-notebook.ipynb
+++ b/parser-comparison-notebook.ipynb
--- a/tests/data.py
+++ b/tests/data.py
@ -2074,6 +2074,502 @@ data_network_vertical_headers = [
    ],
 ]

+# Expected output of the hybrid flavor for vertical_header.pdf.
+# Compared to the network flavor, hybrid detects additional sparse columns.
+data_hybrid_vertical_headers = [
+    [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "Congress-",
+        "",
+        "",
+        "Senator 36th",
+        "",
+        "Rep106th",
+        "",
+        "Reg. of",
+        "",
+        "Road",
+        "",
+        "",
+        "Distri",
+        "Dist",
+        "",
+        "",
+        "Dist",
+    ],
+    [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "1st Dist",
+        "Dist.",
+        "",
+        "",
+        "Dist.",
+        "Deeds",
+        "",
+        "Commission",
+        "",
+        "District #1",
+        "",
+        "ct #2",
+        "#3",
+        "Dist #4",
+        "",
+        "#5",
+    ],
+    [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "Governor",
+        "",
+        "",
+        "U.S. Senator",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "",
+        "Number of Registered voters",
+        "Poll Book Totals",
+        "Brian Calley",
+        "Patrick Colbeck",
+        "Jim Hines",
+        "Bill Schuette",
+        "John James",
+        "Sandy Pensler",
+        "",
+        "Jack Bergman",
+        "",
+        "Jim Stamas",
+        "",
+        "Sue Allor",
+        "",
+        "Melissa A. Cordes",
+        "",
+        "Al Scully",
+        "",
+        "Daniel G. Gauthier",
+        "Craig M. Clemens",
+        "Craig Johnston",
+        "Carolyn Brummund",
+        "Adam Brege",
+        "David Bielusiak",
+        "",
+    ],
+    [
+        "Alcona",
+        "963",
+        "439",
+        "55",
+        "26",
+        "47",
+        "164",
+        "173",
+        "111",
+        "",
+        "268",
+        "",
+        "272",
+        "",
+        "275",
+        "",
+        "269",
+        "",
+        "271",
+        "",
+        "224",
+        "76",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Caledonia",
+        "923",
+        "393",
+        "40",
+        "23",
+        "45",
+        "158",
+        "150",
+        "103",
+        "",
+        "244",
+        "",
+        "247",
+        "",
+        "254",
+        "",
+        "255",
+        "",
+        "244",
+        "",
+        "139",
+        "143",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Curtis",
+        "1026",
+        "349",
+        "30",
+        "30",
+        "25",
+        "102",
+        "95",
+        "84",
+        "",
+        "159",
+        "",
+        "164",
+        "",
+        "162",
+        "",
+        "161",
+        "",
+        "157",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Greenbush",
+        "1212",
+        "423",
+        "56",
+        "26",
+        "40",
+        "126",
+        "104",
+        "131",
+        "",
+        "208",
+        "",
+        "213",
+        "",
+        "214",
+        "",
+        "215",
+        "",
+        "208",
+        "",
+        "",
+        "",
+        "",
+        "208",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Gustin",
+        "611",
+        "180",
+        "22",
+        "35",
+        "17",
+        "55",
+        "73",
+        "45",
+        "",
+        "108",
+        "",
+        "104",
+        "",
+        "111",
+        "",
+        "111",
+        "",
+        "109",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "81",
+        "42",
+        "",
+    ],
+    [
+        "Harrisville",
+        "1142",
+        "430",
+        "45",
+        "90",
+        "29",
+        "101",
+        "155",
+        "94",
+        "",
+        "226",
+        "",
+        "226",
+        "",
+        "232",
+        "",
+        "244",
+        "",
+        "226",
+        "",
+        "",
+        "",
+        "232",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Hawes",
+        "884",
+        "293",
+        "38",
+        "36",
+        "27",
+        "109",
+        "121",
+        "84",
+        "",
+        "192",
+        "",
+        "195",
+        "",
+        "195",
+        "",
+        "193",
+        "",
+        "184",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "118",
+        "87",
+        "",
+    ],
+    [
+        "Haynes",
+        "626",
+        "275",
+        "31",
+        "20",
+        "32",
+        "104",
+        "121",
+        "53",
+        "",
+        "163",
+        "",
+        "163",
+        "",
+        "173",
+        "",
+        "161",
+        "",
+        "152",
+        "",
+        "",
+        "",
+        "76",
+        "",
+        "69",
+        "31",
+        "",
+    ],
+    [
+        "Mikado",
+        "781",
+        "208",
+        "19",
+        "39",
+        "17",
+        "81",
+        "90",
+        "63",
+        "",
+        "149",
+        "",
+        "149",
+        "",
+        "145",
+        "",
+        "147",
+        "",
+        "143",
+        "",
+        "",
+        "",
+        "",
+        "113",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Millen",
+        "353",
+        "139",
+        "7",
+        "16",
+        "13",
+        "38",
+        "49",
+        "19",
+        "",
+        "62",
+        "",
+        "66",
+        "",
+        "67",
+        "",
+        "66",
+        "",
+        "62",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Mitchell",
+        "327",
+        "96",
+        "12",
+        "17",
+        "7",
+        "29",
+        "41",
+        "17",
+        "",
+        "57",
+        "",
+        "55",
+        "",
+        "57",
+        "",
+        "60",
+        "",
+        "56",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "City Harrisville",
+        "389",
+        "171",
+        "16",
+        "15",
+        "18",
+        "35",
+        "49",
+        "31",
+        "",
+        "78",
+        "",
+        "80",
+        "",
+        "82",
+        "",
+        "81",
+        "",
+        "77",
+        "",
+        "",
+        "",
+        "73",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Totals",
+        "9237",
+        "3396",
+        "371",
+        "373",
+        "317",
+        "1102",
+        "1221",
+        "835",
+        "0",
+        "1914",
+        "0",
+        "1934",
+        "",
+        "1967",
+        "",
+        "1963",
+        "0",
+        "1889",
+        "0",
+        "363",
+        "219",
+        "381",
+        "321",
+        "268",
+        "160",
+        "0",
+    ],
+]


 data_stream_table_areas = [
--- a/tests/files/baseline_plots/test_joint_plot.png
+++ b/tests/files/baseline_plots/test_joint_plot.png
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -291,6 +291,19 @@ def test_network_layout_kwargs():
    assert_frame_equal(df, tables[0].df)


+# Hybrid parser
+def test_hybrid_vertical_header():
+    """Check hybrid parsing of a complex table with a vertical text header.
+    """
+    pdf_path = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(pdf_path, flavor="hybrid")
+
+    expected = pd.DataFrame(data_hybrid_vertical_headers)
+    assert len(tables) == 1
+    assert_frame_equal(expected, tables[0].df)
+
+
+# Lattice parser tests
 def test_lattice():
    df = pd.DataFrame(data_lattice)