Initial Hybrid parser, for now identical to Stream

2020-04-19 16:27:01 -07:00 · 2020-04-19 16:27:01 -07:00 · d520a77bb7
parent 58823e57e9
commit d520a77bb7
15 changed files with 726 additions and 19 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -31,7 +31,8 @@ pass_config = click.make_pass_decorator(Config)

@click.group(name="camelot")
@click.version_option(version=__version__)
-@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
+@click.option("-q", "--quiet", is_flag=False,
+              help="Suppress logs and warnings.")
@click.option(
    "-p",
    "--pages",
@ -98,7 +99,8 @@ def cli(ctx, *args, **kwargs):
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
 )
@click.option(
-    "-back", "--process_background", is_flag=True, help="Process background lines."
+    "-back", "--process_background", is_flag=True,
+    help="Process background lines."
 )
@click.option(
    "-scale",
@ -127,7 +129,8 @@ def cli(ctx, *args, **kwargs):
    "-l",
    "--line_tol",
    default=2,
-    help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
+    help="Tolerance parameter used to merge close vertical"
+    " and horizontal lines.",
 )
@click.option(
    "-j",
@ -197,12 +200,15 @@ def lattice(c, *args, **kwargs):
            raise ImportError("matplotlib is required for plotting.")
    else:
        if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
        if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")

    tables = read_pdf(
-        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="lattice", suppress_stdout=quiet,
+        **kwargs
    )
    click.echo("Found {} tables".format(tables.n))
    if plot_type is not None:
@ -247,7 +253,8 @@ def lattice(c, *args, **kwargs):
    "-r",
    "--row_tol",
    default=2,
-    help="Tolerance parameter" " used to combine text vertically, to generate rows.",
+    help="Tolerance parameter"
+         " used to combine text vertically, to generate rows.",
 )
@click.option(
    "-c",
@ -288,9 +295,11 @@ def stream(c, *args, **kwargs):
            raise ImportError("matplotlib is required for plotting.")
    else:
        if output is None:
-            raise click.UsageError("Please specify output file path using --output")
+            raise click.UsageError(
+                "Please specify output file path using --output")
        if f is None:
-            raise click.UsageError("Please specify output file format using --format")
+            raise click.UsageError(
+                "Please specify output file format using --format")

    tables = read_pdf(
        filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
@ -302,3 +311,97 @@ def stream(c, *args, **kwargs):
            plt.show()
    else:
        tables.export(output, f=f, compress=compress)
+
+
+@cli.command("hybrid")
+@click.option(
+    "-R",
+    "--table_regions",
+    default=[],
+    multiple=True,
+    help=(
+        "Page regions to analyze. Example: x1,y1,x2,y2"
+        " where x1, y1 -> left-top and x2, y2 -> right-bottom."
+    ),
+)
+@click.option(
+    "-T",
+    "--table_areas",
+    default=[],
+    multiple=True,
+    help=(
+        "Table areas to process. Example: x1,y1,x2,y2"
+        " where x1, y1 -> left-top and x2, y2 -> right-bottom."
+    ),
+)
+@click.option(
+    "-C",
+    "--columns",
+    default=[],
+    multiple=True,
+    help="X coordinates of column separators.",
+)
+@click.option(
+    "-e",
+    "--edge_tol",
+    default=50,
+    help="Tolerance parameter for extending textedges vertically.",
+)
+@click.option(
+    "-r",
+    "--row_tol",
+    default=2,
+    help="Tolerance parameter used to combine text vertically,"
+         " to generate rows.",
+)
+@click.option(
+    "-c",
+    "--column_tol",
+    default=0,
+    help="Tolerance parameter used to combine text horizontally,"
+         " to generate columns.",
+)
+@click.option(
+    "-plot",
+    "--plot_type",
+    type=click.Choice(["text", "grid", "contour", "textedge"]),
+    help="Plot elements found on PDF page for visual debugging.",
+)
+@click.argument("filepath", type=click.Path(exists=True))
+@pass_config
+def hybrid(c, *args, **kwargs):
+    """Use spaces between text to parse the table."""
+    # Group-level options live on the shared config; take out the ones
+    # handled here and forward whatever remains to read_pdf.
+    conf = c.config
+    pages = conf.pop("pages")
+    output = conf.pop("output")
+    f = conf.pop("format")
+    compress = conf.pop("zip")
+    quiet = conf.pop("quiet")
+    plot_type = kwargs.pop("plot_type")
+    filepath = kwargs.pop("filepath")
+    kwargs.update(conf)
+
+    # Options declared with multiple=True are converted to lists, with an
+    # empty selection mapped to None.
+    for option in ("table_regions", "table_areas", "columns"):
+        kwargs[option] = list(kwargs[option]) or None
+
+    # Plotting needs matplotlib; exporting needs both a path and a format.
+    plotting = plot_type is not None
+    if plotting:
+        if not _HAS_MPL:
+            raise ImportError("matplotlib is required for plotting.")
+    elif output is None:
+        raise click.UsageError(
+            "Please specify output file path using --output")
+    elif f is None:
+        raise click.UsageError(
+            "Please specify output file format using --format")
+
+    tables = read_pdf(
+        filepath,
+        pages=pages,
+        flavor="hybrid",
+        suppress_stdout=quiet,
+        **kwargs
+    )
+    click.echo("Found {} tables".format(tables.n))
+
+    if not plotting:
+        tables.export(output, f=f, compress=compress)
+        return
+    for table in tables:
+        plot(table, kind=plot_type)
+        plt.show()
--- a/camelot/core.py
+++ b/camelot/core.py
@ -379,6 +379,8 @@ class Table(object):
        self._image = None
        self._image_path = None  # Temporary file to hold an image of the pdf

+        self._text = []          # List of text box coordinates
+
    def __repr__(self):
        return "<{} shape={}>".format(self.__class__.__name__, self.shape)

@ -432,11 +434,11 @@ class Table(object):
        self.pdf_size = (parser.pdf_width, parser.pdf_height)

        _text = []
-        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
+        _text.extend(
+            [(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text])
        self._text = _text

-
    def get_pdf_image(self):
        """Compute pdf image and cache it
        """
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -7,7 +7,7 @@ import logging
 from PyPDF2 import PdfFileReader, PdfFileWriter

 from .core import TableList
-from .parsers import Stream, Lattice
+from .parsers import Stream, Lattice, Hybrid
 from .utils import (
    build_file_path_in_temp_dir,
    get_page_layout,
@ -21,7 +21,8 @@ logger = logging.getLogger("camelot")

 PARSERS = {
    "lattice": Lattice,
-    "stream": Stream
+    "stream": Stream,
+    "hybrid": Hybrid,
 }


@ -173,7 +174,7 @@ class PDFHandler():
        Parameters
        ----------
        flavor : str (default: 'lattice')
-            The parsing method to use ('lattice' or 'stream').
+            The parsing method to use ('lattice', 'stream', or 'hybrid').
            Lattice is used by default.
        suppress_stdout : str (default: False)
            Suppress logs and warnings.
--- a/camelot/io.py
+++ b/camelot/io.py
@ -99,9 +99,10 @@ def read_pdf(

    """
    layout_kwargs = layout_kwargs or {}
-    if flavor not in ["lattice", "stream"]:
+    if flavor not in ["lattice", "stream", "hybrid"]:
        raise NotImplementedError(
-            "Unknown flavor specified." " Use either 'lattice' or 'stream'"
+            "Unknown flavor specified."
+            " Use either 'lattice', 'stream', or 'hybrid'"
        )

    with warnings.catch_warnings():
--- a/camelot/parsers/__init__.py
+++ b/camelot/parsers/__init__.py
@ -2,3 +2,4 @@

 from .stream import Stream
 from .lattice import Lattice
+from .hybrid import Hybrid
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -0,0 +1,441 @@
+
+
+# -*- coding: utf-8 -*-
+
+from __future__ import division
+import warnings
+
+import numpy as np
+
+from .base import BaseParser
+from ..core import TextEdges
+from ..utils import (text_in_bbox, text_in_bbox_per_axis)
+
+
+class Hybrid(BaseParser):
+    """Hybrid method of parsing looks for spaces between text
+    to parse the table.
+
+    If you want to specify columns when specifying multiple table
+    areas, make sure that the length of both lists are equal.
+
+    Parameters
+    ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    table_areas : list, optional (default: None)
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
+    columns : list, optional (default: None)
+        List of column x-coordinates strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Split text that spans across multiple cells.
+    flag_size : bool, optional (default: False)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
+    edge_tol : int, optional (default: 50)
+        Tolerance parameter for extending textedges vertically.
+    row_tol : int, optional (default: 2)
+        Tolerance parameter used to combine text vertically,
+        to generate rows.
+    column_tol : int, optional (default: 0)
+        Tolerance parameter used to combine text horizontally,
+        to generate columns.
+
+    """
+
+    def __init__(
+        self,
+        table_regions=None,
+        table_areas=None,
+        columns=None,
+        flag_size=False,
+        split_text=False,
+        strip_text="",
+        edge_tol=50,
+        row_tol=2,
+        column_tol=0,
+        **kwargs
+    ):
+        # Options shared with the base parser are forwarded under the
+        # "hybrid" flavor name. Extra keyword arguments (**kwargs) are
+        # accepted but not used anywhere in this constructor.
+        super().__init__(
+            "hybrid",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            flag_size=flag_size,
+        )
+        self.columns = columns
+        # Raises ValueError when table_areas and columns are both given but
+        # have different lengths.
+        self._validate_columns()
+        # Tolerances used by table detection and row/column grouping.
+        self.edge_tol = edge_tol
+        self.row_tol = row_tol
+        self.column_tol = column_tol
+
+    @staticmethod
+    def _text_bbox(t_bbox):
+        """Compute the smallest box enclosing all text in ``t_bbox``.
+
+        Parameters
+        ----------
+        t_bbox : dict
+            Maps a text direction ('horizontal', 'vertical') to a list of
+            text objects exposing ``x0``, ``y0``, ``x1`` and ``y1``.
+
+        Returns
+        -------
+        text_bbox : tuple
+            Tuple (xmin, ymin, xmax, ymax) over every direction.
+
+        Raises
+        ------
+        ValueError
+            If ``t_bbox`` contains no text objects at all.
+
+        """
+        # Flatten once, then take the extreme coordinate on each side.
+        lines = [t for direction in t_bbox for t in t_bbox[direction]]
+        return (
+            min(t.x0 for t in lines),
+            min(t.y0 for t in lines),
+            max(t.x1 for t in lines),
+            max(t.y1 for t in lines),
+        )
+
+    @staticmethod
+    def _group_rows(text, row_tol=2):
+        """Split text objects into rows based on their bottom coordinate.
+
+        Text objects are consumed in the order given. Objects whose text is
+        blank are ignored. A new row starts whenever an object's ``y0``
+        is not within ``row_tol`` of the ``y0`` of the object that opened
+        the current row.
+
+        Parameters
+        ----------
+        text : list
+            List of PDFMiner text objects.
+        row_tol : int, optional (default: 2)
+            Absolute tolerance on ``y0`` for staying in the same row.
+
+        Returns
+        -------
+        rows : list
+            Two-dimensional list of text objects grouped into rows, each
+            row sorted by ``x0``. Always holds at least one (possibly
+            empty) row.
+
+        """
+        # TODO: open question from the original author - is checking that
+        # the characters are upright necessary here?
+        rows = []
+        current = []
+        anchor_y = None
+        for line in text:
+            if not line.get_text().strip():
+                continue
+            starts_new_row = anchor_y is not None and not np.isclose(
+                anchor_y, line.y0, atol=row_tol
+            )
+            if starts_new_row:
+                rows.append(sorted(current, key=lambda obj: obj.x0))
+                current = []
+            # The anchor is only moved when a row is opened; objects that
+            # join an existing row do not shift it.
+            if anchor_y is None or starts_new_row:
+                anchor_y = line.y0
+            current.append(line)
+        rows.append(sorted(current, key=lambda obj: obj.x0))
+        return rows
+
+    @staticmethod
+    def _merge_columns(l, column_tol=0):
+        """Merge column boundaries that overlap or lie within a tolerance.
+
+        Each incoming column is compared only with the most recently
+        merged one, so callers pass the list sorted.
+
+        With a non-negative ``column_tol``, a column is merged into the
+        previous one when it starts at or before the previous end, or
+        within ``column_tol`` of it. With a negative ``column_tol``, only
+        overlapping columns are merged, and an overlap whose start is
+        within ``abs(column_tol)`` of the previous end is kept separate.
+
+        Parameters
+        ----------
+        l : list
+            List of column x-coordinate tuples.
+        column_tol : int, optional (default: 0)
+
+        Returns
+        -------
+        merged : list
+            List of merged column x-coordinate tuples.
+
+        """
+        merged = []
+        for span in l:
+            if not merged:
+                merged.append(span)
+                continue
+            prev = merged[-1]
+            overlaps = span[0] <= prev[1]
+            if column_tol >= 0:
+                absorb = overlaps or np.isclose(
+                    span[0], prev[1], atol=column_tol
+                )
+            elif column_tol < 0:
+                absorb = overlaps and not np.isclose(
+                    span[0], prev[1], atol=abs(column_tol)
+                )
+            else:
+                # Tolerance compares as neither >= 0 nor < 0 (e.g. NaN):
+                # nothing is recorded for this column.
+                continue
+            if absorb:
+                merged[-1] = (min(prev[0], span[0]), max(prev[1], span[1]))
+            else:
+                merged.append(span)
+        return merged
+
+    @staticmethod
+    def _join_rows(rows_grouped, text_y_max, text_y_min):
+        """Turn grouped text rows into contiguous row boundaries.
+
+        Every row first spans its own text (largest ``y1`` down to smallest
+        ``y0``). Row ``i`` and row ``i + 1`` are then made to touch at the
+        midpoint of the gap between them, and the first and last boundaries
+        are stretched to ``text_y_max`` and ``text_y_min``.
+
+        Parameters
+        ----------
+        rows_grouped : list
+            Two-dimensional list of text objects grouped into rows.
+        text_y_max : int
+        text_y_min : int
+
+        Returns
+        -------
+        rows : list
+            List of continuous ``[top, bottom]`` y-coordinate pairs.
+
+        """
+        bounds = []
+        for row in rows_grouped:
+            top = max(t.y1 for t in row)
+            bottom = min(t.y0 for t in row)
+            bounds.append([top, bottom])
+        # Split each gap between neighbouring rows in half.
+        for upper, lower in zip(bounds, bounds[1:]):
+            midpoint = (upper[1] + lower[0]) / 2
+            upper[1] = midpoint
+            lower[0] = midpoint
+        bounds[0][0] = text_y_max
+        bounds[-1][1] = text_y_min
+        return bounds
+
+    @staticmethod
+    def _add_columns(cols, text, row_tol):
+        """Adds columns to existing list by taking into account
+        the text that lies outside the current column x-coordinates.
+
+        The text is grouped into rows; the rows holding the largest number
+        of text objects provide the candidate columns, which are merged
+        (with the default column tolerance) and appended to ``cols``.
+
+        Parameters
+        ----------
+        cols : list
+            List of column x-coordinate tuples. Extended in place.
+        text : list
+            List of PDFMiner text objects.
+        row_tol : int
+            Tolerance used to group ``text`` into rows.
+
+        Returns
+        -------
+        cols : list
+            Updated list of column x-coordinate tuples (the same list
+            object that was passed in).
+
+        """
+        if text:
+            grouped = Hybrid._group_rows(text, row_tol=row_tol)
+            # Computed once instead of once per row inside the comprehension.
+            widest = max(len(r) for r in grouped)
+            new_cols = [
+                (t.x0, t.x1)
+                for r in grouped if len(r) == widest
+                for t in r
+            ]
+            cols.extend(Hybrid._merge_columns(sorted(new_cols)))
+        return cols
+
+    @staticmethod
+    def _join_columns(cols, text_x_min, text_x_max):
+        """Makes column coordinates continuous.
+
+        Neighbouring columns are made to touch at the midpoint between the
+        end of one and the start of the next; the outermost boundaries are
+        ``text_x_min`` and ``text_x_max``.
+
+        Parameters
+        ----------
+        cols : list
+            List of column x-coordinate tuples.
+        text_x_min : int
+        text_x_max : int
+
+        Returns
+        -------
+        cols : list
+            Updated list of column x-coordinate tuples.
+
+        """
+        ordered = sorted(cols)
+        separators = [text_x_min]
+        separators.extend(
+            (right[0] + left[1]) / 2
+            for left, right in zip(ordered, ordered[1:])
+        )
+        separators.append(text_x_max)
+        return list(zip(separators, separators[1:]))
+
+    def _validate_columns(self):
+        """Check that ``columns`` pairs up with ``table_areas``.
+
+        Raises
+        ------
+        ValueError
+            If both are set and their lengths differ.
+
+        """
+        if self.table_areas is None or self.columns is None:
+            return
+        if len(self.table_areas) != len(self.columns):
+            raise ValueError(
+                "Length of table_areas and columns should be equal")
+
+    def _nurminen_table_detection(self, textlines):
+        """A general implementation of the table detection algorithm
+        described by Anssi Nurminen's master's thesis.
+        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa
+
+        Assumes that tables are situated relatively far apart
+        vertically.
+
+        Parameters
+        ----------
+        textlines : list
+            Text line objects to analyse. The list is sorted in place.
+
+        Returns
+        -------
+        table_bbox
+            Result of ``TextEdges.get_table_areas``; when that is empty, a
+            dict whose single key is the whole page
+            ``(0, 0, pdf_width, pdf_height)``.
+
+        """
+        # TODO: add support for arabic text #141
+        # sort textlines in reading order
+        # (in place: the caller's list is reordered as a side effect)
+        textlines.sort(key=lambda x: (-x.y0, x.x0))
+        textedges = TextEdges(edge_tol=self.edge_tol)
+        # generate left, middle and right textedges
+        textedges.generate(textlines)
+        # select relevant edges
+        relevant_textedges = textedges.get_relevant()
+        # remembered on the parser; _generate_table attaches them to tables
+        self.textedges.extend(relevant_textedges)
+        # guess table areas using textlines and relevant edges
+        table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
+        # treat whole page as table area if no table areas found
+        if not table_bbox:
+            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
+
+        return table_bbox
+
+    def _generate_table_bbox(self):
+        """Populate ``self.table_bbox`` with the areas to parse.
+
+        When ``table_areas`` is given, each "x1,y1,x2,y2" string becomes a
+        key of the mapping directly. Otherwise areas are detected from the
+        horizontal text, restricted to ``table_regions`` when provided.
+        ``self.textedges`` is reset to an empty list first.
+        """
+        def to_bbox(spec):
+            # "x1,y1,x2,y2" (left-top, right-bottom) -> (x1, y2, x2, y1)
+            x1, y1, x2, y2 = spec.split(",")
+            x1, y1, x2, y2 = [float(v) for v in (x1, y1, x2, y2)]
+            return (x1, y2, x2, y1)
+
+        self.textedges = []
+
+        if self.table_areas is not None:
+            # explicit areas: no detection pass
+            self.table_bbox = {
+                to_bbox(area): None for area in self.table_areas
+            }
+            return
+
+        if self.table_regions is None:
+            candidates = self.horizontal_text
+        else:
+            # keep only the horizontal text lying inside the regions
+            candidates = []
+            for region in self.table_regions:
+                candidates.extend(
+                    text_in_bbox(to_bbox(region), self.horizontal_text))
+        # find tables based on nurminen's detection algorithm
+        self.table_bbox = self._nurminen_table_detection(candidates)
+
+    def _generate_columns_and_rows(self, table_idx, tk):
+        """Derive column and row boundaries for one table area.
+
+        Parameters
+        ----------
+        table_idx : int
+            Index of the table; selects the matching entry of
+            ``self.columns`` and is reported (1-based) in the
+            "No tables found" warning.
+        tk : tuple
+            Bounding box passed to ``text_in_bbox_per_axis`` to select the
+            text belonging to the table.
+
+        Returns
+        -------
+        cols : list
+            List of column x-coordinate tuples.
+        rows : list
+            List of row y-coordinate pairs, as built by ``_join_rows``.
+
+        """
+        # NOTE(review): extract_tables passes table_idx in y-sorted order
+        # of the bounding boxes, which may differ from the order in which
+        # table_areas/columns were supplied - confirm the pairing.
+        # select elements which lie within table_bbox
+        self.t_bbox = text_in_bbox_per_axis(
+            tk,
+            self.horizontal_text,
+            self.vertical_text
+        )
+
+        text_x_min, text_y_min, text_x_max, text_y_max = \
+            self._text_bbox(self.t_bbox)
+        # rows are derived from the horizontal text only
+        rows_grouped = self._group_rows(
+            self.t_bbox["horizontal"], row_tol=self.row_tol)
+        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
+        # number of text objects found in each row
+        elements = [len(r) for r in rows_grouped]
+
+        if self.columns is not None and self.columns[table_idx] != "":
+            # user has to input boundary columns too
+            # take (0, pdf_width) by default
+            # similar to else condition
+            # len can't be 1
+            cols = self.columns[table_idx].split(",")
+            cols = [float(c) for c in cols]
+            # the text extent supplies the outermost boundaries
+            cols.insert(0, text_x_min)
+            cols.append(text_x_max)
+            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        else:
+            # calculate mode of the list of number of elements in
+            # each row to guess the number of columns
+            ncols = max(set(elements), key=elements.count)
+            if ncols == 1:
+                # if mode is 1, the page usually does not contain tables
+                # but there can be cases where the list can be skewed,
+                # try to remove all 1s from list in this case and
+                # see if the list contains elements, if yes, then use
+                # the mode after removing 1s
+                elements = list(filter(lambda x: x != 1, elements))
+                if elements:
+                    ncols = max(set(elements), key=elements.count)
+                else:
+                    # only a warning: processing continues with ncols == 1
+                    warnings.warn(
+                        "No tables found in table area {}"
+                        .format(table_idx + 1)
+                    )
+            # column candidates come from the rows with exactly ncols items
+            cols = [
+                (t.x0, t.x1)
+                for r in rows_grouped
+                if len(r) == ncols
+                for t in r
+            ]
+            cols = self._merge_columns(
+                sorted(cols),
+                column_tol=self.column_tol
+            )
+            # text lying strictly inside a gap between two merged columns
+            inner_text = []
+            for i in range(1, len(cols)):
+                left = cols[i - 1][1]
+                right = cols[i][0]
+                inner_text.extend(
+                    [
+                        t
+                        for direction in self.t_bbox
+                        for t in self.t_bbox[direction]
+                        if t.x0 > left and t.x1 < right
+                    ]
+                )
+            # text lying entirely right of the last or left of the first
+            # merged column
+            outer_text = [
+                t
+                for direction in self.t_bbox
+                for t in self.t_bbox[direction]
+                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
+            ]
+            inner_text.extend(outer_text)
+            cols = self._add_columns(cols, inner_text, self.row_tol)
+            cols = self._join_columns(cols, text_x_min, text_x_max)
+
+        return cols, rows
+
+    def _generate_table(self, table_idx, cols, rows, **kwargs):
+        # Builds the table for the given columns/rows, marks all of its
+        # edges, and records parse metadata from this parser.
+        # Extra keyword arguments (**kwargs) are accepted but not used here.
+        table = self._initialize_new_table(table_idx, cols, rows)
+        table = table.set_all_edges()
+        table.record_parse_metadata(self)
+
+        # for plotting
+        # (_bbox is set to the whole table_bbox mapping here;
+        # extract_tables replaces it with the table's own bbox afterwards)
+        table._bbox = self.table_bbox
+        table._segments = None
+        table._textedges = self.textedges
+
+        return table
+
+    def extract_tables(self, filename):
+        """Detect table areas and build one table per area.
+
+        ``filename`` is accepted but not read in this method. Returns an
+        empty list when the document has no text; otherwise a list of
+        tables ordered by the second coordinate of their bounding box,
+        largest first.
+        """
+        if self._document_has_no_text():
+            return []
+
+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
+        self._generate_table_bbox()
+
+        # sort tables based on y-coord
+        ordered_bboxes = sorted(
+            self.table_bbox.keys(), key=lambda box: box[1], reverse=True
+        )
+        found = []
+        for table_idx, bbox in enumerate(ordered_bboxes):
+            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
+            table = self._generate_table(table_idx, cols, rows)
+            table._bbox = bbox
+            found.append(table)
+        return found
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -252,7 +252,6 @@ class Lattice(BaseParser):
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers
        )

-
    def _generate_columns_and_rows(self, tk):
        # select elements which lie within table_bbox
        v_s, h_s = segments_in_bbox(
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -37,7 +37,7 @@ class PlotMethods(object):
            raise NotImplementedError(
                "Lattice flavor does not support kind='{}'".format(kind)
            )
-        elif table.flavor == "stream" and kind in ["line"]:
+        elif table.flavor in ["stream", "hybrid"] and kind in ["line"]:
            raise NotImplementedError(
                "Stream flavor does not support kind='{}'".format(kind)
            )
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_grid_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_grid_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/files/baseline_plots/test_stream_textedge_plot.png
+++ b/tests/files/baseline_plots/test_stream_textedge_plot.png
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -72,6 +72,26 @@ def test_cli_stream():
        assert format_error in result.output


+def test_cli_hybrid():
+    """Exercise the ``hybrid`` CLI command: success and usage errors."""
+    with TemporaryDirectory() as tempdir:
+        infile = os.path.join(testdir, "budget.pdf")
+        outfile = os.path.join(tempdir, "budget.csv")
+        runner = CliRunner()
+        # format and output both given: export succeeds
+        result = runner.invoke(
+            cli, ["--format", "csv", "--output", outfile, "hybrid", infile]
+        )
+        assert result.exit_code == 0
+        assert result.output == "Found 1 tables\n"
+
+        # missing --output
+        result = runner.invoke(cli, ["--format", "csv", "hybrid", infile])
+        output_error = "Error: Please specify output file path using --output"
+        assert output_error in result.output
+
+        # missing --format
+        result = runner.invoke(cli, ["--output", outfile, "hybrid", infile])
+        format_error = "Please specify output file format using --format"
+        assert format_error in result.output
+
+
 def test_cli_password():
    with TemporaryDirectory() as tempdir:
        infile = os.path.join(testdir, "health_protected.pdf")
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -148,6 +148,115 @@ def test_stream_layout_kwargs():
    assert_frame_equal(df, tables[0].df)


+def test_hybrid():
+    """Hybrid flavor with defaults on health.pdf matches ``data_stream``."""
+    df = pd.DataFrame(data_stream)
+
+    filename = os.path.join(testdir, "health.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_table_rotated():
+    """Clockwise and anticlockwise samples yield the same expected frame."""
+    df = pd.DataFrame(data_stream_table_rotated)
+
+    filename = os.path.join(testdir, "clockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_two_tables():
+    """Two tables are found in 12s0324.pdf, in the expected order."""
+    df1 = pd.DataFrame(data_stream_two_tables_1)
+    df2 = pd.DataFrame(data_stream_two_tables_2)
+
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+
+    assert len(tables) == 2
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)
+
+
+def test_hybrid_table_regions():
+    """``table_regions`` restricts parsing to the given page region."""
+    df = pd.DataFrame(data_stream_table_areas)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", table_regions=["320,460,573,335"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_table_areas():
+    """An explicit ``table_areas`` entry yields the expected frame."""
+    df = pd.DataFrame(data_stream_table_areas)
+
+    filename = os.path.join(testdir, "tabula/us-007.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", table_areas=["320,500,573,335"]
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_columns():
+    """User-supplied column separators combined with a custom row_tol."""
+    df = pd.DataFrame(data_stream_columns)
+
+    filename = os.path.join(testdir, "mexican_towns.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", columns=["67,180,230,425,475"], row_tol=10
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_split_text():
+    """``split_text=True`` together with explicit column separators."""
+    df = pd.DataFrame(data_stream_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="hybrid",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_flag_size():
+    """``flag_size=True`` on superscript.pdf matches the expected frame."""
+    df = pd.DataFrame(data_stream_flag_size)
+
+    filename = os.path.join(testdir, "superscript.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid", flag_size=True)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_strip_text():
+    """``strip_text`` removes spaces, commas and newlines from cells."""
+    df = pd.DataFrame(data_stream_strip_text)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid", strip_text=" ,\n")
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_edge_tol():
+    """A large ``edge_tol`` (500) on edge_tol.pdf matches the expectation."""
+    df = pd.DataFrame(data_stream_edge_tol)
+
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_hybrid_layout_kwargs():
+    """``layout_kwargs`` (detect_vertical=False) is honoured by hybrid."""
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="hybrid", layout_kwargs={"detect_vertical": False}
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
 def test_lattice():
    df = pd.DataFrame(data_lattice)

--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@ -55,6 +55,16 @@ def test_stream_grid_plot():
    return camelot.plot(tables[0], kind='grid')


+@pytest.mark.skipif(LEGACY_MATPLOTLIB,
+                    reason="depends on a recent version of MatPlotLib")
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True)
+def test_hybrid_grid_plot():
+    """Grid plot of the first hybrid table in foo.pdf, image-compared."""
+    filename = os.path.join(testdir, "foo.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    return camelot.plot(tables[0], kind='grid')
+
+
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
 def test_lattice_contour_plot():
@ -73,6 +83,16 @@ def test_stream_contour_plot():
    return camelot.plot(tables[0], kind='contour')


+@pytest.mark.skipif(LEGACY_MATPLOTLIB,
+                    reason="depends on a recent version of MatPlotLib")
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True)
+def test_hybrid_contour_plot():
+    """Contour plot of the first hybrid table in 12s0324.pdf."""
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor='hybrid')
+    return camelot.plot(tables[0], kind='contour')
+
+
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
                    reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
@ -97,7 +117,17 @@ def test_joint_plot():
                    reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
-def test_textedge_plot():
+def test_stream_textedge_plot():
    filename = os.path.join(testdir, "tabula/12s0324.pdf")
    tables = camelot.read_pdf(filename, flavor='stream')
    return camelot.plot(tables[0], kind='textedge')
+
+
+@pytest.mark.skipif(LEGACY_MATPLOTLIB,
+                    reason="depends on a recent version of MatPlotLib")
+@pytest.mark.mpl_image_compare(
+    baseline_dir="files/baseline_plots", remove_text=True)
+def test_hybrid_textedge_plot():
+    """Textedge plot of the first hybrid table in 12s0324.pdf."""
+    filename = os.path.join(testdir, "tabula/12s0324.pdf")
+    tables = camelot.read_pdf(filename, flavor='hybrid')
+    return camelot.plot(tables[0], kind='textedge')