Rename WIP parser to "network"; actual Hybrid to come

2020-05-02 16:14:03 -07:00 · 2020-05-02 16:14:03 -07:00 · 6711f877bf
parent c7ab3a4c32
commit 6711f877bf
18 changed files with 120 additions and 118 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -313,7 +313,7 @@ def stream(c, *args, **kwargs):
        tables.export(output, f=f, compress=compress)


-@cli.command("hybrid")
+@cli.command("network")
@click.option(
    "-R",
    "--table_regions",
@ -365,7 +365,7 @@ def stream(c, *args, **kwargs):
 )
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
-def hybrid(c, *args, **kwargs):
+def network(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    conf = c.config
    pages = conf.pop("pages")
@ -396,7 +396,7 @@ def hybrid(c, *args, **kwargs):
                "Please specify output file format using --format")

    tables = read_pdf(
-        filepath, pages=pages, flavor="hybrid", suppress_stdout=quiet, **kwargs
+        filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
    )
    click.echo("Found {} tables".format(tables.n))
    if plot_type is not None:
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -7,7 +7,7 @@ import logging
 from PyPDF2 import PdfFileReader, PdfFileWriter

 from .core import TableList
-from .parsers import Stream, Lattice, Hybrid
+from .parsers import Stream, Lattice, Network
 from .utils import (
    build_file_path_in_temp_dir,
    get_page_layout,
@ -22,7 +22,7 @@ logger = logging.getLogger("camelot")
 PARSERS = {
    "lattice": Lattice,
    "stream": Stream,
-    "hybrid": Hybrid,
+    "network": Network,
 }


@ -177,7 +177,7 @@ class PDFHandler():
        Parameters
        ----------
        flavor : str (default: 'lattice')
-            The parsing method to use ('lattice', 'stream', or 'hybrid').
+            The parsing method to use ('lattice', 'stream', or 'network').
            Lattice is used by default.
        suppress_stdout : str (default: False)
            Suppress logs and warnings.
--- a/camelot/io.py
+++ b/camelot/io.py
@ -99,10 +99,10 @@ def read_pdf(

    """
    layout_kwargs = layout_kwargs or {}
-    if flavor not in ["lattice", "stream", "hybrid"]:
+    if flavor not in ["lattice", "stream", "network"]:
        raise NotImplementedError(
            "Unknown flavor specified."
-            " Use either 'lattice', 'stream', or 'hybrid'"
+            " Use either 'lattice', 'stream', or 'network'"
        )

    with warnings.catch_warnings():
--- a/camelot/parsers/__init__.py
+++ b/camelot/parsers/__init__.py
@ -2,4 +2,4 @@

 from .stream import Stream
 from .lattice import Lattice
-from .hybrid import Hybrid
+from .network import Network
--- a/camelot/parsers/network.py
+++ b/camelot/parsers/network.py
@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Implementation of hybrid table parser."""
+"""Implementation of network table parser."""

 from __future__ import division

@ -391,7 +391,7 @@ class TextNetworks(TextAlignments):
        return gaps_hv

    def search_table_body(self, gaps_hv, parse_details=None):
-        """ Build a candidate bbox for the body of a table using hybrid algo
+        """ Build a candidate bbox for the body of a table using network algo

        Seed the process with the textline with the highest alignment
        score, then expand the bbox with textlines within threshold.
@ -496,7 +496,7 @@ class TextNetworks(TextAlignments):
                    if textline in new_tls:
                        del tls_search_space[i]

-        if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
+        if len(tls_in_bbox) >= MINIMUM_TEXTLINES_IN_TABLE:
            return bbox
        return None

@ -508,8 +508,8 @@ class TextNetworks(TextAlignments):
        self._compute_alignment_counts()


-class Hybrid(TextBaseParser):
-    """Hybrid method of parsing looks for spaces between text
+class Network(TextBaseParser):
+    """Network method of parsing looks for spaces between text
    to parse the table.

    If you want to specify columns when specifying multiple table
@ -561,7 +561,7 @@ class Hybrid(TextBaseParser):
            debug=False,
            **kwargs):
        super().__init__(
-            "hybrid",
+            "network",
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -203,7 +203,7 @@ class PlotMethods():
            raise NotImplementedError(
                "Lattice flavor does not support kind='{}'".format(kind)
            )
-        if table.flavor in ["stream", "hybrid"] and kind in ["line"]:
+        if table.flavor in ["stream", "network"] and kind in ["line"]:
            raise NotImplementedError(
                "Stream flavor does not support kind='{}'".format(kind)
            )
@ -313,7 +313,7 @@ class PlotMethods():
        ax = prepare_plot(table, ax)
        draw_text(table, ax)

-        if table.flavor == "hybrid":
+        if table.flavor == "network":
            for network in table.parse_details["network_searches"]:
                most_connected_tl = network.most_connected_textline()

@ -428,8 +428,8 @@ class PlotMethods():
        return ax.get_figure()

    @staticmethod
-    def hybrid_table_search(table, ax=None):
-        """Generates a plot illustrating the steps of the hybrid table search.
+    def network_table_search(table, ax=None):
+        """Generates a plot illustrating the steps of the network table search.

        Parameters
        ----------
--- a/notebook-hybrid-parser.ipynb
+++ b/notebook-hybrid-parser.ipynb
--- a/parser-comparison-notebook.ipynb
+++ b/parser-comparison-notebook.ipynb
--- a/tests/data.py
+++ b/tests/data.py
@ -824,8 +824,8 @@ data_stream_table_rotated = [
 ]

 # The streaming algorithm incorrectly includes a header in the result.
-# Trimming the table for the test of hybrid, which doesn't include it.
-data_hybrid_table_rotated = data_stream_table_rotated[1:]
+# Trimming the table for the test of network, which doesn't include it.
+data_network_table_rotated = data_stream_table_rotated[1:]

 data_stream_two_tables_1 = [
    [
@ -1298,8 +1298,8 @@ data_stream_two_tables_1 = [
 ]

 # The streaming algorithm incorrectly includes a header and a footer.
-# Trimming the table for the test of hybrid, which doesn't include it.
-data_hybrid_two_tables_1 = data_stream_two_tables_1[3:-1]
+# Trimming the table for the test of network, which doesn't include it.
+data_network_two_tables_1 = data_stream_two_tables_1[3:-1]

 data_stream_two_tables_2 = [
    ["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
@ -1609,7 +1609,7 @@ data_stream_two_tables_2 = [
    ["1 Except forcible rape and prostitution.", "", "", "", "", ""],
 ]

-data_hybrid_two_tables_b_1 = [
+data_network_two_tables_b_1 = [
    ["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
    ["Vgvhgh", "Hj", "Hj", "Hj"],
    ["Hj", "Hj", "Hj", "Hj"],
@ -1619,17 +1619,17 @@ data_hybrid_two_tables_b_1 = [
    ["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
 ]

-data_hybrid_two_tables_b_2 = [
+data_network_two_tables_b_2 = [
    ["Trtrt", "H", "Gh"],
    ["Gh", "V", "Hv"],
    ["Hv", "Bhjb", "hg"],
 ]

 # The streaming algorithm incorrectly includes a header and a footer.
-# Trimming the table for the test of hybrid, which doesn't include it.
-data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
+# Trimming the table for the test of network, which doesn't include it.
+data_network_two_tables_2 = data_stream_two_tables_2[3:-1]

-data_hybrid_vertical_headers = [
+data_network_vertical_headers = [
    [
        "",
        "",
@ -2090,8 +2090,8 @@ data_stream_table_areas = [
    ["(each day of the payroll period)", ""],
 ]

-# Hybrid doesn't recognize the footer as belonging to the table.
-data_hybrid_table_regions = data_stream_table_areas[:-1]
+# Network doesn't recognize the footer as belonging to the table.
+data_network_table_regions = data_stream_table_areas[:-1]

 data_stream_columns = [
    [
@ -2613,9 +2613,9 @@ data_stream_split_text = [
 ]

 # The stream algorithm excludes the string "Alphabetic Listing by type"
-data_hybrid_split_text = []
-data_hybrid_split_text.extend(data_stream_split_text)
-data_hybrid_split_text[0] = [
+data_network_split_text = []
+data_network_split_text.extend(data_stream_split_text)
+data_network_split_text[0] = [
    'FEB', 'RUAR', 'Y 2014 M27 (BUS)', '',
    'ALPHABETIC LISTING BY T', 'YPE', '', '', '', 'ABLPDM27'
 ]
@ -2851,15 +2851,15 @@ data_stream_flag_size = [
    ],
 ]

-# Hybrid adds more content into the header.
-data_hybrid_flag_size = [
+# Network adds more content into the header.
+data_network_flag_size = [
    ['', '', '', '', '(As at end-March)', '', '', '', '', '', ''],
    ['', '', '', '', '', '', '', '', '', '', '(` Billion)']
 ]
-data_hybrid_flag_size.extend(data_stream_flag_size)
+data_network_flag_size.extend(data_stream_flag_size)


-data_hybrid_strip_text = [
+data_network_strip_text = [
    ["VinsauVerre", ""],
    ["LesBlancs", "12.5CL"],
    ["A.O.PCôtesduRhône", ""],
@ -2900,7 +2900,7 @@ data_hybrid_strip_text = [
 ]

 # Stream only detects part of the table
-data_stream_strip_text = data_hybrid_strip_text[0:-13]
+data_stream_strip_text = data_network_strip_text[0:-13]

 data_stream_edge_tol = [
    ["Key figures", ""],
@ -2940,9 +2940,9 @@ data_stream_edge_tol = [
    ["period.", ""],
 ]

-# The stream algorithm ends up including a footer, which hybrid correctly
+# The stream algorithm ends up including a footer, which network correctly
 # skips.
-data_hybrid_edge_tol = data_stream_edge_tol[:-3]
+data_network_edge_tol = data_stream_edge_tol[:-3]

 data_lattice = [
    [
--- a/tests/files/baseline_plots/test_network_contour_plot.png
+++ b/tests/files/baseline_plots/test_network_contour_plot.png
--- a/tests/files/baseline_plots/test_network_grid_plot.png
+++ b/tests/files/baseline_plots/test_network_grid_plot.png
--- a/tests/files/baseline_plots/test_network_table_areas_text_plot.png
+++ b/tests/files/baseline_plots/test_network_table_areas_text_plot.png
--- a/tests/files/baseline_plots/test_network_table_regions_textedge_plot.png
+++ b/tests/files/baseline_plots/test_network_table_regions_textedge_plot.png
--- a/tests/files/baseline_plots/test_network_textedge_plot.png
+++ b/tests/files/baseline_plots/test_network_textedge_plot.png
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -72,22 +72,22 @@ def test_cli_stream():
        assert format_error in result.output


-def test_cli_hybrid():
+def test_cli_network():
    with TemporaryDirectory() as tempdir:
        infile = os.path.join(testdir, "budget.pdf")
        outfile = os.path.join(tempdir, "budget.csv")
        runner = CliRunner()
        result = runner.invoke(
-            cli, ["--format", "csv", "--output", outfile, "hybrid", infile]
+            cli, ["--format", "csv", "--output", outfile, "network", infile]
        )
        assert result.exit_code == 0
        assert result.output == "Found 1 tables\n"

-        result = runner.invoke(cli, ["--format", "csv", "hybrid", infile])
+        result = runner.invoke(cli, ["--format", "csv", "network", infile])
        output_error = "Error: Please specify output file path using --output"
        assert output_error in result.output

-        result = runner.invoke(cli, ["--output", outfile, "hybrid", infile])
+        result = runner.invoke(cli, ["--output", outfile, "network", infile])
        format_error = "Please specify output file format using --format"
        assert format_error in result.output

--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -150,32 +150,32 @@ def test_stream_layout_kwargs():
    assert_frame_equal(df, tables[0].df)


-def test_hybrid():
+def test_network():
    df = pd.DataFrame(data_stream)

    filename = os.path.join(testdir, "health.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid")
+    tables = camelot.read_pdf(filename, flavor="network")
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_table_rotated():
-    df = pd.DataFrame(data_hybrid_table_rotated)
+def test_network_table_rotated():
+    df = pd.DataFrame(data_network_table_rotated)

    filename = os.path.join(testdir, "clockwise_table_2.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid")
+    tables = camelot.read_pdf(filename, flavor="network")
    assert_frame_equal(df, tables[0].df)

    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid")
+    tables = camelot.read_pdf(filename, flavor="network")
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_two_tables_a():
-    df1 = pd.DataFrame(data_hybrid_two_tables_1)
-    df2 = pd.DataFrame(data_hybrid_two_tables_2)
+def test_network_two_tables_a():
+    df1 = pd.DataFrame(data_network_two_tables_1)
+    df2 = pd.DataFrame(data_network_two_tables_2)

    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid")
+    tables = camelot.read_pdf(filename, flavor="network")

    assert len(tables) == 2
    assert df1.equals(tables[0].df)
@ -183,104 +183,104 @@ def test_hybrid_two_tables_a():


 # Reported as https://github.com/camelot-dev/camelot/issues/132
-def test_hybrid_two_tables_b():
-    df1 = pd.DataFrame(data_hybrid_two_tables_b_1)
-    df2 = pd.DataFrame(data_hybrid_two_tables_b_2)
+def test_network_two_tables_b():
+    df1 = pd.DataFrame(data_network_two_tables_b_1)
+    df2 = pd.DataFrame(data_network_two_tables_b_2)

    filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid")
+    tables = camelot.read_pdf(filename, flavor="network")

    assert len(tables) == 2
    assert df1.equals(tables[0].df)
    assert df2.equals(tables[1].df)


-def test_hybrid_vertical_header():
+def test_network_vertical_header():
    """Tests a complex table with a vertically text header.
    """
-    df = pd.DataFrame(data_hybrid_vertical_headers)
+    df = pd.DataFrame(data_network_vertical_headers)

    filename = os.path.join(testdir, "vertical_header.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid")
+    tables = camelot.read_pdf(filename, flavor="network")
    assert len(tables) == 1
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_table_regions():
-    df = pd.DataFrame(data_hybrid_table_regions)
+def test_network_table_regions():
+    df = pd.DataFrame(data_network_table_regions)

    filename = os.path.join(testdir, "tabula/us-007.pdf")
    # The "stream" test looks for a region in ["320,460,573,335"], which
    # should exclude the header.
    tables = camelot.read_pdf(
-        filename, flavor="hybrid", table_regions=["320,335,573,505"]
+        filename, flavor="network", table_regions=["320,335,573,505"]
    )
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_table_areas():
+def test_network_table_areas():
    df = pd.DataFrame(data_stream_table_areas)

    filename = os.path.join(testdir, "tabula/us-007.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="hybrid", table_areas=["320,500,573,335"]
+        filename, flavor="network", table_areas=["320,500,573,335"]
    )
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_columns():
+def test_network_columns():
    df = pd.DataFrame(data_stream_columns)

    filename = os.path.join(testdir, "mexican_towns.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="hybrid", columns=["67,180,230,425,475"], row_tol=10
+        filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
    )
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_split_text():
-    df = pd.DataFrame(data_hybrid_split_text)
+def test_network_split_text():
+    df = pd.DataFrame(data_network_split_text)

    filename = os.path.join(testdir, "tabula/m27.pdf")
    tables = camelot.read_pdf(
        filename,
-        flavor="hybrid",
+        flavor="network",
        columns=["72,95,209,327,442,529,566,606,683"],
        split_text=True,
    )
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_flag_size():
-    df = pd.DataFrame(data_hybrid_flag_size)
+def test_network_flag_size():
+    df = pd.DataFrame(data_network_flag_size)

    filename = os.path.join(testdir, "superscript.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid", flag_size=True)
+    tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_strip_text():
-    df = pd.DataFrame(data_hybrid_strip_text)
+def test_network_strip_text():
+    df = pd.DataFrame(data_network_strip_text)

    filename = os.path.join(testdir, "detect_vertical_false.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid", strip_text=" ,\n")
+    tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_edge_tol():
-    df = pd.DataFrame(data_hybrid_edge_tol)
+def test_network_edge_tol():
+    df = pd.DataFrame(data_network_edge_tol)

    filename = os.path.join(testdir, "edge_tol.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)
+    tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
    assert_frame_equal(df, tables[0].df)


-def test_hybrid_layout_kwargs():
+def test_network_layout_kwargs():
    df = pd.DataFrame(data_stream_layout_kwargs)

    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="hybrid", layout_kwargs={"detect_vertical": False}
+        filename, flavor="network", layout_kwargs={"detect_vertical": False}
    )
    assert_frame_equal(df, tables[0].df)

--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@ -15,7 +15,7 @@ filename = os.path.join(testdir, 'foo.pdf')

 def test_unknown_flavor():
    message = ("Unknown flavor specified."
-               " Use either 'lattice', 'stream', or 'hybrid'")
+               " Use either 'lattice', 'stream', or 'network'")
    with pytest.raises(NotImplementedError, match=message):
        camelot.read_pdf(filename, flavor='chocolate')

--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@ -62,9 +62,9 @@ def test_stream_grid_plot():

@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
-def test_hybrid_grid_plot():
+def test_network_grid_plot():
    filename = os.path.join(testdir, "foo.pdf")
-    tables = camelot.read_pdf(filename, flavor="hybrid")
+    tables = camelot.read_pdf(filename, flavor="network")
    return unit_test_stable_plot(tables[0], 'grid')


@ -86,9 +86,9 @@ def test_stream_contour_plot():

@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
-def test_hybrid_contour_plot():
+def test_network_contour_plot():
    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor='hybrid')
+    tables = camelot.read_pdf(filename, flavor='network')
    return unit_test_stable_plot(tables[0], 'contour')


@ -118,18 +118,18 @@ def test_stream_textedge_plot():

@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
-def test_hybrid_textedge_plot():
+def test_network_textedge_plot():
    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
+    tables = camelot.read_pdf(filename, debug=True, flavor='network')
    return unit_test_stable_plot(tables[0], 'textedge')


@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
-def test_hybrid_table_regions_textedge_plot():
+def test_network_table_regions_textedge_plot():
    filename = os.path.join(testdir, "tabula/us-007.pdf")
    tables = camelot.read_pdf(
-        filename, debug=True, flavor="hybrid",
+        filename, debug=True, flavor="network",
        table_regions=["320,505,573,330"]
    )
    return unit_test_stable_plot(tables[0], 'textedge')
@ -137,10 +137,10 @@ def test_hybrid_table_regions_textedge_plot():

@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
-def test_hybrid_table_areas_text_plot():
+def test_network_table_areas_text_plot():
    filename = os.path.join(testdir, "tabula/us-007.pdf")
    tables = camelot.read_pdf(
-        filename, debug=True, flavor="hybrid",
+        filename, debug=True, flavor="network",
        table_areas=["320,500,573,335"]
    )
    return unit_test_stable_plot(tables[0], 'text')