Fix issues following a pass across most test cases

* Clean up the parser comparison notebook
* Address issue where hybrid didn't honor the columns parameter
* Fix dropping of empty rows/columns in hybrid
* Make hybrid learn table y-dimensions from lattice
2020-06-16 13:04:53 -07:00 · 2020-06-16 13:04:53 -07:00 · 71805f9333
parent 9c971a18f0
commit 71805f9333
5 changed files with 110 additions and 129 deletions
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -5,6 +5,7 @@ from ..utils import (
    boundaries_to_split_lines,
 )

+import numpy as np
 from .base import BaseParser
 from .network import Network
 from .lattice import Lattice
@ -67,6 +68,7 @@ class Hybrid(BaseParser):
            strip_text=strip_text,
            debug=debug,
        )
+        self.columns = columns  # Columns settings impacts the hybrid table
        self.network_parser = Network(
            table_regions=table_regions,
            table_areas=table_areas,
@ -109,9 +111,11 @@ class Hybrid(BaseParser):
        table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
        # Because hybrid can inject extraneous splits from both lattice and
        # network, remove lines / cols that are completely empty.
-        df = table.df
-        df[df.astype(bool)].dropna(axis=0, how="all", inplace=True)
-        df[df.astype(bool)].dropna(axis=1, how="all", inplace=True)
+        table.df = table.df.replace('', np.nan)
+        table.df = table.df.dropna(axis=0, how="all")
+        table.df = table.df.dropna(axis=1, how="all")
+        table.df = table.df.replace(np.nan, '')
+        table.shape = table.df.shape
        return table

    @staticmethod
@ -172,13 +176,12 @@ class Hybrid(BaseParser):
        """ Identify splits that were only detected by lattice or by network
        """
        lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
-        lattice_cols, lattice_rows = \
-            lattice_parse["col_anchors"], lattice_parse["row_anchors"]
+        lattice_cols = lattice_parse["col_anchors"]

        network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
        network_cols_boundaries = network_bbox_data["cols_boundaries"]

-        # Favor hybrid, but complete or adjust its columns based on the
+        # Favor network, but complete or adjust its columns based on the
        # splits identified by lattice.
        if network_cols_boundaries is None:
            self.table_bbox_parses[lattice_bbox] = self.lattice_parser
@ -188,8 +191,10 @@ class Hybrid(BaseParser):
                lattice_cols,
                self.lattice_parser.joint_tol)
            augmented_bbox = (
-                network_cols_boundaries[0][0], network_bbox[1],
-                network_cols_boundaries[-1][1], network_bbox[3],
+                network_cols_boundaries[0][0],
+                min(lattice_bbox[1], network_bbox[1]),
+                network_cols_boundaries[-1][1],
+                max(lattice_bbox[3], network_bbox[3]),
            )
            network_bbox_data["cols_anchors"] = \
                boundaries_to_split_lines(network_cols_boundaries)
--- a/parser-comparison-notebook.ipynb
+++ b/parser-comparison-notebook.ipynb
--- a/tests/data.py
+++ b/tests/data.py
@ -1,7 +1,11 @@
 # -*- coding: utf-8 -*-


-data_stream = [
+data_hybrid = [
+    [
+        "", "Table: 5            Public Health Outlay 2012-13 (Budget"
+        " Estimates)        (Rs. in 000)", "", "", "", "", "", ""
+    ],
    ["States-A", "Revenue", "", "Capital", "", "Total", "Others(1)", "Total"],
    ["", "", "", "", "", "Revenue &", "", ""],
    ["", "Medical &", "Family", "Medical &", "Family", "", "", ""],
@ -224,6 +228,10 @@ data_stream = [
    ],
 ]

+# Hybrid includes the header because the boundaries of the table include it,
+# but stream/network don't include it.
+data_stream = data_hybrid[1:]
+
 data_stream_table_rotated = [
    [
        "Table 21  Current use of contraception by background characteristics"
@ -2074,6 +2082,11 @@ data_network_vertical_headers = [

 # Compared to network, hybrid detects additional sparse columns
 data_hybrid_vertical_headers = [
+    [
+        "", "", "", "", "", "STATE", "", "", "", "CONGRESSIONAL", "", "",
+        "", "", "LEGISLATIVE", "", "", "COUNTY", "", "COUNTY", "", "",
+        "County Commissioner", "", "", "", ""
+    ],
    [
        "",
        "",
--- a/tests/files/PIR_Prospetto.dOfferta.pdf
+++ b/tests/files/PIR_Prospetto.dOfferta.pdf
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -287,7 +287,7 @@ def test_network_layout_kwargs():

 # Hybrid parser
 def test_hybrid():
-    df = pd.DataFrame(data_stream)
+    df = pd.DataFrame(data_hybrid)

    filename = os.path.join(testdir, "health.pdf")
    tables = camelot.read_pdf(filename, flavor="hybrid")
@ -324,6 +324,19 @@ def test_hybrid_process_background():
    assert_frame_equal(df, tables[1].df)


+def test_hybrid_split_text():
+    df = pd.DataFrame(data_network_split_text)
+
+    filename = os.path.join(testdir, "tabula/m27.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="hybrid",
+        columns=["72,95,209,327,442,529,566,606,683"],
+        split_text=True,
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
 # Lattice parser tests
 def test_lattice():
    df = pd.DataFrame(data_lattice)