Improve column detection for hybrid flavor

No longer rely on the mode but on the parsing analysis during network detection. Added unit test for complex table with vertical header and mixed horizontal / vertical text.
2020-04-29 11:46:40 -07:00 · 2020-04-29 11:46:40 -07:00 · ada4809a59
parent e31e978ebe
commit ada4809a59
12 changed files with 666 additions and 135 deletions
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -6,7 +6,6 @@ from __future__ import division
 import copy
 import math
 import numpy as np
-import warnings

 from .base import TextBaseParser
 from ..core import (
@ -18,6 +17,7 @@ from ..core import (
 from ..utils import (
    bbox_from_str,
    text_in_bbox,
+    textlines_overlapping_bbox,
    bbox_from_textlines,
    find_columns_coordinates,
    text_in_bbox_per_axis,
@ -321,11 +321,17 @@ class TextNetworks(TextAlignments):
        horizontal axis.

        """
-        # Find the textline with the highest alignment score
+        # Find the textline with the highest alignment score, with a tie break
+        # to prefer textlines further down in the table.  Starting the search
+        # from the table's bottom allows the algo to collect data on more cells
+        # before going to the header, typically harder to parse.
        return max(
            self._textline_to_alignments.keys(),
            key=lambda textline:
-            self._textline_to_alignments[textline].alignment_score(),
+            (
+                self._textline_to_alignments[textline].alignment_score(),
+                -textline.y0
+            ),
            default=None
        )

@ -566,12 +572,13 @@ class Hybrid(TextBaseParser):
        )

    def _generate_table_bbox(self):
+        user_provided_bboxes = None
        if self.table_areas is not None:
-            table_bbox = {}
+            # User gave us table areas already.  We will use their coordinates
+            # to find column anchors.
+            user_provided_bboxes = []
            for area_str in self.table_areas:
-                table_bbox[bbox_from_str(area_str)] = None
-            self.table_bbox = table_bbox
-            return
+                user_provided_bboxes.append(bbox_from_str(area_str))

        # Take all the textlines that are not just spaces
        all_textlines = [
@ -593,59 +600,73 @@ class Hybrid(TextBaseParser):
            parse_details_bbox_searches = None

        while True:
-            text_network = TextNetworks()
-            text_network.generate(textlines)
-            text_network._remove_unconnected_edges()
-            gaps_hv = text_network._compute_plausible_gaps()
-            if gaps_hv is None:
-                return None
-            # edge_tol instructions override the calculated vertical gap
-            edge_tol_hv = (
-                gaps_hv[0],
-                gaps_hv[1] if self.edge_tol is None else self.edge_tol
-            )
-            bbox = text_network._build_bbox_candidate(
-                edge_tol_hv,
-                parse_details=parse_details_bbox_searches
-            )
-            if bbox is None:
-                break
+            # Find a bbox: either pulling from the user's or from the network
+            # algorithm.

-            if parse_details_network_searches is not None:
-                # Preserve the current edge calculation for display debugging
-                parse_details_network_searches.append(
-                    copy.deepcopy(text_network)
+            # First look for the body of the table
+            bbox_body = None
+            if user_provided_bboxes is not None:
+                if len(user_provided_bboxes) > 0:
+                    bbox_body = user_provided_bboxes.pop()
+            else:
+                text_network = TextNetworks()
+                text_network.generate(textlines)
+                text_network._remove_unconnected_edges()
+                gaps_hv = text_network._compute_plausible_gaps()
+                if gaps_hv is None:
+                    return None
+                # edge_tol instructions override the calculated vertical gap
+                edge_tol_hv = (
+                    gaps_hv[0],
+                    gaps_hv[1] if self.edge_tol is None else self.edge_tol
+                )
+                bbox_body = text_network._build_bbox_candidate(
+                    edge_tol_hv,
+                    parse_details=parse_details_bbox_searches
                )

-            # Get all the textlines that are at least 50% in the box
-            tls_in_bbox = text_in_bbox(bbox, textlines)
+                if parse_details_network_searches is not None:
+                    # Preserve the current edge calculation for debugging
+                    parse_details_network_searches.append(
+                        copy.deepcopy(text_network)
+                    )

-            # and expand the text box to fully contain them
-            bbox = bbox_from_textlines(tls_in_bbox)
+            if bbox_body is None:
+                break

-            # FRH: do we need to repeat this?
-            # tls_in_bbox = text_in_bbox(bbox, textlines)
+            # Get all the textlines that overlap with the box, compute
+            # columns
+            tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
            cols_anchors = find_columns_coordinates(tls_in_bbox)

-            # Apply a heuristic to salvage headers which formatting might be
-            # off compared to the rest of the table.
-            expanded_bbox = search_header_from_body_bbox(
-                bbox,
-                textlines,
-                cols_anchors,
-                gaps_hv[1]
-            )
+            # Unless the user gave us a strict bbox_body, try to find a header
+            # above the body to build the full bbox.
+            if user_provided_bboxes is not None:
+                bbox_full = bbox_body
+            else:
+                # Expand the text box to fully contain the tls we found
+                bbox_body = bbox_from_textlines(tls_in_bbox)
+
+                # Apply a heuristic to salvage headers whose formatting might
+                # be off compared to the rest of the table.
+                bbox_full = search_header_from_body_bbox(
+                    bbox_body,
+                    textlines,
+                    cols_anchors,
+                    gaps_hv[1]
+                )
+
+            table_parse = {
+                "bbox_body": bbox_body,
+                "cols_anchors": cols_anchors,
+                "bbox_full": bbox_full
+            }
+            self.table_bbox[bbox_full] = table_parse

            if self.parse_details is not None:
                if "col_searches" not in self.parse_details:
                    self.parse_details["col_searches"] = []
-                self.parse_details["col_searches"].append({
-                    "core_bbox": bbox,
-                    "cols_anchors": cols_anchors,
-                    "expanded_bbox": expanded_bbox
-                })
-
-            self.table_bbox[expanded_bbox] = None
+                self.parse_details["col_searches"].append(table_parse)

            # Remember what textlines we processed, and repeat
            for tl in tls_in_bbox:
@ -682,7 +703,6 @@ class Hybrid(TextBaseParser):
        # the alignment identification work we've done earlier.
        rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
-        elements = [len(r) for r in rows_grouped]

        if self.columns is not None and self.columns[table_idx] != "":
            # user has to input boundary columns too
@ -695,53 +715,11 @@ class Hybrid(TextBaseParser):
            cols.append(text_x_max)
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        else:
-            # calculate mode of the list of number of elements in
-            # each row to guess the number of columns
-            ncols = max(set(elements), key=elements.count)
-            if ncols == 1:
-                # if mode is 1, the page usually contains not tables
-                # but there can be cases where the list can be skewed,
-                # try to remove all 1s from list in this case and
-                # see if the list contains elements, if yes, then use
-                # the mode after removing 1s
-                elements = list(filter(lambda x: x != 1, elements))
-                if elements:
-                    ncols = max(set(elements), key=elements.count)
-                else:
-                    warnings.warn(
-                        "No tables found in table area {}"
-                        .format(table_idx + 1)
-                    )
-            cols = [
-                (t.x0, t.x1)
-                for r in rows_grouped
-                if len(r) == ncols
-                for t in r
-            ]
-            cols = self._merge_columns(
-                sorted(cols),
-                column_tol=self.column_tol
-            )
-            inner_text = []
-            for i in range(1, len(cols)):
-                left = cols[i - 1][1]
-                right = cols[i][0]
-                inner_text.extend(
-                    [
-                        t
-                        for direction in self.t_bbox
-                        for t in self.t_bbox[direction]
-                        if t.x0 > left and t.x1 < right
-                    ]
-                )
-            outer_text = [
-                t
-                for direction in self.t_bbox
-                for t in self.t_bbox[direction]
-                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
-            ]
-            inner_text.extend(outer_text)
-            cols = self._add_columns(cols, inner_text, self.row_tol)
-            cols = self._join_columns(cols, text_x_min, text_x_max)
+            parse_details = self.table_bbox[bbox]
+            col_anchors = parse_details["cols_anchors"]
+            cols = list(map(
+                lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
+                range(0, len(col_anchors) - 1)
+            ))

        return cols, rows, None, None
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -472,7 +472,7 @@ class PlotMethods():

        for box_id, col_search in enumerate(parse_details["col_searches"]):
            draw_labeled_bbox(
-                ax, col_search["expanded_bbox"],
+                ax, col_search["bbox_full"],
                "box body + header #{box_id}".format(
                    box_id=box_id
                ),
@ -481,7 +481,7 @@ class PlotMethods():
                label_pos="top,left"
            )
            draw_labeled_bbox(
-                ax, col_search["core_bbox"],
+                ax, col_search["bbox_body"],
                "box body #{box_id}".format(
                    box_id=box_id
                ),
@ -495,8 +495,8 @@ class PlotMethods():
                ax.plot(
                    [col_anchor, col_anchor],
                    [
-                        col_search["core_bbox"][1] - 10,
-                        col_search["core_bbox"][3] + 10,
+                        col_search["bbox_body"][1] - 10,
+                        col_search["bbox_body"][3] + 10,
                    ],
                    color="green"
                )
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -431,8 +431,36 @@ def bbox_from_str(bbox_str):
    )


+def textlines_overlapping_bbox(bbox, textlines):
+    """Returns all text objects which overlap or are within a bounding box.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : list
+        List of PDFMiner text objects.
+
+    """
+    (left, bottom, right, top) = bbox
+    t_bbox = [
+        t
+        for t in textlines
+        if ((left < t.x0 < right) or (left < t.x1 < right))
+        and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
+    ]
+    return t_bbox
+
+
 def text_in_bbox(bbox, text):
-    """Returns all text objects which lie at least 50% inside a bounding box.
+    """Returns all text objects which lie at least 50% inside a bounding box
+    across both dimensions.

    Parameters
    ----------
--- a/notebook-hybrid-parser.ipynb
+++ b/notebook-hybrid-parser.ipynb
--- a/parser-comparison-notebook.ipynb
+++ b/parser-comparison-notebook.ipynb
--- a/tests/data.py
+++ b/tests/data.py
@ -1629,6 +1629,453 @@ data_hybrid_two_tables_b_2 = [
 # Trimming the table for the test of hybrid, which doesn't include it.
 data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]

+data_hybrid_vertical_headers = [
+    [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "Congress-",
+        "Senator 36th",
+        "Rep106th",
+        "",
+        "Reg. of",
+        "Road",
+        "",
+        "",
+        "",
+        "Distri",
+        "Dist",
+        "",
+        "",
+    ],
+    [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "1st Dist",
+        "Dist.",
+        "Dist.",
+        "",
+        "Deeds",
+        "",
+        "Commission",
+        "",
+        "District #1",
+        "ct #2",
+        "#3",
+        "",
+        "Dist #4",
+    ],
+    [
+        "",
+        "",
+        "",
+        "",
+        "",
+        "Governor",
+        "",
+        "",
+        "U.S. Senator",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "",
+        "Number of Registered voters",
+        "Poll Book Totals",
+        "Brian Calley",
+        "Patrick Colbeck",
+        "Jim Hines",
+        "Bill Schuette",
+        "John James",
+        "Sandy Pensler",
+        "",
+        "Jack Bergman",
+        "",
+        "Jim Stamas",
+        "Sue Allor",
+        "Melissa A. Cordes",
+        "",
+        "Al Scully",
+        "",
+        "Daniel G. Gauthier",
+        "Craig M. Clemens",
+        "Craig Johnston",
+        "Carolyn Brummund",
+        "Adam Brege",
+        "David Bielusiak",
+    ],
+    [
+        "Alcona",
+        "963",
+        "439",
+        "55",
+        "26",
+        "47",
+        "164",
+        "173",
+        "111",
+        "",
+        "268",
+        "",
+        "272",
+        "275",
+        "269",
+        "",
+        "271",
+        "",
+        "224",
+        "76",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Caledonia",
+        "923",
+        "393",
+        "40",
+        "23",
+        "45",
+        "158",
+        "150",
+        "103",
+        "",
+        "244",
+        "",
+        "247",
+        "254",
+        "255",
+        "",
+        "244",
+        "",
+        "139",
+        "143",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Curtis",
+        "1026",
+        "349",
+        "30",
+        "30",
+        "25",
+        "102",
+        "95",
+        "84",
+        "",
+        "159",
+        "",
+        "164",
+        "162",
+        "161",
+        "",
+        "157",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Greenbush",
+        "1212",
+        "423",
+        "56",
+        "26",
+        "40",
+        "126",
+        "104",
+        "131",
+        "",
+        "208",
+        "",
+        "213",
+        "214",
+        "215",
+        "",
+        "208",
+        "",
+        "",
+        "",
+        "",
+        "208",
+        "",
+        "",
+    ],
+    [
+        "Gustin",
+        "611",
+        "180",
+        "22",
+        "35",
+        "17",
+        "55",
+        "73",
+        "45",
+        "",
+        "108",
+        "",
+        "104",
+        "111",
+        "111",
+        "",
+        "109",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "81",
+        "42",
+    ],
+    [
+        "Harrisville",
+        "1142",
+        "430",
+        "45",
+        "90",
+        "29",
+        "101",
+        "155",
+        "94",
+        "",
+        "226",
+        "",
+        "226",
+        "232",
+        "244",
+        "",
+        "226",
+        "",
+        "",
+        "",
+        "232",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Hawes",
+        "884",
+        "293",
+        "38",
+        "36",
+        "27",
+        "109",
+        "121",
+        "84",
+        "",
+        "192",
+        "",
+        "195",
+        "195",
+        "193",
+        "",
+        "184",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "118",
+        "87",
+    ],
+    [
+        "Haynes",
+        "626",
+        "275",
+        "31",
+        "20",
+        "32",
+        "104",
+        "121",
+        "53",
+        "",
+        "163",
+        "",
+        "163",
+        "173",
+        "161",
+        "",
+        "152",
+        "",
+        "",
+        "",
+        "76",
+        "",
+        "69",
+        "31",
+    ],
+    [
+        "Mikado",
+        "781",
+        "208",
+        "19",
+        "39",
+        "17",
+        "81",
+        "90",
+        "63",
+        "",
+        "149",
+        "",
+        "149",
+        "145",
+        "147",
+        "",
+        "143",
+        "",
+        "",
+        "",
+        "",
+        "113",
+        "",
+        "",
+    ],
+    [
+        "Millen",
+        "353",
+        "139",
+        "7",
+        "16",
+        "13",
+        "38",
+        "49",
+        "19",
+        "",
+        "62",
+        "",
+        "66",
+        "67",
+        "66",
+        "",
+        "62",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Mitchell",
+        "327",
+        "96",
+        "12",
+        "17",
+        "7",
+        "29",
+        "41",
+        "17",
+        "",
+        "57",
+        "",
+        "55",
+        "57",
+        "60",
+        "",
+        "56",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+        "",
+    ],
+    [
+        "City Harrisville",
+        "389",
+        "171",
+        "16",
+        "15",
+        "18",
+        "35",
+        "49",
+        "31",
+        "",
+        "78",
+        "",
+        "80",
+        "82",
+        "81",
+        "",
+        "77",
+        "",
+        "",
+        "",
+        "73",
+        "",
+        "",
+        "",
+    ],
+    [
+        "Totals",
+        "9237",
+        "3396",
+        "371",
+        "373",
+        "317",
+        "1102",
+        "1221",
+        "835",
+        "0",
+        "1914",
+        "0",
+        "1934",
+        "1967",
+        "1963",
+        "0",
+        "1889",
+        "0",
+        "363",
+        "219",
+        "381",
+        "321",
+        "268",
+        "160",
+    ],
+]
+
+
+
 data_stream_table_areas = [
    ["", "One Withholding"],
    ["Payroll Period", "Allowance"],
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_grid_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_grid_plot.png
--- a/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/files/vertical_header.pdf
+++ b/tests/files/vertical_header.pdf
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -200,6 +200,17 @@ def test_hybrid_two_tables_b():
    assert df2.equals(tables[1].df)


+def test_hybrid_vertical_header():
+    """Tests a complex table with a vertical text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
 def test_hybrid_table_regions():
    df = pd.DataFrame(data_hybrid_table_regions)