Improve column detection for hybrid flavor

No longer rely on the mode of per-row element counts; instead, reuse the parsing analysis done during network detection. Adds a unit test for a complex table with a vertical header and mixed horizontal / vertical text.
@@ -6,7 +6,6 @@ from __future__ import division
 import copy
 import math
 import numpy as np
-import warnings

 from .base import TextBaseParser
 from ..core import (
@@ -18,6 +17,7 @@ from ..core import (
 from ..utils import (
     bbox_from_str,
     text_in_bbox,
+    textlines_overlapping_bbox,
     bbox_from_textlines,
     find_columns_coordinates,
     text_in_bbox_per_axis,
@@ -321,11 +321,17 @@ class TextNetworks(TextAlignments):
         horizontal axis.

         """
-        # Find the textline with the highest alignment score
+        # Find the textline with the highest alignment score, with a tie break
+        # to prefer textlines further down in the table. Starting the search
+        # from the table's bottom allows the algo to collect data on more cells
+        # before going to the header, typically harder to parse.
         return max(
             self._textline_to_alignments.keys(),
             key=lambda textline:
-                self._textline_to_alignments[textline].alignment_score(),
+                (
+                    self._textline_to_alignments[textline].alignment_score(),
+                    -textline.y0
+                ),
             default=None
         )
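A quick illustration of the tie break above: max() compares key tuples element by element, so between textlines with equal alignment scores the one with the larger -y0 wins, which is the textline lower on the page. A minimal sketch with a hypothetical stub, not the real PDFMiner textline class:

    from collections import namedtuple

    # Hypothetical stand-in for a PDFMiner textline; only y0 matters here.
    TextLine = namedtuple("TextLine", ["name", "score", "y0"])

    candidates = [
        TextLine("header", score=5, y0=700.0),  # top of the page
        TextLine("body", score=5, y0=120.0),    # same score, lower down
    ]

    # Equal scores, so the second tuple element decides: -120.0 > -700.0,
    # hence the bottom textline wins, mirroring the parser's preference.
    best = max(candidates, key=lambda tl: (tl.score, -tl.y0), default=None)
    assert best.name == "body"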
@@ -566,12 +572,13 @@ class Hybrid(TextBaseParser):
         )

     def _generate_table_bbox(self):
+        user_provided_bboxes = None
         if self.table_areas is not None:
-            table_bbox = {}
+            # User gave us table areas already. We will use their coordinates
+            # to find column anchors.
+            user_provided_bboxes = []
             for area_str in self.table_areas:
-                table_bbox[bbox_from_str(area_str)] = None
-            self.table_bbox = table_bbox
-            return
+                user_provided_bboxes.append(bbox_from_str(area_str))

         # Take all the textlines that are not just spaces
         all_textlines = [
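For context, the code path above is what a caller exercises by passing explicit areas; a hedged usage sketch (file name and coordinates are made up; each table_areas string is "x1,y1,x2,y2" in PDF coordinate space, parsed by bbox_from_str):

    import camelot

    # Hypothetical document and area; with table_areas given, the hybrid
    # parser now skips network detection for the bbox and only computes
    # column anchors inside the user-supplied rectangle.
    tables = camelot.read_pdf(
        "election_results.pdf",
        flavor="hybrid",
        table_areas=["316,499,566,337"],
    )
    print(tables[0].df.head())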
@@ -593,59 +600,73 @@ class Hybrid(TextBaseParser):
         parse_details_bbox_searches = None

         while True:
-            text_network = TextNetworks()
-            text_network.generate(textlines)
-            text_network._remove_unconnected_edges()
-            gaps_hv = text_network._compute_plausible_gaps()
-            if gaps_hv is None:
-                return None
-            # edge_tol instructions override the calculated vertical gap
-            edge_tol_hv = (
-                gaps_hv[0],
-                gaps_hv[1] if self.edge_tol is None else self.edge_tol
-            )
-            bbox = text_network._build_bbox_candidate(
-                edge_tol_hv,
-                parse_details=parse_details_bbox_searches
-            )
-            if bbox is None:
-                break
-
-            if parse_details_network_searches is not None:
-                # Preserve the current edge calculation for display debugging
-                parse_details_network_searches.append(
-                    copy.deepcopy(text_network)
-                )
-
-            # Get all the textlines that are at least 50% in the box
-            tls_in_bbox = text_in_bbox(bbox, textlines)
-
-            # and expand the text box to fully contain them
-            bbox = bbox_from_textlines(tls_in_bbox)
-
-            # FRH: do we need to repeat this?
-            # tls_in_bbox = text_in_bbox(bbox, textlines)
+            # Find a bbox: either pulling from the user's or from the network
+            # algorithm.
+
+            # First look for the body of the table
+            bbox_body = None
+            if user_provided_bboxes is not None:
+                if len(user_provided_bboxes) > 0:
+                    bbox_body = user_provided_bboxes.pop()
+            else:
+                text_network = TextNetworks()
+                text_network.generate(textlines)
+                text_network._remove_unconnected_edges()
+                gaps_hv = text_network._compute_plausible_gaps()
+                if gaps_hv is None:
+                    return None
+                # edge_tol instructions override the calculated vertical gap
+                edge_tol_hv = (
+                    gaps_hv[0],
+                    gaps_hv[1] if self.edge_tol is None else self.edge_tol
+                )
+                bbox_body = text_network._build_bbox_candidate(
+                    edge_tol_hv,
+                    parse_details=parse_details_bbox_searches
+                )
+
+                if parse_details_network_searches is not None:
+                    # Preserve the current edge calculation for debugging
+                    parse_details_network_searches.append(
+                        copy.deepcopy(text_network)
+                    )
+
+            if bbox_body is None:
+                break
+
+            # Get all the textlines that overlap with the box, compute
+            # columns
+            tls_in_bbox = textlines_overlapping_bbox(bbox_body, textlines)
             cols_anchors = find_columns_coordinates(tls_in_bbox)

-            # Apply a heuristic to salvage headers which formatting might be
-            # off compared to the rest of the table.
-            expanded_bbox = search_header_from_body_bbox(
-                bbox,
-                textlines,
-                cols_anchors,
-                gaps_hv[1]
-            )
+            # Unless the user gave us strict bbox_body, try to find a header
+            # above the body to build the full bbox.
+            if user_provided_bboxes is not None:
+                bbox_full = bbox_body
+            else:
+                # Expand the text box to fully contain the tls we found
+                bbox_body = bbox_from_textlines(tls_in_bbox)
+
+                # Apply a heuristic to salvage headers which formatting might
+                # be off compared to the rest of the table.
+                bbox_full = search_header_from_body_bbox(
+                    bbox_body,
+                    textlines,
+                    cols_anchors,
+                    gaps_hv[1]
+                )
+
+            table_parse = {
+                "bbox_body": bbox_body,
+                "cols_anchors": cols_anchors,
+                "bbox_full": bbox_full
+            }
+            self.table_bbox[bbox_full] = table_parse

             if self.parse_details is not None:
                 if "col_searches" not in self.parse_details:
                     self.parse_details["col_searches"] = []
-                self.parse_details["col_searches"].append({
-                    "core_bbox": bbox,
-                    "cols_anchors": cols_anchors,
-                    "expanded_bbox": expanded_bbox
-                })
-
-            self.table_bbox[expanded_bbox] = None
+                self.parse_details["col_searches"].append(table_parse)

             # Remember what textlines we processed, and repeat
             for tl in tls_in_bbox:
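Note that the same table_parse dict now serves both as the value stored in self.table_bbox and as the debugging record in parse_details["col_searches"]. A sketch of reading those records back, assuming a parse_details dict was threaded into the parser beforehand (a debugging hook, not a public API):

    # Each entry mirrors the table_parse dict built in the loop above.
    for search in parse_details.get("col_searches", []):
        left, bottom, right, top = search["bbox_full"]
        print("table found at", (left, bottom, right, top))
        print("column anchors:", search["cols_anchors"])
        print("body-only bbox:", search["bbox_body"])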
@@ -682,7 +703,6 @@ class Hybrid(TextBaseParser):
             # the alignment identification work we've done earlier.
             rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
             rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
-            elements = [len(r) for r in rows_grouped]

             if self.columns is not None and self.columns[table_idx] != "":
                 # user has to input boundary columns too
@@ -695,53 +715,11 @@ class Hybrid(TextBaseParser):
             cols.append(text_x_max)
             cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
         else:
-            # calculate mode of the list of number of elements in
-            # each row to guess the number of columns
-            ncols = max(set(elements), key=elements.count)
-            if ncols == 1:
-                # if mode is 1, the page usually contains not tables
-                # but there can be cases where the list can be skewed,
-                # try to remove all 1s from list in this case and
-                # see if the list contains elements, if yes, then use
-                # the mode after removing 1s
-                elements = list(filter(lambda x: x != 1, elements))
-                if elements:
-                    ncols = max(set(elements), key=elements.count)
-                else:
-                    warnings.warn(
-                        "No tables found in table area {}"
-                        .format(table_idx + 1)
-                    )
-            cols = [
-                (t.x0, t.x1)
-                for r in rows_grouped
-                if len(r) == ncols
-                for t in r
-            ]
-            cols = self._merge_columns(
-                sorted(cols),
-                column_tol=self.column_tol
-            )
-            inner_text = []
-            for i in range(1, len(cols)):
-                left = cols[i - 1][1]
-                right = cols[i][0]
-                inner_text.extend(
-                    [
-                        t
-                        for direction in self.t_bbox
-                        for t in self.t_bbox[direction]
-                        if t.x0 > left and t.x1 < right
-                    ]
-                )
-            outer_text = [
-                t
-                for direction in self.t_bbox
-                for t in self.t_bbox[direction]
-                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
-            ]
-            inner_text.extend(outer_text)
-            cols = self._add_columns(cols, inner_text, self.row_tol)
-            cols = self._join_columns(cols, text_x_min, text_x_max)
+            parse_details = self.table_bbox[bbox]
+            col_anchors = parse_details["cols_anchors"]
+            cols = list(map(
+                lambda idx: [col_anchors[idx], col_anchors[idx + 1]],
+                range(0, len(col_anchors) - 1)
+            ))

         return cols, rows, None, None
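To see why this replacement helps: the deleted branch guessed the column count from the mode of per-row element counts, which sparse rows and a vertical header skew badly; the new branch simply pairs consecutive column anchors. A small worked example with made-up anchor values:

    # Column anchors as produced by find_columns_coordinates():
    # x-coordinates separating the table's columns.
    col_anchors = [50.0, 120.0, 200.0, 310.0]

    # Pair consecutive anchors into (left, right) column spans, exactly
    # as the new else-branch does.
    cols = [
        [col_anchors[idx], col_anchors[idx + 1]]
        for idx in range(len(col_anchors) - 1)
    ]
    print(cols)  # [[50.0, 120.0], [120.0, 200.0], [200.0, 310.0]]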
@@ -472,7 +472,7 @@ class PlotMethods():

         for box_id, col_search in enumerate(parse_details["col_searches"]):
             draw_labeled_bbox(
-                ax, col_search["expanded_bbox"],
+                ax, col_search["bbox_full"],
                 "box body + header #{box_id}".format(
                     box_id=box_id
                 ),
@@ -481,7 +481,7 @@ class PlotMethods():
                 label_pos="top,left"
             )
             draw_labeled_bbox(
-                ax, col_search["core_bbox"],
+                ax, col_search["bbox_body"],
                 "box body #{box_id}".format(
                     box_id=box_id
                 ),
@@ -495,8 +495,8 @@ class PlotMethods():
             ax.plot(
                 [col_anchor, col_anchor],
                 [
-                    col_search["core_bbox"][1] - 10,
-                    col_search["core_bbox"][3] + 10,
+                    col_search["bbox_body"][1] - 10,
+                    col_search["bbox_body"][3] + 10,
                 ],
                 color="green"
             )
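For anyone reproducing this debug view outside camelot, a self-contained matplotlib sketch of what these calls render; draw_labeled_bbox is approximated with a Rectangle plus an annotation since the helper's implementation is not part of this diff, and the parse values are made up:

    import matplotlib.pyplot as plt
    from matplotlib.patches import Rectangle

    # Made-up parse results in the shape recorded by the hybrid parser.
    col_search = {
        "bbox_body": (50, 100, 310, 500),
        "bbox_full": (50, 100, 310, 540),
        "cols_anchors": [50, 120, 200, 310],
    }

    fig, ax = plt.subplots()
    for key, color in (("bbox_full", "blue"), ("bbox_body", "red")):
        x1, y1, x2, y2 = col_search[key]
        ax.add_patch(Rectangle((x1, y1), x2 - x1, y2 - y1,
                               fill=False, edgecolor=color))
        ax.annotate(key, (x1, y2), color=color)

    # Vertical green lines through each column anchor, slightly
    # overshooting the body bbox, as in the updated PlotMethods code.
    for col_anchor in col_search["cols_anchors"]:
        ax.plot([col_anchor, col_anchor],
                [col_search["bbox_body"][1] - 10,
                 col_search["bbox_body"][3] + 10],
                color="green")

    ax.set_xlim(0, 400)
    ax.set_ylim(0, 600)
    plt.show()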
@@ -431,8 +431,36 @@ def bbox_from_str(bbox_str):
     )


+def textlines_overlapping_bbox(bbox, textlines):
+    """Returns all text objects which overlap or are within a bounding box.
+
+    Parameters
+    ----------
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
+        space.
+    textlines : List of PDFMiner text objects.
+
+    Returns
+    -------
+    t_bbox : list
+        List of PDFMiner text objects.
+
+    """
+    (left, bottom, right, top) = bbox
+    t_bbox = [
+        t
+        for t in textlines
+        if ((left < t.x0 < right) or (left < t.x1 < right))
+        and ((bottom < t.y0 < top) or (bottom < t.y1 < top))
+    ]
+    return t_bbox
+
+
 def text_in_bbox(bbox, text):
-    """Returns all text objects which lie at least 50% inside a bounding box.
+    """Returns all text objects which lie at least 50% inside a bounding box
+    across both dimensions.

     Parameters
     ----------
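The practical difference from text_in_bbox: a textline that merely pokes into the box, such as a long vertical header, is now collected. A minimal sketch with a hypothetical stub carrying the same x0/y0/x1/y1 attributes as the PDFMiner objects:

    from collections import namedtuple

    # Stub with the coordinate attributes the real PDFMiner objects carry.
    TL = namedtuple("TL", ["x0", "y0", "x1", "y1"])

    bbox = (100, 100, 300, 300)
    inside = TL(120, 150, 180, 165)    # fully inside the bbox
    poking_in = TL(90, 290, 110, 460)  # vertical header, mostly above/left

    def overlaps(bbox, t):
        # Same test as textlines_overlapping_bbox(): a corner coordinate
        # falling strictly inside the box on both axes counts as overlap.
        left, bottom, right, top = bbox
        return (((left < t.x0 < right) or (left < t.x1 < right))
                and ((bottom < t.y0 < top) or (bottom < t.y1 < top)))

    assert overlaps(bbox, inside)
    assert overlaps(bbox, poking_in)  # would fail the 50%-inside test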
tests/data.py (447 additions)

@@ -1629,6 +1629,453 @@ data_hybrid_two_tables_b_2 = [
 # Trimming the table for the test of hybrid, which doesn't include it.
 data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]

+data_hybrid_vertical_headers = [
+    ["", "", "", "", "", "", "", "", "", "", "", "Congress-", "Senator 36th",
+     "Rep106th", "", "Reg. of", "Road", "", "", "", "Distri", "Dist", "", ""],
+    ["", "", "", "", "", "", "", "", "", "", "", "1st Dist", "Dist.", "Dist.",
+     "", "Deeds", "", "Commission", "", "District #1", "ct #2", "#3", "",
+     "Dist #4"],
+    ["", "", "", "", "", "Governor", "", "", "U.S. Senator", "", "", "", "",
+     "", "", "", "", "", "", "", "", "", "", ""],
+    ["", "Number of Registered voters", "Poll Book Totals", "Brian Calley",
+     "Patrick Colbeck", "Jim Hines", "Bill Schuette", "John James",
+     "Sandy Pensler", "", "Jack Bergman", "", "Jim Stamas", "Sue Allor",
+     "Melissa A. Cordes", "", "Al Scully", "", "Daniel G. Gauthier",
+     "Craig M. Clemens", "Craig Johnston", "Carolyn Brummund", "Adam Brege",
+     "David Bielusiak"],
+    ["Alcona", "963", "439", "55", "26", "47", "164", "173", "111", "",
+     "268", "", "272", "275", "269", "", "271", "", "224", "76", "", "", "",
+     ""],
+    ["Caledonia", "923", "393", "40", "23", "45", "158", "150", "103", "",
+     "244", "", "247", "254", "255", "", "244", "", "139", "143", "", "", "",
+     ""],
+    ["Curtis", "1026", "349", "30", "30", "25", "102", "95", "84", "", "159",
+     "", "164", "162", "161", "", "157", "", "", "", "", "", "", ""],
+    ["Greenbush", "1212", "423", "56", "26", "40", "126", "104", "131", "",
+     "208", "", "213", "214", "215", "", "208", "", "", "", "", "208", "",
+     ""],
+    ["Gustin", "611", "180", "22", "35", "17", "55", "73", "45", "", "108",
+     "", "104", "111", "111", "", "109", "", "", "", "", "", "81", "42"],
+    ["Harrisville", "1142", "430", "45", "90", "29", "101", "155", "94", "",
+     "226", "", "226", "232", "244", "", "226", "", "", "", "232", "", "",
+     ""],
+    ["Hawes", "884", "293", "38", "36", "27", "109", "121", "84", "", "192",
+     "", "195", "195", "193", "", "184", "", "", "", "", "", "118", "87"],
+    ["Haynes", "626", "275", "31", "20", "32", "104", "121", "53", "", "163",
+     "", "163", "173", "161", "", "152", "", "", "", "76", "", "69", "31"],
+    ["Mikado", "781", "208", "19", "39", "17", "81", "90", "63", "", "149",
+     "", "149", "145", "147", "", "143", "", "", "", "", "113", "", ""],
+    ["Millen", "353", "139", "7", "16", "13", "38", "49", "19", "", "62",
+     "", "66", "67", "66", "", "62", "", "", "", "", "", "", ""],
+    ["Mitchell", "327", "96", "12", "17", "7", "29", "41", "17", "", "57",
+     "", "55", "57", "60", "", "56", "", "", "", "", "", "", ""],
+    ["City Harrisville", "389", "171", "16", "15", "18", "35", "49", "31",
+     "", "78", "", "80", "82", "81", "", "77", "", "", "", "73", "", "", ""],
+    ["Totals", "9237", "3396", "371", "373", "317", "1102", "1221", "835",
+     "0", "1914", "0", "1934", "1967", "1963", "0", "1889", "0", "363",
+     "219", "381", "321", "268", "160"],
+]
+
+
 data_stream_table_areas = [
     ["", "One Withholding"],
     ["Payroll Period", "Allowance"],
(Four image fixtures also appear in the diff, with sizes unchanged: 103 KiB, 48 KiB, 90 KiB, 101 KiB.)
@@ -194,6 +194,17 @@ def test_hybrid_two_tables_b():
     assert df2.equals(tables[1].df)


+def test_hybrid_vertical_header():
+    """Tests a complex table with a vertical text header.
+    """
+    df = pd.DataFrame(data_hybrid_vertical_headers)
+
+    filename = os.path.join(testdir, "vertical_header.pdf")
+    tables = camelot.read_pdf(filename, flavor="hybrid")
+    assert len(tables) == 1
+    assert_frame_equal(df, tables[0].df)
+
+
 def test_hybrid_table_regions():
     df = pd.DataFrame(data_hybrid_table_regions)