Enforce text_edge as subcase of text_alignment

TextNetworks is a list of TextAlignments
2020-04-24 12:42:13 -07:00 · 2020-04-24 12:42:13 -07:00 · efe81292ca
parent 58b2c1d0fd
commit efe81292ca
2 changed files with 87 additions and 107 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -82,7 +82,6 @@ class TextAlignment(object):
        self.textlines.append(textline)


-
 class TextEdge(TextAlignment):
    """Defines a text edge coordinates relative to a left-bottom
    origin. (PDF coordinate space).
@ -102,19 +101,16 @@ class TextEdge(TextAlignment):

    Attributes
    ----------
-    intersections: int
-        Number of intersections with horizontal text rows.
    is_valid: bool
        A text edge is valid if it intersects with at least
        TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.

    """

-    def __init__(self, coord, textline, y0, y1, align):
+    def __init__(self, coord, textline, align):
        super().__init__(coord, textline, align)
-        self.y0 = y0
-        self.y1 = y1
-        self.intersections = 0
+        self.y0 = textline.y0
+        self.y1 = textline.y1
        self.is_valid = False

    def __repr__(self):
@ -133,10 +129,9 @@ class TextEdge(TextAlignment):
        if np.isclose(self.y0, textline.y0, atol=edge_tol):
            self.register_aligned_textline(textline, x)
            self.y0 = textline.y0
-            self.intersections += 1
            # a textedge is valid only if it extends uninterrupted
            # over a required number of textlines
-            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+            if len(self.textlines) > TEXTEDGE_REQUIRED_ELEMENTS:
                self.is_valid = True


@ -151,8 +146,8 @@ class TextAlignments(object):
            self._textedges[alignment_name] = []

    @staticmethod
-    def _create_new_text_edge(coord, textline, align):
-        return NotImplemented
+    def _create_new_text_alignment(coord, textline, align):
+        return TextAlignment(coord, textline, align)

    def _update_edge(self, edge, coord, textline):
        return NotImplemented
@ -181,7 +176,7 @@ class TextAlignments(object):
            else:
                idx_insert = idx_closest
            if idx_insert is not None:
-                new_edge = self._create_new_text_edge(
+                new_edge = self._create_new_text_alignment(
                    coord, textline, alignment
                )
                edge_array.insert(idx_insert, new_edge)
@ -198,15 +193,14 @@ class TextEdges(TextAlignments):
        self.edge_tol = edge_tol

    @staticmethod
-    def _create_new_text_edge(coord, textline, align):
-        y0 = textline.y0
-        y1 = textline.y1
-        return TextEdge(coord, textline, y0, y1, align)
+    def _create_new_text_alignment(coord, textline, align):
+        # In TextEdges, each alignment is a TextEdge
+        return TextEdge(coord, textline, align)

    def add(self, coord, textline, align):
        """Adds a new text edge to the current dict.
        """
-        te = self._create_new_text_edge(coord, textline, align)
+        te = self._create_new_text_alignment(coord, textline, align)
        self._textedges[align].append(te)

    def _update_edge(self, edge, coord, textline):
@ -227,15 +221,15 @@ class TextEdges(TextAlignments):
        """
        intersections_sum = {
            "left": sum(
-                te.intersections for te in self._textedges["left"]
+                len(te.textlines) for te in self._textedges["left"]
                if te.is_valid
            ),
            "right": sum(
-                te.intersections for te in self._textedges["right"]
+                len(te.textlines) for te in self._textedges["right"]
                if te.is_valid
            ),
            "middle": sum(
-                te.intersections for te in self._textedges["middle"]
+                len(te.textlines) for te in self._textedges["middle"]
                if te.is_valid
            ),
        }
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -8,10 +8,13 @@ import copy
 import warnings

 from .base import BaseParser
-from ..core import (TextAlignment, TextAlignments, ALL_ALIGNMENTS)
+from ..core import (
+    TextAlignments,
+    ALL_ALIGNMENTS,
+    HORIZONTAL_ALIGNMENTS,
+    VERTICAL_ALIGNMENTS
+)
 from ..utils import (
-    get_index_closest_point,
-    get_textline_coords,
    bbox_from_str,
    text_in_bbox,
    text_in_bbox_per_axis,
@ -137,76 +140,80 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
    return new_bbox


-class Alignments(object):
+class AlignmentCounter(object):
    """
-    Represent the number of textlines aligned with this one across each edge.
+    Represents all textlines aligned with a textline for each alignment.

-    A cell can be vertically aligned with others by having matching left,
+    A textline can be vertically aligned with others by having matching left,
    right, or middle edge, and horizontally aligned by having matching top,
    bottom, or center edge.

    """

    def __init__(self):
-        # Vertical alignments
-        self.left = 0
-        self.right = 0
-        self.middle = 0
-
-        # Horizontal alignments
-        self.bottom = 0
-        self.top = 0
-        self.center = 0
+        self.alignment_to_occurrences = {}
+        for alignment in ALL_ALIGNMENTS:
+            self.alignment_to_occurrences[alignment] = []

    def __getitem__(self, key):
-        return getattr(self, key)
+        return self.alignment_to_occurrences[key]

    def __setitem__(self, key, value):
-        return setattr(self, key, value)
+        self.alignment_to_occurrences[key] = value
+        return value
+
+    def max_alignments(self, alignment_ids=None):
+        """Get the alignment dimension with the max number of textlines.
+
+        """
+        alignment_ids = alignment_ids or self.alignment_to_occurrences.keys()
+        alignment_items = map(
+            lambda alignment_id: (
+                alignment_id,
+                self.alignment_to_occurrences[alignment_id]
+            ),
+            alignment_ids
+        )
+        return max(alignment_items, key=lambda item: len(item[1]))

    def max_v(self):
+        """Tuple (alignment_id, textlines) of largest vertical row.
+        """
+        # Note that the horizontal alignments (left, center, right) are aligned
+        # vertically in a column, so max_v is calculated by looking at
+        # horizontal alignments.
+        return self.max_alignments(HORIZONTAL_ALIGNMENTS)
+
+    def max_h(self):
+        """Tuple (alignment_id, textlines) of largest horizontal col.
+        """
+        return self.max_alignments(VERTICAL_ALIGNMENTS)
+
+    def max_v_count(self):
        """Returns the maximum number of alignments along
        one of the vertical axis (left/right/middle).
        """
-        return max(self.left, self.right, self.middle)
+        return len(self.max_v()[1])

-    def max_h(self):
+    def max_h_count(self):
        """Returns the maximum number of alignments along
        one of the horizontal axis (bottom/top/center).
        """
-        return max(self.bottom, self.top, self.center)
-
-    def max_v_edge_name(self):
-        """Returns the name of the vertical edge that has the
-        maximum number of alignments.
-        """
-        return max(
-            ["left", "right", "middle"],
-            key=lambda edge_name: self[edge_name]
-        )
-
-    def max_h_edge_name(self):
-        """Returns the name of the horizontal edge that has the
-        maximum number of alignments.
-        """
-        return max(
-            ["bottom", "top", "center"],
-            key=lambda edge_name: self[edge_name]
-        )
+        return len(self.max_h()[1])

    def alignment_score(self):
        """We define the alignment score of a textline as the product of the
        number of aligned elements - 1. The -1 is to avoid favoring
         singletons on a long line.
        """
-        return (self.max_v()-1) * (self.max_h()-1)
+        return (self.max_v_count()-1) * (self.max_h_count()-1)


-class TextEdges2(TextAlignments):
-    """Defines a dict of vertical (top, bottom, middle) and
-    horizontal (left, right, and middle) text alignments found on
-    the PDF page. The dict has three keys based on the alignments,
-    and each key's value is a list of camelot.core.TextEdge objects.
+class TextNetworks(TextAlignments):
+    """Text elements connected via both vertical (top, bottom, middle) and
+    horizontal (left, right, and middle) alignments found on the PDF page.
+    The alignment dict has six keys based on the hor/vert alignments,
+    and each key's value is a list of camelot.core.TextAlignment objects.
    """

    def __init__(self):
@ -219,10 +226,6 @@ class TextEdges2(TextAlignments):
        self.max_rows = None
        self.max_cols = None

-    @staticmethod
-    def _create_new_text_edge(coord, textline, align):
-        return TextAlignment(coord, textline, align)
-
    def _update_edge(self, edge, coord, textline):
        edge.register_aligned_textline(textline, coord)

@ -238,27 +241,27 @@ class TextEdges2(TextAlignments):
    def _compute_alignment_counts(self):
        """Build a dictionary textline -> alignment object.
        """
-        for edge_name, textedges in self._textedges.items():
+        for align_id, textedges in self._textedges.items():
            for textedge in textedges:
                for textline in textedge.textlines:
                    alignments = self._textlines_alignments.get(
                        textline, None)
                    if alignments is None:
-                        alignments = Alignments()
+                        alignments = AlignmentCounter()
                        self._textlines_alignments[textline] = alignments
-                    alignments[edge_name] = len(textedge.textlines)
+                    alignments[align_id] = textedge.textlines

        # Finally calculate the overall maximum number of rows/cols
        self.max_rows = max(
            map(
-                lambda alignments: alignments.max_h(),
+                lambda alignments: alignments.max_h_count(),
                self._textlines_alignments.values()
            ),
            default=0
        )
        self.max_cols = max(
            map(
-                lambda alignments: alignments.max_v(),
+                lambda alignments: alignments.max_v_count(),
                self._textlines_alignments.values()
            ),
            default=0
@ -271,10 +274,10 @@ class TextEdges2(TextAlignments):
        the core table.
        """
        h_gaps, v_gaps = [], []
-        for edge_name in self._textedges:
-            edge_array = self._textedges[edge_name]
+        for align_id in self._textedges:
+            edge_array = self._textedges[align_id]
            gaps = []
-            vertical = edge_name in ["left", "right", "middle"]
+            vertical = align_id in ["left", "right", "middle"]
            sort_function = (lambda tl: tl.y0) \
                if vertical \
                else (lambda tl: tl.x0)
@ -301,7 +304,7 @@ class TextEdges2(TextAlignments):
                rounded_gaps = list(map(lambda x: round(x, 2), gaps))
                print(
                    f"{direction_str} gaps found "
-                    f"for {edge_name}: "
+                    f"for {align_id}: "
                    f"{rounded_gaps} "
                    f"with {percentile}th percentile "
                    f"{np.percentile(gaps, percentile)}"
@ -316,15 +319,16 @@ class TextEdges2(TextAlignments):
        removed_singletons = True
        while removed_singletons:
            removed_singletons = False
-            for edge_type in self._textedges:
+            for alignment_id, textalignments in self._textedges.items():
                # For each alignment edge, remove items if they are singletons
                # either horizontally or vertically
-                for te in self._textedges[edge_type]:
-                    for i in range(len(te.textlines) - 1, -1, -1):
-                        tl = te.textlines[i]
+                for ta in textalignments:
+                    for i in range(len(ta.textlines) - 1, -1, -1):
+                        tl = ta.textlines[i]
                        alignments = self._textlines_alignments[tl]
-                        if alignments.max_h() <= 1 or alignments.max_v() <= 1:
-                            del te.textlines[i]
+                        if alignments.max_h_count() <= 1 or \
+                           alignments.max_v_count() <= 1:
+                            del ta.textlines[i]
                            removed_singletons = True
            self._textlines_alignments = {}
            self._compute_alignment_counts()
@ -360,37 +364,19 @@ class TextEdges2(TextAlignments):
        # It will serve as a reference axis along which to collect the average
        # spacing between rows/cols.
        most_aligned_tl = self._most_connected_textline()
-        most_aligned_coords = get_textline_coords(
-            most_aligned_tl)

        # Retrieve the list of textlines it's aligned with, across both
        # axis
        best_alignment = self._textlines_alignments[most_aligned_tl]
-        ref_h_edge_name = best_alignment.max_h_edge_name()
-        ref_v_edge_name = best_alignment.max_v_edge_name()
-        best_h_textedges = self._textedges[ref_h_edge_name]
-        best_v_textedges = self._textedges[ref_v_edge_name]
-        h_coord = most_aligned_coords[ref_h_edge_name]
-        v_coord = most_aligned_coords[ref_v_edge_name]
+        ref_h_alignment_id, ref_h_textlines = best_alignment.max_h()
        h_textlines = sorted(
-            best_h_textedges[
-                get_index_closest_point(
-                    h_coord,
-                    best_h_textedges,
-                    fn=lambda x: x.coord
-                )
-            ].textlines,
+            ref_h_textlines,
            key=lambda tl: tl.x0,
            reverse=True
        )
+        ref_v_alignment_id, ref_v_textlines = best_alignment.max_v()
        v_textlines = sorted(
-            best_v_textedges[
-                get_index_closest_point(
-                    v_coord,
-                    best_v_textedges,
-                    fn=lambda x: x.coord
-                )
-            ].textlines,
+            ref_v_textlines,
            key=lambda tl: tl.y0,
            reverse=True
        )
@ -517,7 +503,7 @@ class TextEdges2(TextAlignments):
            ax.text(
                tl.x0 - 5,
                tl.y0 - 5,
-                f"{alignments.max_h()}x{alignments.max_v()}",
+                f"{alignments.max_h_count()}x{alignments.max_v_count()}",
                fontsize=5,
                color="black"
            )
@ -826,7 +812,7 @@ class Hybrid(BaseParser):
            debug_info_bboxes_searches = None

        while True:
-            self.textedges = TextEdges2()
+            self.textedges = TextNetworks()
            self.textedges.generate(textlines)
            self.textedges._remove_unconnected_edges()
            if debug_info_edges_searches is not None: