Enforce TextEdge as a special case of TextAlignment

TextNetworks holds lists of TextAlignment objects
pull/153/head
Frh 2020-04-24 12:42:13 -07:00
parent 58b2c1d0fd
commit efe81292ca
2 changed files with 87 additions and 107 deletions

View File

@ -82,7 +82,6 @@ class TextAlignment(object):
self.textlines.append(textline) self.textlines.append(textline)
class TextEdge(TextAlignment): class TextEdge(TextAlignment):
"""Defines a text edge coordinates relative to a left-bottom """Defines a text edge coordinates relative to a left-bottom
origin. (PDF coordinate space). origin. (PDF coordinate space).
@ -102,19 +101,16 @@ class TextEdge(TextAlignment):
Attributes Attributes
---------- ----------
intersections: int
Number of intersections with horizontal text rows.
is_valid: bool is_valid: bool
A text edge is valid if it intersects with at least A text edge is valid if it intersects with at least
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows. TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
""" """
def __init__(self, coord, textline, y0, y1, align): def __init__(self, coord, textline, align):
super().__init__(coord, textline, align) super().__init__(coord, textline, align)
self.y0 = y0 self.y0 = textline.y0
self.y1 = y1 self.y1 = textline.y1
self.intersections = 0
self.is_valid = False self.is_valid = False
def __repr__(self): def __repr__(self):
@ -133,10 +129,9 @@ class TextEdge(TextAlignment):
if np.isclose(self.y0, textline.y0, atol=edge_tol): if np.isclose(self.y0, textline.y0, atol=edge_tol):
self.register_aligned_textline(textline, x) self.register_aligned_textline(textline, x)
self.y0 = textline.y0 self.y0 = textline.y0
self.intersections += 1
# a textedge is valid only if it extends uninterrupted # a textedge is valid only if it extends uninterrupted
# over a required number of textlines # over a required number of textlines
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS: if len(self.textlines) > TEXTEDGE_REQUIRED_ELEMENTS:
self.is_valid = True self.is_valid = True
@ -151,8 +146,8 @@ class TextAlignments(object):
self._textedges[alignment_name] = [] self._textedges[alignment_name] = []
@staticmethod @staticmethod
def _create_new_text_edge(coord, textline, align): def _create_new_text_alignment(coord, textline, align):
return NotImplemented return TextAlignment(coord, textline, align)
def _update_edge(self, edge, coord, textline): def _update_edge(self, edge, coord, textline):
return NotImplemented return NotImplemented
@ -181,7 +176,7 @@ class TextAlignments(object):
else: else:
idx_insert = idx_closest idx_insert = idx_closest
if idx_insert is not None: if idx_insert is not None:
new_edge = self._create_new_text_edge( new_edge = self._create_new_text_alignment(
coord, textline, alignment coord, textline, alignment
) )
edge_array.insert(idx_insert, new_edge) edge_array.insert(idx_insert, new_edge)
@ -198,15 +193,14 @@ class TextEdges(TextAlignments):
self.edge_tol = edge_tol self.edge_tol = edge_tol
@staticmethod @staticmethod
def _create_new_text_edge(coord, textline, align): def _create_new_text_alignment(coord, textline, align):
y0 = textline.y0 # In TextEdges, each alignment is a TextEdge
y1 = textline.y1 return TextEdge(coord, textline, align)
return TextEdge(coord, textline, y0, y1, align)
def add(self, coord, textline, align): def add(self, coord, textline, align):
"""Adds a new text edge to the current dict. """Adds a new text edge to the current dict.
""" """
te = self._create_new_text_edge(coord, textline, align) te = self._create_new_text_alignment(coord, textline, align)
self._textedges[align].append(te) self._textedges[align].append(te)
def _update_edge(self, edge, coord, textline): def _update_edge(self, edge, coord, textline):
@ -227,15 +221,15 @@ class TextEdges(TextAlignments):
""" """
intersections_sum = { intersections_sum = {
"left": sum( "left": sum(
te.intersections for te in self._textedges["left"] len(te.textlines) for te in self._textedges["left"]
if te.is_valid if te.is_valid
), ),
"right": sum( "right": sum(
te.intersections for te in self._textedges["right"] len(te.textlines) for te in self._textedges["right"]
if te.is_valid if te.is_valid
), ),
"middle": sum( "middle": sum(
te.intersections for te in self._textedges["middle"] len(te.textlines) for te in self._textedges["middle"]
if te.is_valid if te.is_valid
), ),
} }

View File

@ -8,10 +8,13 @@ import copy
import warnings import warnings
from .base import BaseParser from .base import BaseParser
from ..core import (TextAlignment, TextAlignments, ALL_ALIGNMENTS) from ..core import (
TextAlignments,
ALL_ALIGNMENTS,
HORIZONTAL_ALIGNMENTS,
VERTICAL_ALIGNMENTS
)
from ..utils import ( from ..utils import (
get_index_closest_point,
get_textline_coords,
bbox_from_str, bbox_from_str,
text_in_bbox, text_in_bbox,
text_in_bbox_per_axis, text_in_bbox_per_axis,
@ -137,76 +140,80 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
return new_bbox return new_bbox
class Alignments(object): class AlignmentCounter(object):
""" """
Represent the number of textlines aligned with this one across each edge. Represents all textlines aligned with a textline for each alignment.
A cell can be vertically aligned with others by having matching left, A textline can be vertically aligned with others by having matching left,
right, or middle edge, and horizontally aligned by having matching top, right, or middle edge, and horizontally aligned by having matching top,
bottom, or center edge. bottom, or center edge.
""" """
def __init__(self): def __init__(self):
# Vertical alignments self.alignment_to_occurrences = {}
self.left = 0 for alignment in ALL_ALIGNMENTS:
self.right = 0 self.alignment_to_occurrences[alignment] = []
self.middle = 0
# Horizontal alignments
self.bottom = 0
self.top = 0
self.center = 0
def __getitem__(self, key): def __getitem__(self, key):
return getattr(self, key) return self.alignment_to_occurrences[key]
def __setitem__(self, key, value): def __setitem__(self, key, value):
return setattr(self, key, value) self.alignment_to_occurrences[key] = value
return value
def max_alignments(self, alignment_ids=None):
"""Get the alignment dimension with the max number of textlines.
"""
alignment_ids = alignment_ids or self.alignment_to_occurrences.keys()
alignment_items = map(
lambda alignment_id: (
alignment_id,
self.alignment_to_occurrences[alignment_id]
),
alignment_ids
)
return max(alignment_items, key=lambda item: len(item[1]))
def max_v(self): def max_v(self):
"""Tuple (alignment_id, textlines) of largest vertical row.
"""
# Note that the horizontal alignments (left, middle, right) are aligned
# vertically in a column, so max_v is calculated by looking at
# horizontal alignments.
return self.max_alignments(HORIZONTAL_ALIGNMENTS)
def max_h(self):
"""Tuple (alignment_id, textlines) of largest horizontal col.
"""
return self.max_alignments(VERTICAL_ALIGNMENTS)
def max_v_count(self):
"""Returns the maximum number of alignments along """Returns the maximum number of alignments along
one of the vertical axis (left/right/middle). one of the vertical axis (left/right/middle).
""" """
return max(self.left, self.right, self.middle) return len(self.max_v()[1])
def max_h(self): def max_h_count(self):
"""Returns the maximum number of alignments along """Returns the maximum number of alignments along
one of the horizontal axis (bottom/top/center). one of the horizontal axis (bottom/top/center).
""" """
return max(self.bottom, self.top, self.center) return len(self.max_h()[1])
def max_v_edge_name(self):
"""Returns the name of the vertical edge that has the
maximum number of alignments.
"""
return max(
["left", "right", "middle"],
key=lambda edge_name: self[edge_name]
)
def max_h_edge_name(self):
"""Returns the name of the horizontal edge that has the
maximum number of alignments.
"""
return max(
["bottom", "top", "center"],
key=lambda edge_name: self[edge_name]
)
def alignment_score(self): def alignment_score(self):
"""We define the alignment score of a textline as the product of the """We define the alignment score of a textline as the product of the
number of aligned elements - 1. The -1 is to avoid favoring number of aligned elements - 1. The -1 is to avoid favoring
singletons on a long line. singletons on a long line.
""" """
return (self.max_v()-1) * (self.max_h()-1) return (self.max_v_count()-1) * (self.max_h_count()-1)
class TextEdges2(TextAlignments): class TextNetworks(TextAlignments):
"""Defines a dict of vertical (top, bottom, middle) and """Text elements connected via both vertical (top, bottom, middle) and
horizontal (left, right, and middle) text alignments found on horizontal (left, right, and middle) alignments found on the PDF page.
the PDF page. The dict has three keys based on the alignments, The alignment dict has six keys based on the hor/vert alignments,
and each key's value is a list of camelot.core.TextEdge objects. and each key's value is a list of camelot.core.TextAlignment objects.
""" """
def __init__(self): def __init__(self):
@ -219,10 +226,6 @@ class TextEdges2(TextAlignments):
self.max_rows = None self.max_rows = None
self.max_cols = None self.max_cols = None
@staticmethod
def _create_new_text_edge(coord, textline, align):
return TextAlignment(coord, textline, align)
def _update_edge(self, edge, coord, textline): def _update_edge(self, edge, coord, textline):
edge.register_aligned_textline(textline, coord) edge.register_aligned_textline(textline, coord)
@ -238,27 +241,27 @@ class TextEdges2(TextAlignments):
def _compute_alignment_counts(self): def _compute_alignment_counts(self):
"""Build a dictionary textline -> alignment object. """Build a dictionary textline -> alignment object.
""" """
for edge_name, textedges in self._textedges.items(): for align_id, textedges in self._textedges.items():
for textedge in textedges: for textedge in textedges:
for textline in textedge.textlines: for textline in textedge.textlines:
alignments = self._textlines_alignments.get( alignments = self._textlines_alignments.get(
textline, None) textline, None)
if alignments is None: if alignments is None:
alignments = Alignments() alignments = AlignmentCounter()
self._textlines_alignments[textline] = alignments self._textlines_alignments[textline] = alignments
alignments[edge_name] = len(textedge.textlines) alignments[align_id] = textedge.textlines
# Finally calculate the overall maximum number of rows/cols # Finally calculate the overall maximum number of rows/cols
self.max_rows = max( self.max_rows = max(
map( map(
lambda alignments: alignments.max_h(), lambda alignments: alignments.max_h_count(),
self._textlines_alignments.values() self._textlines_alignments.values()
), ),
default=0 default=0
) )
self.max_cols = max( self.max_cols = max(
map( map(
lambda alignments: alignments.max_v(), lambda alignments: alignments.max_v_count(),
self._textlines_alignments.values() self._textlines_alignments.values()
), ),
default=0 default=0
@ -271,10 +274,10 @@ class TextEdges2(TextAlignments):
the core table. the core table.
""" """
h_gaps, v_gaps = [], [] h_gaps, v_gaps = [], []
for edge_name in self._textedges: for align_id in self._textedges:
edge_array = self._textedges[edge_name] edge_array = self._textedges[align_id]
gaps = [] gaps = []
vertical = edge_name in ["left", "right", "middle"] vertical = align_id in ["left", "right", "middle"]
sort_function = (lambda tl: tl.y0) \ sort_function = (lambda tl: tl.y0) \
if vertical \ if vertical \
else (lambda tl: tl.x0) else (lambda tl: tl.x0)
@ -301,7 +304,7 @@ class TextEdges2(TextAlignments):
rounded_gaps = list(map(lambda x: round(x, 2), gaps)) rounded_gaps = list(map(lambda x: round(x, 2), gaps))
print( print(
f"{direction_str} gaps found " f"{direction_str} gaps found "
f"for {edge_name}: " f"for {align_id}: "
f"{rounded_gaps} " f"{rounded_gaps} "
f"with {percentile}th percentile " f"with {percentile}th percentile "
f"{np.percentile(gaps, percentile)}" f"{np.percentile(gaps, percentile)}"
@ -316,15 +319,16 @@ class TextEdges2(TextAlignments):
removed_singletons = True removed_singletons = True
while removed_singletons: while removed_singletons:
removed_singletons = False removed_singletons = False
for edge_type in self._textedges: for alignment_id, textalignments in self._textedges.items():
# For each alignment edge, remove items if they are singletons # For each alignment edge, remove items if they are singletons
# either horizontally or vertically # either horizontally or vertically
for te in self._textedges[edge_type]: for ta in textalignments:
for i in range(len(te.textlines) - 1, -1, -1): for i in range(len(ta.textlines) - 1, -1, -1):
tl = te.textlines[i] tl = ta.textlines[i]
alignments = self._textlines_alignments[tl] alignments = self._textlines_alignments[tl]
if alignments.max_h() <= 1 or alignments.max_v() <= 1: if alignments.max_h_count() <= 1 or \
del te.textlines[i] alignments.max_v_count() <= 1:
del ta.textlines[i]
removed_singletons = True removed_singletons = True
self._textlines_alignments = {} self._textlines_alignments = {}
self._compute_alignment_counts() self._compute_alignment_counts()
@ -360,37 +364,19 @@ class TextEdges2(TextAlignments):
# It will serve as a reference axis along which to collect the average # It will serve as a reference axis along which to collect the average
# spacing between rows/cols. # spacing between rows/cols.
most_aligned_tl = self._most_connected_textline() most_aligned_tl = self._most_connected_textline()
most_aligned_coords = get_textline_coords(
most_aligned_tl)
# Retrieve the list of textlines it's aligned with, across both # Retrieve the list of textlines it's aligned with, across both
# axis # axis
best_alignment = self._textlines_alignments[most_aligned_tl] best_alignment = self._textlines_alignments[most_aligned_tl]
ref_h_edge_name = best_alignment.max_h_edge_name() ref_h_alignment_id, ref_h_textlines = best_alignment.max_h()
ref_v_edge_name = best_alignment.max_v_edge_name()
best_h_textedges = self._textedges[ref_h_edge_name]
best_v_textedges = self._textedges[ref_v_edge_name]
h_coord = most_aligned_coords[ref_h_edge_name]
v_coord = most_aligned_coords[ref_v_edge_name]
h_textlines = sorted( h_textlines = sorted(
best_h_textedges[ ref_h_textlines,
get_index_closest_point(
h_coord,
best_h_textedges,
fn=lambda x: x.coord
)
].textlines,
key=lambda tl: tl.x0, key=lambda tl: tl.x0,
reverse=True reverse=True
) )
ref_v_alignment_id, ref_v_textlines = best_alignment.max_v()
v_textlines = sorted( v_textlines = sorted(
best_v_textedges[ ref_v_textlines,
get_index_closest_point(
v_coord,
best_v_textedges,
fn=lambda x: x.coord
)
].textlines,
key=lambda tl: tl.y0, key=lambda tl: tl.y0,
reverse=True reverse=True
) )
@ -517,7 +503,7 @@ class TextEdges2(TextAlignments):
ax.text( ax.text(
tl.x0 - 5, tl.x0 - 5,
tl.y0 - 5, tl.y0 - 5,
f"{alignments.max_h()}x{alignments.max_v()}", f"{alignments.max_h_count()}x{alignments.max_v_count()}",
fontsize=5, fontsize=5,
color="black" color="black"
) )
@ -826,7 +812,7 @@ class Hybrid(BaseParser):
debug_info_bboxes_searches = None debug_info_bboxes_searches = None
while True: while True:
self.textedges = TextEdges2() self.textedges = TextNetworks()
self.textedges.generate(textlines) self.textedges.generate(textlines)
self.textedges._remove_unconnected_edges() self.textedges._remove_unconnected_edges()
if debug_info_edges_searches is not None: if debug_info_edges_searches is not None: