Enforce text_edge as subcase of text_alignment
TextNetworks is a list of TextAlignmentspull/153/head
parent
2d97fbc036
commit
22b6e33efa
|
|
@ -82,7 +82,6 @@ class TextAlignment(object):
|
|||
self.textlines.append(textline)
|
||||
|
||||
|
||||
|
||||
class TextEdge(TextAlignment):
|
||||
"""Defines a text edge coordinates relative to a left-bottom
|
||||
origin. (PDF coordinate space).
|
||||
|
|
@ -102,19 +101,16 @@ class TextEdge(TextAlignment):
|
|||
|
||||
Attributes
|
||||
----------
|
||||
intersections: int
|
||||
Number of intersections with horizontal text rows.
|
||||
is_valid: bool
|
||||
A text edge is valid if it intersects with at least
|
||||
TEXTEDGE_REQUIRED_ELEMENTS horizontal text rows.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, coord, textline, y0, y1, align):
|
||||
def __init__(self, coord, textline, align):
|
||||
super().__init__(coord, textline, align)
|
||||
self.y0 = y0
|
||||
self.y1 = y1
|
||||
self.intersections = 0
|
||||
self.y0 = textline.y0
|
||||
self.y1 = textline.y1
|
||||
self.is_valid = False
|
||||
|
||||
def __repr__(self):
|
||||
|
|
@ -133,10 +129,9 @@ class TextEdge(TextAlignment):
|
|||
if np.isclose(self.y0, textline.y0, atol=edge_tol):
|
||||
self.register_aligned_textline(textline, x)
|
||||
self.y0 = textline.y0
|
||||
self.intersections += 1
|
||||
# a textedge is valid only if it extends uninterrupted
|
||||
# over a required number of textlines
|
||||
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
|
||||
if len(self.textlines) > TEXTEDGE_REQUIRED_ELEMENTS:
|
||||
self.is_valid = True
|
||||
|
||||
|
||||
|
|
@ -151,8 +146,8 @@ class TextAlignments(object):
|
|||
self._textedges[alignment_name] = []
|
||||
|
||||
@staticmethod
|
||||
def _create_new_text_edge(coord, textline, align):
|
||||
return NotImplemented
|
||||
def _create_new_text_alignment(coord, textline, align):
|
||||
return TextAlignment(coord, textline, align)
|
||||
|
||||
def _update_edge(self, edge, coord, textline):
|
||||
return NotImplemented
|
||||
|
|
@ -181,7 +176,7 @@ class TextAlignments(object):
|
|||
else:
|
||||
idx_insert = idx_closest
|
||||
if idx_insert is not None:
|
||||
new_edge = self._create_new_text_edge(
|
||||
new_edge = self._create_new_text_alignment(
|
||||
coord, textline, alignment
|
||||
)
|
||||
edge_array.insert(idx_insert, new_edge)
|
||||
|
|
@ -198,15 +193,14 @@ class TextEdges(TextAlignments):
|
|||
self.edge_tol = edge_tol
|
||||
|
||||
@staticmethod
|
||||
def _create_new_text_edge(coord, textline, align):
|
||||
y0 = textline.y0
|
||||
y1 = textline.y1
|
||||
return TextEdge(coord, textline, y0, y1, align)
|
||||
def _create_new_text_alignment(coord, textline, align):
|
||||
# In TextEdges, each alignment is a TextEdge
|
||||
return TextEdge(coord, textline, align)
|
||||
|
||||
def add(self, coord, textline, align):
|
||||
"""Adds a new text edge to the current dict.
|
||||
"""
|
||||
te = self._create_new_text_edge(coord, textline, align)
|
||||
te = self._create_new_text_alignment(coord, textline, align)
|
||||
self._textedges[align].append(te)
|
||||
|
||||
def _update_edge(self, edge, coord, textline):
|
||||
|
|
@ -227,15 +221,15 @@ class TextEdges(TextAlignments):
|
|||
"""
|
||||
intersections_sum = {
|
||||
"left": sum(
|
||||
te.intersections for te in self._textedges["left"]
|
||||
len(te.textlines) for te in self._textedges["left"]
|
||||
if te.is_valid
|
||||
),
|
||||
"right": sum(
|
||||
te.intersections for te in self._textedges["right"]
|
||||
len(te.textlines) for te in self._textedges["right"]
|
||||
if te.is_valid
|
||||
),
|
||||
"middle": sum(
|
||||
te.intersections for te in self._textedges["middle"]
|
||||
len(te.textlines) for te in self._textedges["middle"]
|
||||
if te.is_valid
|
||||
),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,10 +8,13 @@ import copy
|
|||
import warnings
|
||||
|
||||
from .base import BaseParser
|
||||
from ..core import (TextAlignment, TextAlignments, ALL_ALIGNMENTS)
|
||||
from ..core import (
|
||||
TextAlignments,
|
||||
ALL_ALIGNMENTS,
|
||||
HORIZONTAL_ALIGNMENTS,
|
||||
VERTICAL_ALIGNMENTS
|
||||
)
|
||||
from ..utils import (
|
||||
get_index_closest_point,
|
||||
get_textline_coords,
|
||||
bbox_from_str,
|
||||
text_in_bbox,
|
||||
text_in_bbox_per_axis,
|
||||
|
|
@ -137,76 +140,80 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
|||
return new_bbox
|
||||
|
||||
|
||||
class Alignments(object):
|
||||
class AlignmentCounter(object):
|
||||
"""
|
||||
Represent the number of textlines aligned with this one across each edge.
|
||||
Represents all textlines aligned with a textline for each alignment.
|
||||
|
||||
A cell can be vertically aligned with others by having matching left,
|
||||
A textline can be vertically aligned with others by having matching left,
|
||||
right, or middle edge, and horizontally aligned by having matching top,
|
||||
bottom, or center edge.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# Vertical alignments
|
||||
self.left = 0
|
||||
self.right = 0
|
||||
self.middle = 0
|
||||
|
||||
# Horizontal alignments
|
||||
self.bottom = 0
|
||||
self.top = 0
|
||||
self.center = 0
|
||||
self.alignment_to_occurrences = {}
|
||||
for alignment in ALL_ALIGNMENTS:
|
||||
self.alignment_to_occurrences[alignment] = []
|
||||
|
||||
def __getitem__(self, key):
|
||||
return getattr(self, key)
|
||||
return self.alignment_to_occurrences[key]
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
return setattr(self, key, value)
|
||||
self.alignment_to_occurrences[key] = value
|
||||
return value
|
||||
|
||||
def max_alignments(self, alignment_ids=None):
|
||||
"""Get the alignment dimension with the max number of textlines.
|
||||
|
||||
"""
|
||||
alignment_ids = alignment_ids or self.alignment_to_occurrences.keys()
|
||||
alignment_items = map(
|
||||
lambda alignment_id: (
|
||||
alignment_id,
|
||||
self.alignment_to_occurrences[alignment_id]
|
||||
),
|
||||
alignment_ids
|
||||
)
|
||||
return max(alignment_items, key=lambda item: len(item[1]))
|
||||
|
||||
def max_v(self):
|
||||
"""Tuple (alignment_id, textlines) of largest vertical row.
|
||||
"""
|
||||
# Note that the horizontal alignments (left, center, right) are aligned
|
||||
# vertically in a column, so max_v is calculated by looking at
|
||||
# horizontal alignments.
|
||||
return self.max_alignments(HORIZONTAL_ALIGNMENTS)
|
||||
|
||||
def max_h(self):
|
||||
"""Tuple (alignment_id, textlines) of largest horizontal col.
|
||||
"""
|
||||
return self.max_alignments(VERTICAL_ALIGNMENTS)
|
||||
|
||||
def max_v_count(self):
|
||||
"""Returns the maximum number of alignments along
|
||||
one of the vertical axis (left/right/middle).
|
||||
"""
|
||||
return max(self.left, self.right, self.middle)
|
||||
return len(self.max_v()[1])
|
||||
|
||||
def max_h(self):
|
||||
def max_h_count(self):
|
||||
"""Returns the maximum number of alignments along
|
||||
one of the horizontal axis (bottom/top/center).
|
||||
"""
|
||||
return max(self.bottom, self.top, self.center)
|
||||
|
||||
def max_v_edge_name(self):
|
||||
"""Returns the name of the vertical edge that has the
|
||||
maximum number of alignments.
|
||||
"""
|
||||
return max(
|
||||
["left", "right", "middle"],
|
||||
key=lambda edge_name: self[edge_name]
|
||||
)
|
||||
|
||||
def max_h_edge_name(self):
|
||||
"""Returns the name of the horizontal edge that has the
|
||||
maximum number of alignments.
|
||||
"""
|
||||
return max(
|
||||
["bottom", "top", "center"],
|
||||
key=lambda edge_name: self[edge_name]
|
||||
)
|
||||
return len(self.max_h()[1])
|
||||
|
||||
def alignment_score(self):
|
||||
"""We define the alignment score of a textline as the product of the
|
||||
number of aligned elements - 1. The -1 is to avoid favoring
|
||||
singletons on a long line.
|
||||
"""
|
||||
return (self.max_v()-1) * (self.max_h()-1)
|
||||
return (self.max_v_count()-1) * (self.max_h_count()-1)
|
||||
|
||||
|
||||
class TextEdges2(TextAlignments):
|
||||
"""Defines a dict of vertical (top, bottom, middle) and
|
||||
horizontal (left, right, and middle) text alignments found on
|
||||
the PDF page. The dict has three keys based on the alignments,
|
||||
and each key's value is a list of camelot.core.TextEdge objects.
|
||||
class TextNetworks(TextAlignments):
|
||||
"""Text elements connected via both vertical (top, bottom, middle) and
|
||||
horizontal (left, right, and middle) alignments found on the PDF page.
|
||||
The alignment dict has six keys based on the hor/vert alignments,
|
||||
and each key's value is a list of camelot.core.TextAlignment objects.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
|
@ -219,10 +226,6 @@ class TextEdges2(TextAlignments):
|
|||
self.max_rows = None
|
||||
self.max_cols = None
|
||||
|
||||
@staticmethod
|
||||
def _create_new_text_edge(coord, textline, align):
|
||||
return TextAlignment(coord, textline, align)
|
||||
|
||||
def _update_edge(self, edge, coord, textline):
|
||||
edge.register_aligned_textline(textline, coord)
|
||||
|
||||
|
|
@ -238,27 +241,27 @@ class TextEdges2(TextAlignments):
|
|||
def _compute_alignment_counts(self):
|
||||
"""Build a dictionary textline -> alignment object.
|
||||
"""
|
||||
for edge_name, textedges in self._textedges.items():
|
||||
for align_id, textedges in self._textedges.items():
|
||||
for textedge in textedges:
|
||||
for textline in textedge.textlines:
|
||||
alignments = self._textlines_alignments.get(
|
||||
textline, None)
|
||||
if alignments is None:
|
||||
alignments = Alignments()
|
||||
alignments = AlignmentCounter()
|
||||
self._textlines_alignments[textline] = alignments
|
||||
alignments[edge_name] = len(textedge.textlines)
|
||||
alignments[align_id] = textedge.textlines
|
||||
|
||||
# Finally calculate the overall maximum number of rows/cols
|
||||
self.max_rows = max(
|
||||
map(
|
||||
lambda alignments: alignments.max_h(),
|
||||
lambda alignments: alignments.max_h_count(),
|
||||
self._textlines_alignments.values()
|
||||
),
|
||||
default=0
|
||||
)
|
||||
self.max_cols = max(
|
||||
map(
|
||||
lambda alignments: alignments.max_v(),
|
||||
lambda alignments: alignments.max_v_count(),
|
||||
self._textlines_alignments.values()
|
||||
),
|
||||
default=0
|
||||
|
|
@ -271,10 +274,10 @@ class TextEdges2(TextAlignments):
|
|||
the core table.
|
||||
"""
|
||||
h_gaps, v_gaps = [], []
|
||||
for edge_name in self._textedges:
|
||||
edge_array = self._textedges[edge_name]
|
||||
for align_id in self._textedges:
|
||||
edge_array = self._textedges[align_id]
|
||||
gaps = []
|
||||
vertical = edge_name in ["left", "right", "middle"]
|
||||
vertical = align_id in ["left", "right", "middle"]
|
||||
sort_function = (lambda tl: tl.y0) \
|
||||
if vertical \
|
||||
else (lambda tl: tl.x0)
|
||||
|
|
@ -301,7 +304,7 @@ class TextEdges2(TextAlignments):
|
|||
rounded_gaps = list(map(lambda x: round(x, 2), gaps))
|
||||
print(
|
||||
f"{direction_str} gaps found "
|
||||
f"for {edge_name}: "
|
||||
f"for {align_id}: "
|
||||
f"{rounded_gaps} "
|
||||
f"with {percentile}th percentile "
|
||||
f"{np.percentile(gaps, percentile)}"
|
||||
|
|
@ -316,15 +319,16 @@ class TextEdges2(TextAlignments):
|
|||
removed_singletons = True
|
||||
while removed_singletons:
|
||||
removed_singletons = False
|
||||
for edge_type in self._textedges:
|
||||
for alignment_id, textalignments in self._textedges.items():
|
||||
# For each alignment edge, remove items if they are singletons
|
||||
# either horizontally or vertically
|
||||
for te in self._textedges[edge_type]:
|
||||
for i in range(len(te.textlines) - 1, -1, -1):
|
||||
tl = te.textlines[i]
|
||||
for ta in textalignments:
|
||||
for i in range(len(ta.textlines) - 1, -1, -1):
|
||||
tl = ta.textlines[i]
|
||||
alignments = self._textlines_alignments[tl]
|
||||
if alignments.max_h() <= 1 or alignments.max_v() <= 1:
|
||||
del te.textlines[i]
|
||||
if alignments.max_h_count() <= 1 or \
|
||||
alignments.max_v_count() <= 1:
|
||||
del ta.textlines[i]
|
||||
removed_singletons = True
|
||||
self._textlines_alignments = {}
|
||||
self._compute_alignment_counts()
|
||||
|
|
@ -360,37 +364,19 @@ class TextEdges2(TextAlignments):
|
|||
# It will serve as a reference axis along which to collect the average
|
||||
# spacing between rows/cols.
|
||||
most_aligned_tl = self._most_connected_textline()
|
||||
most_aligned_coords = get_textline_coords(
|
||||
most_aligned_tl)
|
||||
|
||||
# Retrieve the list of textlines it's aligned with, across both
|
||||
# axis
|
||||
best_alignment = self._textlines_alignments[most_aligned_tl]
|
||||
ref_h_edge_name = best_alignment.max_h_edge_name()
|
||||
ref_v_edge_name = best_alignment.max_v_edge_name()
|
||||
best_h_textedges = self._textedges[ref_h_edge_name]
|
||||
best_v_textedges = self._textedges[ref_v_edge_name]
|
||||
h_coord = most_aligned_coords[ref_h_edge_name]
|
||||
v_coord = most_aligned_coords[ref_v_edge_name]
|
||||
ref_h_alignment_id, ref_h_textlines = best_alignment.max_h()
|
||||
h_textlines = sorted(
|
||||
best_h_textedges[
|
||||
get_index_closest_point(
|
||||
h_coord,
|
||||
best_h_textedges,
|
||||
fn=lambda x: x.coord
|
||||
)
|
||||
].textlines,
|
||||
ref_h_textlines,
|
||||
key=lambda tl: tl.x0,
|
||||
reverse=True
|
||||
)
|
||||
ref_v_alignment_id, ref_v_textlines = best_alignment.max_v()
|
||||
v_textlines = sorted(
|
||||
best_v_textedges[
|
||||
get_index_closest_point(
|
||||
v_coord,
|
||||
best_v_textedges,
|
||||
fn=lambda x: x.coord
|
||||
)
|
||||
].textlines,
|
||||
ref_v_textlines,
|
||||
key=lambda tl: tl.y0,
|
||||
reverse=True
|
||||
)
|
||||
|
|
@ -517,7 +503,7 @@ class TextEdges2(TextAlignments):
|
|||
ax.text(
|
||||
tl.x0 - 5,
|
||||
tl.y0 - 5,
|
||||
f"{alignments.max_h()}x{alignments.max_v()}",
|
||||
f"{alignments.max_h_count()}x{alignments.max_v_count()}",
|
||||
fontsize=5,
|
||||
color="black"
|
||||
)
|
||||
|
|
@ -826,7 +812,7 @@ class Hybrid(BaseParser):
|
|||
debug_info_bboxes_searches = None
|
||||
|
||||
while True:
|
||||
self.textedges = TextEdges2()
|
||||
self.textedges = TextNetworks()
|
||||
self.textedges.generate(textlines)
|
||||
self.textedges._remove_unconnected_edges()
|
||||
if debug_info_edges_searches is not None:
|
||||
|
|
|
|||
Loading…
Reference in New Issue