Define TextEdge as a bounded TextAlignment
parent
3ea8d81900
commit
58b2c1d0fd
|
|
@ -28,9 +28,66 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
|
|||
TABLE_AREA_PADDING = 10
|
||||
|
||||
|
||||
class TextEdge(object):
|
||||
# Alignment names, grouped by the two families the code distinguishes.
HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]

# Every alignment name: the horizontal ones first, then the vertical ones.
ALL_ALIGNMENTS = [*HORIZONTAL_ALIGNMENTS, *VERTICAL_ALIGNMENTS]
|
||||
|
||||
|
||||
class TextAlignment(object):
    """Represents a list of textlines sharing an alignment on a coordinate.

    The alignment can be left/right/middle or top/bottom/center.

    (PDF coordinate space)

    Parameters
    ----------
    coord : float
        Coordinate of the initial text edge. Depending on the alignment
        it could be a vertical or horizontal coordinate.
    textline : obj
        The original textline to start the alignment.
    align : str
        Name of the alignment (e.g. "left", "top", etc).

    Attributes
    ----------
    coord : float
        The aligned coordinate, averaged out across textlines. It can be
        along the x or y axis.
    textlines : array
        Array of textlines that demonstrate this alignment.
    align : str
        Name of the alignment (e.g. "left", "top", etc).

    """

    def __init__(self, coord, textline, align):
        self.coord = coord
        self.textlines = [textline]
        self.align = align

    def __repr__(self):
        """Return a short debug string previewing the first two textlines."""
        text_inside = " | ".join(
            textline.get_text() for textline in self.textlines[:2]
        ).replace("\n", "")
        # Report the actual class name rather than a hard-coded "TextEdge",
        # so plain alignments are not mistaken for text edges when debugging.
        return (
            f"<{type(self).__name__} coord={self.coord} "
            f"tl={len(self.textlines)} "
            f"textlines text='{text_inside}...'>"
        )

    def register_aligned_textline(self, textline, coord):
        """Add a textline to this alignment and update the mean coordinate.

        Parameters
        ----------
        textline : obj
            The textline found to share this alignment.
        coord : float
            The coordinate at which that textline is aligned.

        """
        # Running mean: weight the current average by the number of
        # textlines already registered, then fold in the new coordinate.
        count = len(self.textlines)
        self.coord = (self.coord * count + coord) / float(count + 1)
        self.textlines.append(textline)
|
||||
|
||||
|
||||
|
||||
class TextEdge(TextAlignment):
|
||||
"""Defines a text edge coordinates relative to a left-bottom
|
||||
origin. (PDF coordinate space)
|
||||
origin. (PDF coordinate space).
|
||||
|
||||
An edge is an alignment bounded over a segment.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -53,11 +110,10 @@ class TextEdge(object):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, coord, y0, y1, align="left"):
|
||||
self.coord = coord
|
||||
def __init__(self, coord, textline, y0, y1, align):
|
||||
super().__init__(coord, textline, align)
|
||||
self.y0 = y0
|
||||
self.y1 = y1
|
||||
self.align = align
|
||||
self.intersections = 0
|
||||
self.is_valid = False
|
||||
|
||||
|
|
@ -70,14 +126,13 @@ class TextEdge(object):
|
|||
self.is_valid,
|
||||
)
|
||||
|
||||
def update_coords(self, x, y0, edge_tol=50):
|
||||
def update_coords(self, x, textline, edge_tol=50):
|
||||
"""Updates the text edge's x and bottom y coordinates and sets
|
||||
the is_valid attribute.
|
||||
"""
|
||||
if np.isclose(self.y0, y0, atol=edge_tol):
|
||||
self.coord = (self.intersections * self.coord + x) / \
|
||||
float(self.intersections + 1)
|
||||
self.y0 = y0
|
||||
if np.isclose(self.y0, textline.y0, atol=edge_tol):
|
||||
self.register_aligned_textline(textline, x)
|
||||
self.y0 = textline.y0
|
||||
self.intersections += 1
|
||||
# a textedge is valid only if it extends uninterrupted
|
||||
# over a required number of textlines
|
||||
|
|
@ -85,22 +140,18 @@ class TextEdge(object):
|
|||
self.is_valid = True
|
||||
|
||||
|
||||
HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
|
||||
VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
|
||||
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
|
||||
|
||||
|
||||
class BaseTextEdges(object):
|
||||
class TextAlignments(object):
|
||||
"""Defines a dict of text edges across alignment references.
|
||||
"""
|
||||
|
||||
def __init__(self, alignment_names):
|
||||
# For each possible alignment, list of tuples coordinate/textlines
|
||||
self._textedges = {}
|
||||
for alignment_name in alignment_names:
|
||||
self._textedges[alignment_name] = []
|
||||
|
||||
@staticmethod
|
||||
def _create_new_text_edge(coord, textline, align=None):
|
||||
def _create_new_text_edge(coord, textline, align):
|
||||
return NotImplemented
|
||||
|
||||
def _update_edge(self, edge, coord, textline):
|
||||
|
|
@ -131,12 +182,12 @@ class BaseTextEdges(object):
|
|||
idx_insert = idx_closest
|
||||
if idx_insert is not None:
|
||||
new_edge = self._create_new_text_edge(
|
||||
coord, textline, align=alignment
|
||||
coord, textline, alignment
|
||||
)
|
||||
edge_array.insert(idx_insert, new_edge)
|
||||
|
||||
|
||||
class TextEdges(BaseTextEdges):
|
||||
class TextEdges(TextAlignments):
|
||||
"""Defines a dict of left, right and middle text edges found on
|
||||
the PDF page. The dict has three keys based on the alignments,
|
||||
and each key's value is a list of camelot.core.TextEdge objects.
|
||||
|
|
@ -147,19 +198,19 @@ class TextEdges(BaseTextEdges):
|
|||
self.edge_tol = edge_tol
|
||||
|
||||
@staticmethod
|
||||
def _create_new_text_edge(coord, textline, align=None):
|
||||
def _create_new_text_edge(coord, textline, align):
|
||||
y0 = textline.y0
|
||||
y1 = textline.y1
|
||||
return TextEdge(coord, y0, y1, align=align)
|
||||
return TextEdge(coord, textline, y0, y1, align)
|
||||
|
||||
def add(self, coord, textline, align):
|
||||
"""Adds a new text edge to the current dict.
|
||||
"""
|
||||
te = self._create_new_text_edge(coord, textline, align=align)
|
||||
te = self._create_new_text_edge(coord, textline, align)
|
||||
self._textedges[align].append(te)
|
||||
|
||||
def _update_edge(self, edge, coord, textline):
|
||||
edge.update_coords(coord, textline.y0, self.edge_tol)
|
||||
edge.update_coords(coord, textline, self.edge_tol)
|
||||
|
||||
def generate(self, textlines):
|
||||
"""Generates the text edges dict based on horizontal text
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ import copy
|
|||
import warnings
|
||||
|
||||
from .base import BaseParser
|
||||
from ..core import (BaseTextEdges, ALL_ALIGNMENTS)
|
||||
from ..core import (TextAlignment, TextAlignments, ALL_ALIGNMENTS)
|
||||
from ..utils import (
|
||||
get_index_closest_point,
|
||||
get_textline_coords,
|
||||
|
|
@ -137,45 +137,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
|||
return new_bbox
|
||||
|
||||
|
||||
class TextEdge2(object):
    """Text edge coordinates relative to a left-bottom origin.

    (PDF coordinate space)

    Parameters
    ----------
    coord : float
        Coordinate of the text edge. Depending on the alignment
        it could be a vertical or horizontal coordinate.
    textline : obj
        The textline that starts this alignment.

    Attributes
    ----------
    textlines: array
        Array of textlines that demonstrate this alignment.
    coord: float
        The aligned coordinate, averaged out across textlines.

    """

    def __init__(self, coord, textline):
        self.coord = coord
        self.textlines = [textline]

    def __repr__(self):
        """Return a short debug string previewing the first two textlines."""
        text_inside = " | ".join(
            textline.get_text() for textline in self.textlines[:2]
        ).replace("\n", "")
        # Use the real class name; the previous hard-coded "TextEdge" made
        # these objects indistinguishable from core TextEdge in debug output.
        return (
            f"<{type(self).__name__} coord={self.coord} "
            f"tl={len(self.textlines)} "
            f"textlines text='{text_inside}...'>"
        )

    def register_aligned_textline(self, textline, coord):
        """Add a textline to this alignment and update the mean coordinate.

        Parameters
        ----------
        textline : obj
            The textline found to share this alignment.
        coord : float
            The coordinate at which that textline is aligned.

        """
        # Running mean: weight the current average by the number of
        # textlines already registered, then fold in the new coordinate.
        count = len(self.textlines)
        self.coord = (self.coord * count + coord) / float(count + 1)
        self.textlines.append(textline)
|
||||
|
||||
|
||||
class Alignments(object):
|
||||
"""
|
||||
Represent the number of textlines aligned with this one across each edge.
|
||||
|
|
@ -241,7 +202,7 @@ class Alignments(object):
|
|||
return (self.max_v()-1) * (self.max_h()-1)
|
||||
|
||||
|
||||
class TextEdges2(BaseTextEdges):
|
||||
class TextEdges2(TextAlignments):
|
||||
"""Defines a dict of vertical (top, bottom, middle) and
|
||||
horizontal (left, right, and middle) text alignments found on
|
||||
the PDF page. The dict has three keys based on the alignments,
|
||||
|
|
@ -250,15 +211,6 @@ class TextEdges2(BaseTextEdges):
|
|||
|
||||
def __init__(self):
|
||||
super().__init__(ALL_ALIGNMENTS)
|
||||
# # For each possible alignment, list of tuples coordinate/textlines
|
||||
# self._textedges = {
|
||||
# "left": [],
|
||||
# "right": [],
|
||||
# "middle": [],
|
||||
# "bottom": [],
|
||||
# "top": [],
|
||||
# "center": []
|
||||
# }
|
||||
# For each textline, dictionary "edge type" to
|
||||
# "number of textlines aligned"
|
||||
self._textlines_alignments = {}
|
||||
|
|
@ -268,8 +220,8 @@ class TextEdges2(BaseTextEdges):
|
|||
self.max_cols = None
|
||||
|
||||
@staticmethod
|
||||
def _create_new_text_edge(coord, textline, align=None):
|
||||
return TextEdge2(coord, textline)
|
||||
def _create_new_text_edge(coord, textline, align):
|
||||
return TextAlignment(coord, textline, align)
|
||||
|
||||
def _update_edge(self, edge, coord, textline):
|
||||
edge.register_aligned_textline(textline, coord)
|
||||
|
|
|
|||
Loading…
Reference in New Issue