Define TextEdge as a bounded TextAlignment
parent
0b8aac977a
commit
2d97fbc036
|
|
@ -28,9 +28,66 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
|
||||||
TABLE_AREA_PADDING = 10
|
TABLE_AREA_PADDING = 10
|
||||||
|
|
||||||
|
|
||||||
class TextEdge(object):
|
HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
|
||||||
|
VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
|
||||||
|
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
|
||||||
|
|
||||||
|
|
||||||
|
class TextAlignment(object):
    """Represents a list of textlines sharing an alignment on a coordinate.

    The alignment can be left/right/middle or top/bottom/center.

    (PDF coordinate space)

    Parameters
    ----------
    coord : float
        Coordinate of the initial text edge. Depending on the alignment
        it could be a vertical or horizontal coordinate.
    textline : obj
        The original textline to start the alignment. It only needs to
        expose ``get_text()`` for ``__repr__`` to work.
    align : str
        Name of the alignment (e.g. "left", "top", etc)

    Attributes
    ----------
    coord : float
        The aligned coordinate, averaged out across textlines. It can be
        along the x or y axis.
    textlines : list
        Textlines that demonstrate this alignment, in registration order.
    align : str
        Name of the alignment (e.g. "left", "top", etc)

    """

    def __init__(self, coord, textline, align):
        self.coord = coord
        self.textlines = [textline]
        self.align = align

    def __repr__(self):
        # Preview only the first two textlines to keep the repr short;
        # newlines are stripped so the repr stays on a single line.
        text_inside = " | ".join(
            textline.get_text() for textline in self.textlines[:2]
        ).replace("\n", "")
        # Use the runtime class name so that the repr does not claim to be
        # a "TextEdge" when the instance is a plain TextAlignment (or any
        # other subclass that does not override __repr__).
        return (
            f"<{type(self).__name__} coord={self.coord} "
            f"tl={len(self.textlines)} "
            f"textlines text='{text_inside}...'>"
        )

    def register_aligned_textline(self, textline, coord):
        """Add a textline to this alignment and update the mean coordinate.

        Parameters
        ----------
        textline : obj
            The textline found to be aligned with this alignment.
        coord : float
            Coordinate of ``textline`` along the alignment axis.

        """
        # Incremental mean: self.coord is the average over the textlines
        # registered so far, so weight it by their count before folding in
        # the new coordinate.
        count = len(self.textlines)
        self.coord = (self.coord * count + coord) / (count + 1)
        self.textlines.append(textline)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TextEdge(TextAlignment):
|
||||||
"""Defines a text edge coordinates relative to a left-bottom
|
"""Defines a text edge coordinates relative to a left-bottom
|
||||||
origin. (PDF coordinate space)
|
origin. (PDF coordinate space).
|
||||||
|
|
||||||
|
An edge is an alignment bounded over a segment.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -53,11 +110,10 @@ class TextEdge(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, coord, y0, y1, align="left"):
|
def __init__(self, coord, textline, y0, y1, align):
|
||||||
self.coord = coord
|
super().__init__(coord, textline, align)
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
self.y1 = y1
|
self.y1 = y1
|
||||||
self.align = align
|
|
||||||
self.intersections = 0
|
self.intersections = 0
|
||||||
self.is_valid = False
|
self.is_valid = False
|
||||||
|
|
||||||
|
|
@ -70,14 +126,13 @@ class TextEdge(object):
|
||||||
self.is_valid,
|
self.is_valid,
|
||||||
)
|
)
|
||||||
|
|
||||||
def update_coords(self, x, y0, edge_tol=50):
|
def update_coords(self, x, textline, edge_tol=50):
|
||||||
"""Updates the text edge's x and bottom y coordinates and sets
|
"""Updates the text edge's x and bottom y coordinates and sets
|
||||||
the is_valid attribute.
|
the is_valid attribute.
|
||||||
"""
|
"""
|
||||||
if np.isclose(self.y0, y0, atol=edge_tol):
|
if np.isclose(self.y0, textline.y0, atol=edge_tol):
|
||||||
self.coord = (self.intersections * self.coord + x) / \
|
self.register_aligned_textline(textline, x)
|
||||||
float(self.intersections + 1)
|
self.y0 = textline.y0
|
||||||
self.y0 = y0
|
|
||||||
self.intersections += 1
|
self.intersections += 1
|
||||||
# a textedge is valid only if it extends uninterrupted
|
# a textedge is valid only if it extends uninterrupted
|
||||||
# over a required number of textlines
|
# over a required number of textlines
|
||||||
|
|
@ -85,22 +140,18 @@ class TextEdge(object):
|
||||||
self.is_valid = True
|
self.is_valid = True
|
||||||
|
|
||||||
|
|
||||||
HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
|
class TextAlignments(object):
|
||||||
VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
|
|
||||||
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
|
|
||||||
|
|
||||||
|
|
||||||
class BaseTextEdges(object):
|
|
||||||
"""Defines a dict of text edges across alignment references.
|
"""Defines a dict of text edges across alignment references.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, alignment_names):
|
def __init__(self, alignment_names):
|
||||||
|
# For each possible alignment, list of tuples coordinate/textlines
|
||||||
self._textedges = {}
|
self._textedges = {}
|
||||||
for alignment_name in alignment_names:
|
for alignment_name in alignment_names:
|
||||||
self._textedges[alignment_name] = []
|
self._textedges[alignment_name] = []
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_new_text_edge(coord, textline, align=None):
|
def _create_new_text_edge(coord, textline, align):
|
||||||
return NotImplemented
|
return NotImplemented
|
||||||
|
|
||||||
def _update_edge(self, edge, coord, textline):
|
def _update_edge(self, edge, coord, textline):
|
||||||
|
|
@ -131,12 +182,12 @@ class BaseTextEdges(object):
|
||||||
idx_insert = idx_closest
|
idx_insert = idx_closest
|
||||||
if idx_insert is not None:
|
if idx_insert is not None:
|
||||||
new_edge = self._create_new_text_edge(
|
new_edge = self._create_new_text_edge(
|
||||||
coord, textline, align=alignment
|
coord, textline, alignment
|
||||||
)
|
)
|
||||||
edge_array.insert(idx_insert, new_edge)
|
edge_array.insert(idx_insert, new_edge)
|
||||||
|
|
||||||
|
|
||||||
class TextEdges(BaseTextEdges):
|
class TextEdges(TextAlignments):
|
||||||
"""Defines a dict of left, right and middle text edges found on
|
"""Defines a dict of left, right and middle text edges found on
|
||||||
the PDF page. The dict has three keys based on the alignments,
|
the PDF page. The dict has three keys based on the alignments,
|
||||||
and each key's value is a list of camelot.core.TextEdge objects.
|
and each key's value is a list of camelot.core.TextEdge objects.
|
||||||
|
|
@ -147,19 +198,19 @@ class TextEdges(BaseTextEdges):
|
||||||
self.edge_tol = edge_tol
|
self.edge_tol = edge_tol
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_new_text_edge(coord, textline, align=None):
|
def _create_new_text_edge(coord, textline, align):
|
||||||
y0 = textline.y0
|
y0 = textline.y0
|
||||||
y1 = textline.y1
|
y1 = textline.y1
|
||||||
return TextEdge(coord, y0, y1, align=align)
|
return TextEdge(coord, textline, y0, y1, align)
|
||||||
|
|
||||||
def add(self, coord, textline, align):
|
def add(self, coord, textline, align):
|
||||||
"""Adds a new text edge to the current dict.
|
"""Adds a new text edge to the current dict.
|
||||||
"""
|
"""
|
||||||
te = self._create_new_text_edge(coord, textline, align=align)
|
te = self._create_new_text_edge(coord, textline, align)
|
||||||
self._textedges[align].append(te)
|
self._textedges[align].append(te)
|
||||||
|
|
||||||
def _update_edge(self, edge, coord, textline):
|
def _update_edge(self, edge, coord, textline):
|
||||||
edge.update_coords(coord, textline.y0, self.edge_tol)
|
edge.update_coords(coord, textline, self.edge_tol)
|
||||||
|
|
||||||
def generate(self, textlines):
|
def generate(self, textlines):
|
||||||
"""Generates the text edges dict based on horizontal text
|
"""Generates the text edges dict based on horizontal text
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ import copy
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import (BaseTextEdges, ALL_ALIGNMENTS)
|
from ..core import (TextAlignment, TextAlignments, ALL_ALIGNMENTS)
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
get_index_closest_point,
|
get_index_closest_point,
|
||||||
get_textline_coords,
|
get_textline_coords,
|
||||||
|
|
@ -137,45 +137,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
return new_bbox
|
return new_bbox
|
||||||
|
|
||||||
|
|
||||||
class TextEdge2(object):
|
|
||||||
"""Text edge coordinates relative to a left-bottom origin.
|
|
||||||
|
|
||||||
(PDF coordinate space)
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
coord : float
|
|
||||||
coordinate of the text edge. Depending on the alignment
|
|
||||||
it could be a vertical or horizontal coordinate.
|
|
||||||
|
|
||||||
Attributes
|
|
||||||
----------
|
|
||||||
textlines: array
|
|
||||||
Array of textlines that demonstrate this alignment.
|
|
||||||
coord: float
|
|
||||||
The coordinate aligned averaged out across textlines.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, coord, textline):
|
|
||||||
self.coord = coord
|
|
||||||
self.textlines = [textline]
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
text_inside = " | ".join(
|
|
||||||
map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
|
|
||||||
return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
|
|
||||||
f"textlines text='{text_inside}...'>"
|
|
||||||
|
|
||||||
def register_aligned_textline(self, textline, coord):
|
|
||||||
"""Update new textline to this alignment, adapting its average."""
|
|
||||||
# Increase the intersections for this segment, expand it up,
|
|
||||||
# and adjust the x based on the new value
|
|
||||||
self.coord = (self.coord * len(self.textlines) + coord) / \
|
|
||||||
float(len(self.textlines) + 1)
|
|
||||||
self.textlines.append(textline)
|
|
||||||
|
|
||||||
|
|
||||||
class Alignments(object):
|
class Alignments(object):
|
||||||
"""
|
"""
|
||||||
Represent the number of textlines aligned with this one across each edge.
|
Represent the number of textlines aligned with this one across each edge.
|
||||||
|
|
@ -241,7 +202,7 @@ class Alignments(object):
|
||||||
return (self.max_v()-1) * (self.max_h()-1)
|
return (self.max_v()-1) * (self.max_h()-1)
|
||||||
|
|
||||||
|
|
||||||
class TextEdges2(BaseTextEdges):
|
class TextEdges2(TextAlignments):
|
||||||
"""Defines a dict of vertical (top, bottom, middle) and
|
"""Defines a dict of vertical (top, bottom, middle) and
|
||||||
horizontal (left, right, and middle) text alignments found on
|
horizontal (left, right, and middle) text alignments found on
|
||||||
the PDF page. The dict has three keys based on the alignments,
|
the PDF page. The dict has three keys based on the alignments,
|
||||||
|
|
@ -250,15 +211,6 @@ class TextEdges2(BaseTextEdges):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__(ALL_ALIGNMENTS)
|
super().__init__(ALL_ALIGNMENTS)
|
||||||
# # For each possible alignment, list of tuples coordinate/textlines
|
|
||||||
# self._textedges = {
|
|
||||||
# "left": [],
|
|
||||||
# "right": [],
|
|
||||||
# "middle": [],
|
|
||||||
# "bottom": [],
|
|
||||||
# "top": [],
|
|
||||||
# "center": []
|
|
||||||
# }
|
|
||||||
# For each textline, dictionary "edge type" to
|
# For each textline, dictionary "edge type" to
|
||||||
# "number of textlines aligned"
|
# "number of textlines aligned"
|
||||||
self._textlines_alignments = {}
|
self._textlines_alignments = {}
|
||||||
|
|
@ -268,8 +220,8 @@ class TextEdges2(BaseTextEdges):
|
||||||
self.max_cols = None
|
self.max_cols = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_new_text_edge(coord, textline, align=None):
|
def _create_new_text_edge(coord, textline, align):
|
||||||
return TextEdge2(coord, textline)
|
return TextAlignment(coord, textline, align)
|
||||||
|
|
||||||
def _update_edge(self, edge, coord, textline):
|
def _update_edge(self, edge, coord, textline):
|
||||||
edge.register_aligned_textline(textline, coord)
|
edge.register_aligned_textline(textline, coord)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue