Define TextEdge as a bounded TextAlignment

pull/153/head
Frh 2020-04-23 18:26:55 -07:00
parent 0b8aac977a
commit 2d97fbc036
2 changed files with 78 additions and 75 deletions

View File

@ -28,9 +28,66 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4
TABLE_AREA_PADDING = 10 TABLE_AREA_PADDING = 10
class TextEdge(object): HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
class TextAlignment(object):
"""Represents a list of textlines sharing an alignment on a coordinate.
The alignment can be left/right/middle or top/bottom/center.
(PDF coordinate space)
Parameters
----------
coord : float
coordinate of the initial text edge. Depending on the alignment
it could be a vertical or horizontal coordinate.
textline : obj
the original textline to start the alignment
align : str
Name of the alignment (e.g. "left", "top", etc)
Attributes
----------
coord : float
The coordinate aligned averaged out across textlines. It can be along
the x or y axis.
textlines : array
Array of textlines that demonstrate this alignment.
align : str
Name of the alignment (e.g. "left", "top", etc)
"""
def __init__(self, coord, textline, align):
self.coord = coord
self.textlines = [textline]
self.align = align
def __repr__(self):
text_inside = " | ".join(
map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
f"textlines text='{text_inside}...'>"
def register_aligned_textline(self, textline, coord):
"""Update new textline to this alignment, adapting its average."""
# Increase the intersections for this segment, expand it up,
# and adjust the x based on the new value
self.coord = (self.coord * len(self.textlines) + coord) / \
float(len(self.textlines) + 1)
self.textlines.append(textline)
class TextEdge(TextAlignment):
"""Defines a text edge coordinates relative to a left-bottom """Defines a text edge coordinates relative to a left-bottom
origin. (PDF coordinate space) origin. (PDF coordinate space).
An edge is an alignment bounded over a segment.
Parameters Parameters
---------- ----------
@ -53,11 +110,10 @@ class TextEdge(object):
""" """
def __init__(self, coord, y0, y1, align="left"): def __init__(self, coord, textline, y0, y1, align):
self.coord = coord super().__init__(coord, textline, align)
self.y0 = y0 self.y0 = y0
self.y1 = y1 self.y1 = y1
self.align = align
self.intersections = 0 self.intersections = 0
self.is_valid = False self.is_valid = False
@ -70,14 +126,13 @@ class TextEdge(object):
self.is_valid, self.is_valid,
) )
def update_coords(self, x, y0, edge_tol=50): def update_coords(self, x, textline, edge_tol=50):
"""Updates the text edge's x and bottom y coordinates and sets """Updates the text edge's x and bottom y coordinates and sets
the is_valid attribute. the is_valid attribute.
""" """
if np.isclose(self.y0, y0, atol=edge_tol): if np.isclose(self.y0, textline.y0, atol=edge_tol):
self.coord = (self.intersections * self.coord + x) / \ self.register_aligned_textline(textline, x)
float(self.intersections + 1) self.y0 = textline.y0
self.y0 = y0
self.intersections += 1 self.intersections += 1
# a textedge is valid only if it extends uninterrupted # a textedge is valid only if it extends uninterrupted
# over a required number of textlines # over a required number of textlines
@ -85,22 +140,18 @@ class TextEdge(object):
self.is_valid = True self.is_valid = True
HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"] class TextAlignments(object):
VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
class BaseTextEdges(object):
"""Defines a dict of text edges accross alignment references. """Defines a dict of text edges accross alignment references.
""" """
def __init__(self, alignment_names): def __init__(self, alignment_names):
# For each possible alignment, list of tuples coordinate/textlines
self._textedges = {} self._textedges = {}
for alignment_name in alignment_names: for alignment_name in alignment_names:
self._textedges[alignment_name] = [] self._textedges[alignment_name] = []
@staticmethod @staticmethod
def _create_new_text_edge(coord, textline, align=None): def _create_new_text_edge(coord, textline, align):
return NotImplemented return NotImplemented
def _update_edge(self, edge, coord, textline): def _update_edge(self, edge, coord, textline):
@ -131,12 +182,12 @@ class BaseTextEdges(object):
idx_insert = idx_closest idx_insert = idx_closest
if idx_insert is not None: if idx_insert is not None:
new_edge = self._create_new_text_edge( new_edge = self._create_new_text_edge(
coord, textline, align=alignment coord, textline, alignment
) )
edge_array.insert(idx_insert, new_edge) edge_array.insert(idx_insert, new_edge)
class TextEdges(BaseTextEdges): class TextEdges(TextAlignments):
"""Defines a dict of left, right and middle text edges found on """Defines a dict of left, right and middle text edges found on
the PDF page. The dict has three keys based on the alignments, the PDF page. The dict has three keys based on the alignments,
and each key's value is a list of camelot.core.TextEdge objects. and each key's value is a list of camelot.core.TextEdge objects.
@ -147,19 +198,19 @@ class TextEdges(BaseTextEdges):
self.edge_tol = edge_tol self.edge_tol = edge_tol
@staticmethod @staticmethod
def _create_new_text_edge(coord, textline, align=None): def _create_new_text_edge(coord, textline, align):
y0 = textline.y0 y0 = textline.y0
y1 = textline.y1 y1 = textline.y1
return TextEdge(coord, y0, y1, align=align) return TextEdge(coord, textline, y0, y1, align)
def add(self, coord, textline, align): def add(self, coord, textline, align):
"""Adds a new text edge to the current dict. """Adds a new text edge to the current dict.
""" """
te = self._create_new_text_edge(coord, textline, align=align) te = self._create_new_text_edge(coord, textline, align)
self._textedges[align].append(te) self._textedges[align].append(te)
def _update_edge(self, edge, coord, textline): def _update_edge(self, edge, coord, textline):
edge.update_coords(coord, textline.y0, self.edge_tol) edge.update_coords(coord, textline, self.edge_tol)
def generate(self, textlines): def generate(self, textlines):
"""Generates the text edges dict based on horizontal text """Generates the text edges dict based on horizontal text

View File

@ -8,7 +8,7 @@ import copy
import warnings import warnings
from .base import BaseParser from .base import BaseParser
from ..core import (BaseTextEdges, ALL_ALIGNMENTS) from ..core import (TextAlignment, TextAlignments, ALL_ALIGNMENTS)
from ..utils import ( from ..utils import (
get_index_closest_point, get_index_closest_point,
get_textline_coords, get_textline_coords,
@ -137,45 +137,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
return new_bbox return new_bbox
class TextEdge2(object):
"""Text edge coordinates relative to a left-bottom origin.
(PDF coordinate space)
Parameters
----------
coord : float
coordinate of the text edge. Depending on the alignment
it could be a vertical or horizontal coordinate.
Attributes
----------
textlines: array
Array of textlines that demonstrate this alignment.
coord: float
The coordinate aligned averaged out across textlines.
"""
def __init__(self, coord, textline):
self.coord = coord
self.textlines = [textline]
def __repr__(self):
text_inside = " | ".join(
map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
f"textlines text='{text_inside}...'>"
def register_aligned_textline(self, textline, coord):
"""Update new textline to this alignment, adapting its average."""
# Increase the intersections for this segment, expand it up,
# and adjust the x based on the new value
self.coord = (self.coord * len(self.textlines) + coord) / \
float(len(self.textlines) + 1)
self.textlines.append(textline)
class Alignments(object): class Alignments(object):
""" """
Represent the number of textlines aligned with this one across each edge. Represent the number of textlines aligned with this one across each edge.
@ -241,7 +202,7 @@ class Alignments(object):
return (self.max_v()-1) * (self.max_h()-1) return (self.max_v()-1) * (self.max_h()-1)
class TextEdges2(BaseTextEdges): class TextEdges2(TextAlignments):
"""Defines a dict of vertical (top, bottom, middle) and """Defines a dict of vertical (top, bottom, middle) and
horizontal (left, right, and middle) text alignments found on horizontal (left, right, and middle) text alignments found on
the PDF page. The dict has three keys based on the alignments, the PDF page. The dict has three keys based on the alignments,
@ -250,15 +211,6 @@ class TextEdges2(BaseTextEdges):
def __init__(self): def __init__(self):
super().__init__(ALL_ALIGNMENTS) super().__init__(ALL_ALIGNMENTS)
# # For each possible alignment, list of tuples coordinate/textlines
# self._textedges = {
# "left": [],
# "right": [],
# "middle": [],
# "bottom": [],
# "top": [],
# "center": []
# }
# For each textline, dictionary "edge type" to # For each textline, dictionary "edge type" to
# "number of textlines aligned" # "number of textlines aligned"
self._textlines_alignments = {} self._textlines_alignments = {}
@ -268,8 +220,8 @@ class TextEdges2(BaseTextEdges):
self.max_cols = None self.max_cols = None
@staticmethod @staticmethod
def _create_new_text_edge(coord, textline, align=None): def _create_new_text_edge(coord, textline, align):
return TextEdge2(coord, textline) return TextAlignment(coord, textline, align)
def _update_edge(self, edge, coord, textline): def _update_edge(self, edge, coord, textline):
edge.register_aligned_textline(textline, coord) edge.register_aligned_textline(textline, coord)