Refactoring TextEdges code across hybrid and stream

pull/153/head
Frh 2020-04-23 12:55:09 -07:00
parent 414708d8c7
commit adb14d3522
2 changed files with 49 additions and 41 deletions

View File

@ -33,8 +33,8 @@ class TextEdge(object):
Parameters Parameters
---------- ----------
x : float coord : float
x-coordinate of the text edge. coordinate of the text edge. Can be x or y.
y0 : float y0 : float
y-coordinate of bottommost point. y-coordinate of bottommost point.
y1 : float y1 : float
@ -52,8 +52,8 @@ class TextEdge(object):
""" """
def __init__(self, x, y0, y1, align="left"): def __init__(self, coord, y0, y1, align="left"):
self.x = x self.coord = coord
self.y0 = y0 self.y0 = y0
self.y1 = y1 self.y1 = y1
self.align = align self.align = align
@ -62,7 +62,7 @@ class TextEdge(object):
def __repr__(self): def __repr__(self):
return "<TextEdge x={} y0={} y1={} align={} valid={}>".format( return "<TextEdge x={} y0={} y1={} align={} valid={}>".format(
round(self.x, 2), round(self.coord, 2),
round(self.y0, 2), round(self.y0, 2),
round(self.y1, 2), round(self.y1, 2),
self.align, self.align,
@ -74,7 +74,7 @@ class TextEdge(object):
the is_valid attribute. the is_valid attribute.
""" """
if np.isclose(self.y0, y0, atol=edge_tol): if np.isclose(self.y0, y0, atol=edge_tol):
self.x = (self.intersections * self.x + x) / \ self.coord = (self.intersections * self.coord + x) / \
float(self.intersections + 1) float(self.intersections + 1)
self.y0 = y0 self.y0 = y0
self.intersections += 1 self.intersections += 1
@ -84,52 +84,59 @@ class TextEdge(object):
self.is_valid = True self.is_valid = True
class TextEdges(object): HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"]
VERTICAL_ALIGNMENTS = ["top", "bottom", "center"]
ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS
class BaseTextEdges(object):
"""Defines a dict of text edges accross alignment references.
"""
def __init__(self, alignment_names):
self._textedges = {}
for alignment_name in alignment_names:
self._textedges[alignment_name] = []
class TextEdges(BaseTextEdges):
"""Defines a dict of left, right and middle text edges found on """Defines a dict of left, right and middle text edges found on
the PDF page. The dict has three keys based on the alignments, the PDF page. The dict has three keys based on the alignments,
and each key's value is a list of camelot.core.TextEdge objects. and each key's value is a list of camelot.core.TextEdge objects.
""" """
def __init__(self, edge_tol=50): def __init__(self, edge_tol=50):
super().__init__(HORIZONTAL_ALIGNMENTS)
self.edge_tol = edge_tol self.edge_tol = edge_tol
self._textedges = {"left": [], "right": [], "middle": []}
@staticmethod
def get_x_coord(textline, align):
"""Returns the x coordinate of a text row based on the
specified alignment.
"""
coords = get_textline_coords(textline)
return coords[align]
def find(self, x_coord, align): def find(self, x_coord, align):
"""Returns the index of an existing text edge using """Returns the index of an existing text edge using
the specified x coordinate and alignment. the specified x coordinate and alignment.
""" """
for i, te in enumerate(self._textedges[align]): for i, te in enumerate(self._textedges[align]):
if np.isclose(te.x, x_coord, atol=0.5): if np.isclose(te.coord, x_coord, atol=0.5):
return i return i
return None return None
def add(self, textline, align): def add(self, coord, textline, align):
"""Adds a new text edge to the current dict. """Adds a new text edge to the current dict.
""" """
x = self.get_x_coord(textline, align)
y0 = textline.y0 y0 = textline.y0
y1 = textline.y1 y1 = textline.y1
te = TextEdge(x, y0, y1, align=align) te = TextEdge(coord, y0, y1, align=align)
self._textedges[align].append(te) self._textedges[align].append(te)
def update(self, textline): def update(self, textline):
"""Updates an existing text edge in the current dict. """Updates an existing text edge in the current dict.
""" """
for align in ["left", "right", "middle"]: coords = get_textline_coords(textline)
x_coord = self.get_x_coord(textline, align) for alignment, edge_array in self._textedges.items():
idx = self.find(x_coord, align) x_coord = coords[alignment]
idx = self.find(x_coord, alignment)
if idx is None: if idx is None:
self.add(textline, align) self.add(x_coord, textline, alignment)
else: else:
self._textedges[align][idx].update_coords( edge_array[idx].update_coords(
x_coord, textline.y0, edge_tol=self.edge_tol x_coord, textline.y0, edge_tol=self.edge_tol
) )
@ -184,12 +191,12 @@ class TextEdges(object):
return (x0, y0, x1, y1) return (x0, y0, x1, y1)
# sort relevant textedges in reading order # sort relevant textedges in reading order
relevant_textedges.sort(key=lambda te: (-te.y0, te.x)) relevant_textedges.sort(key=lambda te: (-te.y0, te.coord))
table_areas = {} table_areas = {}
for te in relevant_textedges: for te in relevant_textedges:
if not table_areas: if not table_areas:
table_areas[(te.x, te.y0, te.x, te.y1)] = None table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
else: else:
found = None found = None
for area in table_areas: for area in table_areas:
@ -198,13 +205,13 @@ class TextEdges(object):
found = area found = area
break break
if found is None: if found is None:
table_areas[(te.x, te.y0, te.x, te.y1)] = None table_areas[(te.coord, te.y0, te.coord, te.y1)] = None
else: else:
table_areas.pop(found) table_areas.pop(found)
updated_area = ( updated_area = (
found[0], found[0],
min(te.y0, found[1]), min(te.y0, found[1]),
max(found[2], te.x), max(found[2], te.coord),
max(found[3], te.y1), max(found[3], te.y1),
) )
table_areas[updated_area] = None table_areas[updated_area] = None

View File

@ -8,6 +8,7 @@ import copy
import warnings import warnings
from .base import BaseParser from .base import BaseParser
from ..core import (BaseTextEdges, ALL_ALIGNMENTS)
from ..utils import ( from ..utils import (
get_index_closest_point, get_index_closest_point,
get_textline_coords, get_textline_coords,
@ -240,7 +241,7 @@ class Alignments(object):
return (self.max_v()-1) * (self.max_h()-1) return (self.max_v()-1) * (self.max_h()-1)
class TextEdges2(object): class TextEdges2(BaseTextEdges):
"""Defines a dict of vertical (top, bottom, middle) and """Defines a dict of vertical (top, bottom, middle) and
horizontal (left, right, and middle) text alignments found on horizontal (left, right, and middle) text alignments found on
the PDF page. The dict has three keys based on the alignments, the PDF page. The dict has three keys based on the alignments,
@ -248,15 +249,16 @@ class TextEdges2(object):
""" """
def __init__(self): def __init__(self):
# For each possible alignment, list of tuples coordinate/textlines super().__init__(ALL_ALIGNMENTS)
self._textedges = { # # For each possible alignment, list of tuples coordinate/textlines
"left": [], # self._textedges = {
"right": [], # "left": [],
"middle": [], # "right": [],
"bottom": [], # "middle": [],
"top": [], # "bottom": [],
"center": [] # "top": [],
} # "center": []
# }
# For each textline, dictionary "edge type" to # For each textline, dictionary "edge type" to
# "number of textlines aligned" # "number of textlines aligned"
self._textlines_alignments = {} self._textlines_alignments = {}
@ -269,8 +271,7 @@ class TextEdges2(object):
"""Updates an existing text edge in the current dict. """Updates an existing text edge in the current dict.
""" """
coords = get_textline_coords(textline) coords = get_textline_coords(textline)
for alignment in self._textedges: for alignment, edge_array in self._textedges.items():
edge_array = self._textedges[alignment]
coord = coords[alignment] coord = coords[alignment]
# Find the index of the closest existing element (or 0 if none) # Find the index of the closest existing element (or 0 if none)