diff --git a/camelot/core.py b/camelot/core.py index acf9433..bda612a 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -28,9 +28,66 @@ TEXTEDGE_REQUIRED_ELEMENTS = 4 TABLE_AREA_PADDING = 10 -class TextEdge(object): +HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"] +VERTICAL_ALIGNMENTS = ["top", "bottom", "center"] +ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS + + +class TextAlignment(object): + """Represents a list of textlines sharing an alignment on a coordinate. + + The alignment can be left/right/middle or top/bottom/center. + + (PDF coordinate space) + + Parameters + ---------- + coord : float + coordinate of the initial text edge. Depending on the alignment + it could be a vertical or horizontal coordinate. + textline : obj + the original textline to start the alignment + align : str + Name of the alignment (e.g. "left", "top", etc) + + Attributes + ---------- + coord : float + The coordinate aligned averaged out across textlines. It can be along + the x or y axis. + textlines : array + Array of textlines that demonstrate this alignment. + align : str + Name of the alignment (e.g. "left", "top", etc) + + """ + + def __init__(self, coord, textline, align): + self.coord = coord + self.textlines = [textline] + self.align = align + + def __repr__(self): + text_inside = " | ".join( + map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "") + return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \ + f"textlines text='{text_inside}...'>" + + def register_aligned_textline(self, textline, coord): + """Update new textline to this alignment, adapting its average.""" + # Fold the new coordinate into the running mean, weighting the current + # mean by the number of textlines registered so far + self.coord = (self.coord * len(self.textlines) + coord) / \ + float(len(self.textlines) + 1) + self.textlines.append(textline) + + + +class TextEdge(TextAlignment): """Defines a text edge coordinates relative to a left-bottom - origin. (PDF coordinate space) + origin. (PDF coordinate space). + + An edge is an alignment bounded over a segment. 
Parameters ---------- @@ -53,11 +110,10 @@ class TextEdge(object): """ - def __init__(self, coord, y0, y1, align="left"): - self.coord = coord + def __init__(self, coord, textline, y0, y1, align): + super().__init__(coord, textline, align) self.y0 = y0 self.y1 = y1 - self.align = align self.intersections = 0 self.is_valid = False @@ -70,14 +126,13 @@ class TextEdge(object): self.is_valid, ) - def update_coords(self, x, y0, edge_tol=50): + def update_coords(self, x, textline, edge_tol=50): """Updates the text edge's x and bottom y coordinates and sets the is_valid attribute. """ - if np.isclose(self.y0, y0, atol=edge_tol): - self.coord = (self.intersections * self.coord + x) / \ - float(self.intersections + 1) - self.y0 = y0 + if np.isclose(self.y0, textline.y0, atol=edge_tol): + self.register_aligned_textline(textline, x) + self.y0 = textline.y0 self.intersections += 1 # a textedge is valid only if it extends uninterrupted # over a required number of textlines @@ -85,22 +140,18 @@ class TextEdge(object): self.is_valid = True -HORIZONTAL_ALIGNMENTS = ["left", "right", "middle"] -VERTICAL_ALIGNMENTS = ["top", "bottom", "center"] -ALL_ALIGNMENTS = HORIZONTAL_ALIGNMENTS + VERTICAL_ALIGNMENTS - - -class BaseTextEdges(object): +class TextAlignments(object): """Defines a dict of text edges accross alignment references. 
""" def __init__(self, alignment_names): + # For each possible alignment, list of tuples coordinate/textlines self._textedges = {} for alignment_name in alignment_names: self._textedges[alignment_name] = [] @staticmethod - def _create_new_text_edge(coord, textline, align=None): + def _create_new_text_edge(coord, textline, align): return NotImplemented def _update_edge(self, edge, coord, textline): @@ -131,12 +182,12 @@ class BaseTextEdges(object): idx_insert = idx_closest if idx_insert is not None: new_edge = self._create_new_text_edge( - coord, textline, align=alignment + coord, textline, alignment ) edge_array.insert(idx_insert, new_edge) -class TextEdges(BaseTextEdges): +class TextEdges(TextAlignments): """Defines a dict of left, right and middle text edges found on the PDF page. The dict has three keys based on the alignments, and each key's value is a list of camelot.core.TextEdge objects. @@ -147,19 +198,19 @@ class TextEdges(BaseTextEdges): self.edge_tol = edge_tol @staticmethod - def _create_new_text_edge(coord, textline, align=None): + def _create_new_text_edge(coord, textline, align): y0 = textline.y0 y1 = textline.y1 - return TextEdge(coord, y0, y1, align=align) + return TextEdge(coord, textline, y0, y1, align) def add(self, coord, textline, align): """Adds a new text edge to the current dict. 
""" - te = self._create_new_text_edge(coord, textline, align=align) + te = self._create_new_text_edge(coord, textline, align) self._textedges[align].append(te) def _update_edge(self, edge, coord, textline): - edge.update_coords(coord, textline.y0, self.edge_tol) + edge.update_coords(coord, textline, self.edge_tol) def generate(self, textlines): """Generates the text edges dict based on horizontal text diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 0c20c0f..00bc7eb 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -8,7 +8,7 @@ import copy import warnings from .base import BaseParser -from ..core import (BaseTextEdges, ALL_ALIGNMENTS) +from ..core import (TextAlignment, TextAlignments, ALL_ALIGNMENTS) from ..utils import ( get_index_closest_point, get_textline_coords, @@ -137,45 +137,6 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap): return new_bbox -class TextEdge2(object): - """Text edge coordinates relative to a left-bottom origin. - - (PDF coordinate space) - - Parameters - ---------- - coord : float - coordinate of the text edge. Depending on the alignment - it could be a vertical or horizontal coordinate. - - Attributes - ---------- - textlines: array - Array of textlines that demonstrate this alignment. - coord: float - The coordinate aligned averaged out across textlines. 
- """ - def __init__(self, coord, textline): - self.coord = coord - self.textlines = [textline] - - def __repr__(self): - text_inside = " | ".join( - map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "") - return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \ - f"textlines text='{text_inside}...'>" - - def register_aligned_textline(self, textline, coord): - """Update new textline to this alignment, adapting its average.""" - # Increase the intersections for this segment, expand it up, - # and adjust the x based on the new value - self.coord = (self.coord * len(self.textlines) + coord) / \ - float(len(self.textlines) + 1) - self.textlines.append(textline) - - class Alignments(object): """ Represent the number of textlines aligned with this one across each edge. @@ -241,7 +202,7 @@ class Alignments(object): return (self.max_v()-1) * (self.max_h()-1) -class TextEdges2(BaseTextEdges): +class TextEdges2(TextAlignments): """Defines a dict of vertical (top, bottom, middle) and horizontal (left, right, and middle) text alignments found on the PDF page. The dict has three keys based on the alignments, @@ -250,15 +211,6 @@ class TextEdges2(BaseTextEdges): def __init__(self): super().__init__(ALL_ALIGNMENTS) - # # For each possible alignment, list of tuples coordinate/textlines - # self._textedges = { - # "left": [], - # "right": [], - # "middle": [], - # "bottom": [], - # "top": [], - # "center": [] - # } # For each textline, dictionary "edge type" to # "number of textlines aligned" self._textlines_alignments = {} @@ -268,8 +220,8 @@ class TextEdges2(BaseTextEdges): self.max_cols = None @staticmethod - def _create_new_text_edge(coord, textline, align=None): - return TextEdge2(coord, textline) + def _create_new_text_edge(coord, textline, align): + return TextAlignment(coord, textline, align) def _update_edge(self, edge, coord, textline): edge.register_aligned_textline(textline, coord)