Refactor out _text_bbox

2020-04-24 15:18:38 -07:00 · 2020-04-24 15:18:38 -07:00 · a401d33fd9
parent 87d95a098c
commit a401d33fd9
6 changed files with 15 additions and 56 deletions
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -18,7 +18,7 @@ from ..utils import (
    bbox_from_str,
    text_in_bbox,
    text_in_bbox_per_axis,
-    bbox_from_text,
+    bbox_from_textlines,
    distance_tl_to_bbox,
    find_columns_coordinates
 )
@ -561,30 +561,6 @@ class Hybrid(BaseParser):
        self.row_tol = row_tol
        self.column_tol = column_tol
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _text_bbox(t_bbox):
        """Compute the bounding box enclosing all the text found on a page.

        Parameters
        ----------
        t_bbox : dict
            Dict with two keys 'horizontal' and 'vertical' with lists of
            LTTextLineHorizontals and LTTextLineVerticals respectively.

        Returns
        -------
        text_bbox : tuple
            Tuple (x0, y0, x1, y1) in pdf coordinate space.

        """
        # Gather the textlines of every direction into one flat list so each
        # extreme below is a single reduction over the same collection.
        textlines = [
            textline for key in t_bbox for textline in t_bbox[key]
        ]
        # min()/max() raise ValueError when there are no textlines at all.
        x_low = min(textline.x0 for textline in textlines)
        y_low = min(textline.y0 for textline in textlines)
        x_high = max(textline.x1 for textline in textlines)
        y_high = max(textline.y1 for textline in textlines)
        return (x_low, y_low, x_high, y_high)
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _group_rows(text, row_tol=2):
@ -821,7 +797,7 @@ class Hybrid(BaseParser):
            tls_in_bbox = text_in_bbox(bbox, textlines)
            # and expand the text box to fully contain them
-            bbox = bbox_from_text(tls_in_bbox)
+            bbox = bbox_from_textlines(tls_in_bbox)
            # FRH: do we need to repeat this?
            # tls_in_bbox = text_in_bbox(bbox, textlines)
@ -864,8 +840,9 @@ class Hybrid(BaseParser):
            self.vertical_text
        )
-        text_x_min, text_y_min, text_x_max, text_y_max = \
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
-            self._text_bbox(self.t_bbox)
+            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
        )
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -7,7 +7,12 @@ import numpy as np
 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis)
+from ..utils import (
    bbox_from_str,
    bbox_from_textlines,
    text_in_bbox,
    text_in_bbox_per_axis
 )
 class Stream(BaseParser):
@ -76,29 +81,6 @@ class Stream(BaseParser):
        self.row_tol = row_tol
        self.column_tol = column_tol
    @staticmethod
    def _text_bbox(t_bbox):
        """Compute the bounding box enclosing all the text found on a page.

        Parameters
        ----------
        t_bbox : dict
            Dict with two keys 'horizontal' and 'vertical' with lists of
            LTTextLineHorizontals and LTTextLineVerticals respectively.

        Returns
        -------
        text_bbox : tuple
            Tuple (x0, y0, x1, y1) in pdf coordinate space.

        """
        # Gather the textlines of every direction into one flat list so each
        # extreme below is a single reduction over the same collection.
        textlines = [
            textline for key in t_bbox for textline in t_bbox[key]
        ]
        # min()/max() raise ValueError when there are no textlines at all.
        x_low = min(textline.x0 for textline in textlines)
        y_low = min(textline.y0 for textline in textlines)
        x_high = max(textline.x1 for textline in textlines)
        y_high = max(textline.y1 for textline in textlines)
        return (x_low, y_low, x_high, y_high)
    @staticmethod
    def _group_rows(text, row_tol=2):
        """Groups PDFMiner text objects into rows vertically
@ -328,8 +310,9 @@ class Stream(BaseParser):
            self.vertical_text
        )
-        text_x_min, text_y_min, text_x_max, text_y_max = \
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
-            self._text_bbox(self.t_bbox)
+            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
        )
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -421,7 +421,6 @@ def bbox_from_str(bbox_str):
    y1 = float(y1)
    x2 = float(x2)
    y2 = float(y2)
    # FRHTODO: do things still work if I do x1, y1, x2, y2?
    return (
        min(x1, x2),
        min(y1, y2),
@ -487,7 +486,7 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
    return t_bbox
-def bbox_from_text(textlines):
+def bbox_from_textlines(textlines):
    """Returns the smallest bbox containing all the text objects passed as
    a parameters.
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/files/baseline_plots/test_stream_textedge_plot.png
+++ b/tests/files/baseline_plots/test_stream_textedge_plot.png