Refactor out _text_bbox

2020-04-24 15:18:38 -07:00 · 2020-04-24 15:18:38 -07:00 · a401d33fd9
parent 87d95a098c
commit a401d33fd9
6 changed files with 15 additions and 56 deletions
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@@ -18,7 +18,7 @@ from ..utils import (
    bbox_from_str,
    text_in_bbox,
    text_in_bbox_per_axis,
-    bbox_from_text,
+    bbox_from_textlines,
    distance_tl_to_bbox,
    find_columns_coordinates
 )
@@ -561,30 +561,6 @@ class Hybrid(BaseParser):
        self.row_tol = row_tol
        self.column_tol = column_tol

-    # FRHTODO: Check if needed, refactor with Stream
-    @staticmethod
-    def _text_bbox(t_bbox):
-        """Returns bounding box for the text present on a page.
-
-        Parameters
-        ----------
-        t_bbox : dict
-            Dict with two keys 'horizontal' and 'vertical' with lists of
-            LTTextLineHorizontals and LTTextLineVerticals respectively.
-
-        Returns
-        -------
-        text_bbox : tuple
-            Tuple (x0, y0, x1, y1) in pdf coordinate space.
-
-        """
-        xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
-        ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
-        xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
-        ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
-        text_bbox = (xmin, ymin, xmax, ymax)
-        return text_bbox
-
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _group_rows(text, row_tol=2):
@@ -821,7 +797,7 @@ class Hybrid(BaseParser):
            tls_in_bbox = text_in_bbox(bbox, textlines)

            # and expand the text box to fully contain them
-            bbox = bbox_from_text(tls_in_bbox)
+            bbox = bbox_from_textlines(tls_in_bbox)

            # FRH: do we need to repeat this?
            # tls_in_bbox = text_in_bbox(bbox, textlines)
@@ -864,8 +840,9 @@ class Hybrid(BaseParser):
            self.vertical_text
        )

-        text_x_min, text_y_min, text_x_max, text_y_max = \
-            self._text_bbox(self.t_bbox)
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
+            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
+        )
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -7,7 +7,12 @@ import numpy as np

 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis)
+from ..utils import (
+    bbox_from_str,
+    bbox_from_textlines,
+    text_in_bbox,
+    text_in_bbox_per_axis
+)


 class Stream(BaseParser):
@@ -76,29 +81,6 @@ class Stream(BaseParser):
        self.row_tol = row_tol
        self.column_tol = column_tol

-    @staticmethod
-    def _text_bbox(t_bbox):
-        """Returns bounding box for the text present on a page.
-
-        Parameters
-        ----------
-        t_bbox : dict
-            Dict with two keys 'horizontal' and 'vertical' with lists of
-            LTTextLineHorizontals and LTTextLineVerticals respectively.
-
-        Returns
-        -------
-        text_bbox : tuple
-            Tuple (x0, y0, x1, y1) in pdf coordinate space.
-
-        """
-        xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
-        ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
-        xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
-        ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
-        text_bbox = (xmin, ymin, xmax, ymax)
-        return text_bbox
-
    @staticmethod
    def _group_rows(text, row_tol=2):
        """Groups PDFMiner text objects into rows vertically
@@ -328,8 +310,9 @@ class Stream(BaseParser):
            self.vertical_text
        )

-        text_x_min, text_y_min, text_x_max, text_y_max = \
-            self._text_bbox(self.t_bbox)
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
+            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
+        )
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -421,7 +421,6 @@ def bbox_from_str(bbox_str):
    y1 = float(y1)
    x2 = float(x2)
    y2 = float(y2)
-    # FRHTODO: do things still work if I do x1, y1, x2, y2?
    return (
        min(x1, x2),
        min(y1, y2),
@@ -487,7 +486,7 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
    return t_bbox


-def bbox_from_text(textlines):
+def bbox_from_textlines(textlines):
    """Returns the smallest bbox containing all the text objects passed as
    a parameters.

--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/files/baseline_plots/test_stream_textedge_plot.png
+++ b/tests/files/baseline_plots/test_stream_textedge_plot.png