diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py index 2e09306..869e2d2 100644 --- a/camelot/parsers/hybrid.py +++ b/camelot/parsers/hybrid.py @@ -18,7 +18,7 @@ from ..utils import ( bbox_from_str, text_in_bbox, text_in_bbox_per_axis, - bbox_from_text, + bbox_from_textlines, distance_tl_to_bbox, find_columns_coordinates ) @@ -561,30 +561,6 @@ class Hybrid(BaseParser): self.row_tol = row_tol self.column_tol = column_tol - # FRHTODO: Check if needed, refactor with Stream - @staticmethod - def _text_bbox(t_bbox): - """Returns bounding box for the text present on a page. - - Parameters - ---------- - t_bbox : dict - Dict with two keys 'horizontal' and 'vertical' with lists of - LTTextLineHorizontals and LTTextLineVerticals respectively. - - Returns - ------- - text_bbox : tuple - Tuple (x0, y0, x1, y1) in pdf coordinate space. - - """ - xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction]) - ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction]) - xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction]) - ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction]) - text_bbox = (xmin, ymin, xmax, ymax) - return text_bbox - # FRHTODO: Check if needed, refactor with Stream @staticmethod def _group_rows(text, row_tol=2): @@ -821,7 +797,7 @@ class Hybrid(BaseParser): tls_in_bbox = text_in_bbox(bbox, textlines) # and expand the text box to fully contain them - bbox = bbox_from_text(tls_in_bbox) + bbox = bbox_from_textlines(tls_in_bbox) # FRH: do we need to repeat this? # tls_in_bbox = text_in_bbox(bbox, textlines) @@ -864,8 +840,9 @@ class Hybrid(BaseParser): self.vertical_text ) - text_x_min, text_y_min, text_x_max, text_y_max = \ - self._text_bbox(self.t_bbox) + text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( + self.t_bbox["horizontal"] + self.t_bbox["vertical"] + ) rows_grouped = self._group_rows( self.t_bbox["horizontal"], row_tol=self.row_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index eb3479c..08aaa41 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -7,7 +7,12 @@ import numpy as np from .base import BaseParser from ..core import TextEdges -from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis) +from ..utils import ( + bbox_from_str, + bbox_from_textlines, + text_in_bbox, + text_in_bbox_per_axis +) class Stream(BaseParser): @@ -76,29 +81,6 @@ class Stream(BaseParser): self.row_tol = row_tol self.column_tol = column_tol - @staticmethod - def _text_bbox(t_bbox): - """Returns bounding box for the text present on a page. - - Parameters - ---------- - t_bbox : dict - Dict with two keys 'horizontal' and 'vertical' with lists of - LTTextLineHorizontals and LTTextLineVerticals respectively. - - Returns - ------- - text_bbox : tuple - Tuple (x0, y0, x1, y1) in pdf coordinate space. - - """ - xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction]) - ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction]) - xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction]) - ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction]) - text_bbox = (xmin, ymin, xmax, ymax) - return text_bbox - @staticmethod def _group_rows(text, row_tol=2): """Groups PDFMiner text objects into rows vertically @@ -328,8 +310,9 @@ class Stream(BaseParser): self.vertical_text ) - text_x_min, text_y_min, text_x_max, text_y_max = \ - self._text_bbox(self.t_bbox) + text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines( + self.t_bbox["horizontal"] + self.t_bbox["vertical"] + ) rows_grouped = self._group_rows( self.t_bbox["horizontal"], row_tol=self.row_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) diff --git a/camelot/utils.py b/camelot/utils.py index d3895a6..752e4b5 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -421,7 +421,6 @@ def bbox_from_str(bbox_str): y1 = float(y1) x2 = float(x2) y2 = float(y2) - # FRHTODO: do things still work if I do x1, y1, x2, y2? return ( min(x1, x2), min(y1, y2), @@ -487,7 +486,7 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text): return t_bbox -def bbox_from_text(textlines): +def bbox_from_textlines(textlines): """Returns the smallest bbox containing all the text objects passed as a parameters. diff --git a/tests/files/baseline_plots/test_hybrid_contour_plot.png b/tests/files/baseline_plots/test_hybrid_contour_plot.png index 121147e..26d2b57 100644 Binary files a/tests/files/baseline_plots/test_hybrid_contour_plot.png and b/tests/files/baseline_plots/test_hybrid_contour_plot.png differ diff --git a/tests/files/baseline_plots/test_hybrid_textedge_plot.png b/tests/files/baseline_plots/test_hybrid_textedge_plot.png index 4f858e5..fc9496b 100644 Binary files a/tests/files/baseline_plots/test_hybrid_textedge_plot.png and b/tests/files/baseline_plots/test_hybrid_textedge_plot.png differ diff --git a/tests/files/baseline_plots/test_stream_textedge_plot.png b/tests/files/baseline_plots/test_stream_textedge_plot.png index d2e3a50..b9ecf7d 100644 Binary files a/tests/files/baseline_plots/test_stream_textedge_plot.png and b/tests/files/baseline_plots/test_stream_textedge_plot.png differ