Refactor out _text_bbox

pull/153/head
Frh 2020-04-24 15:18:38 -07:00
parent 87d95a098c
commit a401d33fd9
6 changed files with 15 additions and 56 deletions

View File

@ -18,7 +18,7 @@ from ..utils import (
bbox_from_str,
text_in_bbox,
text_in_bbox_per_axis,
bbox_from_text,
bbox_from_textlines,
distance_tl_to_bbox,
find_columns_coordinates
)
@ -561,30 +561,6 @@ class Hybrid(BaseParser):
self.row_tol = row_tol
self.column_tol = column_tol
# FRHTODO: Check if needed, refactor with Stream
@staticmethod
def _text_bbox(t_bbox):
    """Compute the bounding box enclosing all the text found on a page.

    Parameters
    ----------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' with lists of
        LTTextLineHorizontals and LTTextLineVerticals respectively.

    Returns
    -------
    text_bbox : tuple
        Tuple (x0, y0, x1, y1) in pdf coordinate space.

    Raises
    ------
    ValueError
        If `t_bbox` holds no textline at all (min/max of an empty
        sequence).

    """
    # Gather the textlines of every direction once, then take the
    # extreme coordinate along each side.
    textlines = [tl for key in t_bbox for tl in t_bbox[key]]
    x_low = min(tl.x0 for tl in textlines)
    y_low = min(tl.y0 for tl in textlines)
    x_high = max(tl.x1 for tl in textlines)
    y_high = max(tl.y1 for tl in textlines)
    return (x_low, y_low, x_high, y_high)
# FRHTODO: Check if needed, refactor with Stream
@staticmethod
def _group_rows(text, row_tol=2):
@ -821,7 +797,7 @@ class Hybrid(BaseParser):
tls_in_bbox = text_in_bbox(bbox, textlines)
# and expand the text box to fully contain them
bbox = bbox_from_text(tls_in_bbox)
bbox = bbox_from_textlines(tls_in_bbox)
# FRH: do we need to repeat this?
# tls_in_bbox = text_in_bbox(bbox, textlines)
@ -864,8 +840,9 @@ class Hybrid(BaseParser):
self.vertical_text
)
text_x_min, text_y_min, text_x_max, text_y_max = \
self._text_bbox(self.t_bbox)
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
)
rows_grouped = self._group_rows(
self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

View File

@ -7,7 +7,12 @@ import numpy as np
from .base import BaseParser
from ..core import TextEdges
from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis)
from ..utils import (
bbox_from_str,
bbox_from_textlines,
text_in_bbox,
text_in_bbox_per_axis
)
class Stream(BaseParser):
@ -76,29 +81,6 @@ class Stream(BaseParser):
self.row_tol = row_tol
self.column_tol = column_tol
@staticmethod
def _text_bbox(t_bbox):
    """Compute the bounding box enclosing all the text found on a page.

    Parameters
    ----------
    t_bbox : dict
        Dict with two keys 'horizontal' and 'vertical' with lists of
        LTTextLineHorizontals and LTTextLineVerticals respectively.

    Returns
    -------
    text_bbox : tuple
        Tuple (x0, y0, x1, y1) in pdf coordinate space.

    Raises
    ------
    ValueError
        If `t_bbox` holds no textline at all (min/max of an empty
        sequence).

    """
    # Collect the textlines from each direction into a single list so the
    # four extremes are computed over the same flattened sequence.
    every_line = []
    for direction in t_bbox:
        every_line.extend(t_bbox[direction])
    lower_left = (
        min(line.x0 for line in every_line),
        min(line.y0 for line in every_line),
    )
    upper_right = (
        max(line.x1 for line in every_line),
        max(line.y1 for line in every_line),
    )
    return lower_left + upper_right
@staticmethod
def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically
@ -328,8 +310,9 @@ class Stream(BaseParser):
self.vertical_text
)
text_x_min, text_y_min, text_x_max, text_y_max = \
self._text_bbox(self.t_bbox)
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
)
rows_grouped = self._group_rows(
self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

View File

@ -421,7 +421,6 @@ def bbox_from_str(bbox_str):
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
# FRHTODO: do things still work if I do x1, y1, x2, y2?
return (
min(x1, x2),
min(y1, y2),
@ -487,7 +486,7 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
return t_bbox
def bbox_from_text(textlines):
def bbox_from_textlines(textlines):
"""Returns the smallest bbox containing all the text objects passed as
parameters.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 105 KiB

After

Width:  |  Height:  |  Size: 105 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 197 KiB

After

Width:  |  Height:  |  Size: 197 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

After

Width:  |  Height:  |  Size: 113 KiB