Refactor out _text_bbox
parent
87d95a098c
commit
a401d33fd9
|
|
@ -18,7 +18,7 @@ from ..utils import (
|
|||
bbox_from_str,
|
||||
text_in_bbox,
|
||||
text_in_bbox_per_axis,
|
||||
bbox_from_text,
|
||||
bbox_from_textlines,
|
||||
distance_tl_to_bbox,
|
||||
find_columns_coordinates
|
||||
)
|
||||
|
|
@ -561,30 +561,6 @@ class Hybrid(BaseParser):
|
|||
self.row_tol = row_tol
|
||||
self.column_tol = column_tol
|
||||
|
||||
# FRHTODO: Check if needed, refactor with Stream
|
||||
@staticmethod
|
||||
def _text_bbox(t_bbox):
|
||||
"""Returns bounding box for the text present on a page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t_bbox : dict
|
||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||
|
||||
Returns
|
||||
-------
|
||||
text_bbox : tuple
|
||||
Tuple (x0, y0, x1, y1) in pdf coordinate space.
|
||||
|
||||
"""
|
||||
xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
|
||||
ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
|
||||
xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
|
||||
ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
|
||||
text_bbox = (xmin, ymin, xmax, ymax)
|
||||
return text_bbox
|
||||
|
||||
# FRHTODO: Check if needed, refactor with Stream
|
||||
@staticmethod
|
||||
def _group_rows(text, row_tol=2):
|
||||
|
|
@ -821,7 +797,7 @@ class Hybrid(BaseParser):
|
|||
tls_in_bbox = text_in_bbox(bbox, textlines)
|
||||
|
||||
# and expand the text box to fully contain them
|
||||
bbox = bbox_from_text(tls_in_bbox)
|
||||
bbox = bbox_from_textlines(tls_in_bbox)
|
||||
|
||||
# FRH: do we need to repeat this?
|
||||
# tls_in_bbox = text_in_bbox(bbox, textlines)
|
||||
|
|
@ -864,8 +840,9 @@ class Hybrid(BaseParser):
|
|||
self.vertical_text
|
||||
)
|
||||
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = \
|
||||
self._text_bbox(self.t_bbox)
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
||||
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
||||
)
|
||||
rows_grouped = self._group_rows(
|
||||
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,12 @@ import numpy as np
|
|||
|
||||
from .base import BaseParser
|
||||
from ..core import TextEdges
|
||||
from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis)
|
||||
from ..utils import (
|
||||
bbox_from_str,
|
||||
bbox_from_textlines,
|
||||
text_in_bbox,
|
||||
text_in_bbox_per_axis
|
||||
)
|
||||
|
||||
|
||||
class Stream(BaseParser):
|
||||
|
|
@ -76,29 +81,6 @@ class Stream(BaseParser):
|
|||
self.row_tol = row_tol
|
||||
self.column_tol = column_tol
|
||||
|
||||
@staticmethod
|
||||
def _text_bbox(t_bbox):
|
||||
"""Returns bounding box for the text present on a page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t_bbox : dict
|
||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
||||
|
||||
Returns
|
||||
-------
|
||||
text_bbox : tuple
|
||||
Tuple (x0, y0, x1, y1) in pdf coordinate space.
|
||||
|
||||
"""
|
||||
xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
|
||||
ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
|
||||
xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
|
||||
ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
|
||||
text_bbox = (xmin, ymin, xmax, ymax)
|
||||
return text_bbox
|
||||
|
||||
@staticmethod
|
||||
def _group_rows(text, row_tol=2):
|
||||
"""Groups PDFMiner text objects into rows vertically
|
||||
|
|
@ -328,8 +310,9 @@ class Stream(BaseParser):
|
|||
self.vertical_text
|
||||
)
|
||||
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = \
|
||||
self._text_bbox(self.t_bbox)
|
||||
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
||||
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
||||
)
|
||||
rows_grouped = self._group_rows(
|
||||
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||
|
|
|
|||
|
|
@ -421,7 +421,6 @@ def bbox_from_str(bbox_str):
|
|||
y1 = float(y1)
|
||||
x2 = float(x2)
|
||||
y2 = float(y2)
|
||||
# FRHTODO: do things still work if I do x1, y1, x2, y2?
|
||||
return (
|
||||
min(x1, x2),
|
||||
min(y1, y2),
|
||||
|
|
@ -487,7 +486,7 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
|
|||
return t_bbox
|
||||
|
||||
|
||||
def bbox_from_text(textlines):
|
||||
def bbox_from_textlines(textlines):
|
||||
"""Returns the smallest bbox containing all the text objects passed as
|
||||
parameters.
|
||||
|
||||
|
|
|
|||
Binary file not shown.
|
Before Width: | Height: | Size: 105 KiB After Width: | Height: | Size: 105 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 197 KiB After Width: | Height: | Size: 197 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 113 KiB After Width: | Height: | Size: 113 KiB |
Loading…
Reference in New Issue