Refactor out _text_bbox
parent
87d95a098c
commit
a401d33fd9
|
|
@ -18,7 +18,7 @@ from ..utils import (
|
||||||
bbox_from_str,
|
bbox_from_str,
|
||||||
text_in_bbox,
|
text_in_bbox,
|
||||||
text_in_bbox_per_axis,
|
text_in_bbox_per_axis,
|
||||||
bbox_from_text,
|
bbox_from_textlines,
|
||||||
distance_tl_to_bbox,
|
distance_tl_to_bbox,
|
||||||
find_columns_coordinates
|
find_columns_coordinates
|
||||||
)
|
)
|
||||||
|
|
@ -561,30 +561,6 @@ class Hybrid(BaseParser):
|
||||||
self.row_tol = row_tol
|
self.row_tol = row_tol
|
||||||
self.column_tol = column_tol
|
self.column_tol = column_tol
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
|
||||||
@staticmethod
|
|
||||||
def _text_bbox(t_bbox):
|
|
||||||
"""Returns bounding box for the text present on a page.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t_bbox : dict
|
|
||||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
|
||||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
text_bbox : tuple
|
|
||||||
Tuple (x0, y0, x1, y1) in pdf coordinate space.
|
|
||||||
|
|
||||||
"""
|
|
||||||
xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
text_bbox = (xmin, ymin, xmax, ymax)
|
|
||||||
return text_bbox
|
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
# FRHTODO: Check if needed, refactor with Stream
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _group_rows(text, row_tol=2):
|
def _group_rows(text, row_tol=2):
|
||||||
|
|
@ -821,7 +797,7 @@ class Hybrid(BaseParser):
|
||||||
tls_in_bbox = text_in_bbox(bbox, textlines)
|
tls_in_bbox = text_in_bbox(bbox, textlines)
|
||||||
|
|
||||||
# and expand the text box to fully contain them
|
# and expand the text box to fully contain them
|
||||||
bbox = bbox_from_text(tls_in_bbox)
|
bbox = bbox_from_textlines(tls_in_bbox)
|
||||||
|
|
||||||
# FRH: do we need to repeat this?
|
# FRH: do we need to repeat this?
|
||||||
# tls_in_bbox = text_in_bbox(bbox, textlines)
|
# tls_in_bbox = text_in_bbox(bbox, textlines)
|
||||||
|
|
@ -864,8 +840,9 @@ class Hybrid(BaseParser):
|
||||||
self.vertical_text
|
self.vertical_text
|
||||||
)
|
)
|
||||||
|
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = \
|
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
||||||
self._text_bbox(self.t_bbox)
|
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
||||||
|
)
|
||||||
rows_grouped = self._group_rows(
|
rows_grouped = self._group_rows(
|
||||||
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,12 @@ import numpy as np
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import TextEdges
|
from ..core import TextEdges
|
||||||
from ..utils import (bbox_from_str, text_in_bbox, text_in_bbox_per_axis)
|
from ..utils import (
|
||||||
|
bbox_from_str,
|
||||||
|
bbox_from_textlines,
|
||||||
|
text_in_bbox,
|
||||||
|
text_in_bbox_per_axis
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
|
|
@ -76,29 +81,6 @@ class Stream(BaseParser):
|
||||||
self.row_tol = row_tol
|
self.row_tol = row_tol
|
||||||
self.column_tol = column_tol
|
self.column_tol = column_tol
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _text_bbox(t_bbox):
|
|
||||||
"""Returns bounding box for the text present on a page.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t_bbox : dict
|
|
||||||
Dict with two keys 'horizontal' and 'vertical' with lists of
|
|
||||||
LTTextLineHorizontals and LTTextLineVerticals respectively.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
text_bbox : tuple
|
|
||||||
Tuple (x0, y0, x1, y1) in pdf coordinate space.
|
|
||||||
|
|
||||||
"""
|
|
||||||
xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
|
|
||||||
text_bbox = (xmin, ymin, xmax, ymax)
|
|
||||||
return text_bbox
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _group_rows(text, row_tol=2):
|
def _group_rows(text, row_tol=2):
|
||||||
"""Groups PDFMiner text objects into rows vertically
|
"""Groups PDFMiner text objects into rows vertically
|
||||||
|
|
@ -328,8 +310,9 @@ class Stream(BaseParser):
|
||||||
self.vertical_text
|
self.vertical_text
|
||||||
)
|
)
|
||||||
|
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = \
|
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
||||||
self._text_bbox(self.t_bbox)
|
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
||||||
|
)
|
||||||
rows_grouped = self._group_rows(
|
rows_grouped = self._group_rows(
|
||||||
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||||
|
|
|
||||||
|
|
@ -421,7 +421,6 @@ def bbox_from_str(bbox_str):
|
||||||
y1 = float(y1)
|
y1 = float(y1)
|
||||||
x2 = float(x2)
|
x2 = float(x2)
|
||||||
y2 = float(y2)
|
y2 = float(y2)
|
||||||
# FRHTODO: do things still work if I do x1, y1, x2, y2?
|
|
||||||
return (
|
return (
|
||||||
min(x1, x2),
|
min(x1, x2),
|
||||||
min(y1, y2),
|
min(y1, y2),
|
||||||
|
|
@ -487,7 +486,7 @@ def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
|
||||||
return t_bbox
|
return t_bbox
|
||||||
|
|
||||||
|
|
||||||
def bbox_from_text(textlines):
|
def bbox_from_textlines(textlines):
|
||||||
"""Returns the smallest bbox containing all the text objects passed as
|
"""Returns the smallest bbox containing all the text objects passed as
|
||||||
parameters.
|
parameters.
|
||||||
|
|
||||||
|
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 105 KiB After Width: | Height: | Size: 105 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 197 KiB After Width: | Height: | Size: 197 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 113 KiB After Width: | Height: | Size: 113 KiB |
Loading…
Reference in New Issue