More refactoring / linting
parent
d673a3b6e0
commit
58823e57e9
|
|
@ -431,6 +431,12 @@ class Table(object):
|
||||||
self.whitespace = compute_whitespace(data)
|
self.whitespace = compute_whitespace(data)
|
||||||
self.pdf_size = (parser.pdf_width, parser.pdf_height)
|
self.pdf_size = (parser.pdf_width, parser.pdf_height)
|
||||||
|
|
||||||
|
_text = []
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text])
|
||||||
|
self._text = _text
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_image(self):
|
def get_pdf_image(self):
|
||||||
"""Compute pdf image and cache it
|
"""Compute pdf image and cache it
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ from ..utils import (
|
||||||
scale_image,
|
scale_image,
|
||||||
scale_pdf,
|
scale_pdf,
|
||||||
segments_in_bbox,
|
segments_in_bbox,
|
||||||
text_in_bbox,
|
text_in_bbox_per_axis,
|
||||||
merge_close_lines,
|
merge_close_lines,
|
||||||
)
|
)
|
||||||
from ..image_processing import (
|
from ..image_processing import (
|
||||||
|
|
@ -252,19 +252,17 @@ class Lattice(BaseParser):
|
||||||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||||
)
|
)
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
|
||||||
|
def _generate_columns_and_rows(self, tk):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
t_bbox = {}
|
|
||||||
v_s, h_s = segments_in_bbox(
|
v_s, h_s = segments_in_bbox(
|
||||||
tk, self.vertical_segments, self.horizontal_segments
|
tk, self.vertical_segments, self.horizontal_segments
|
||||||
)
|
)
|
||||||
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
self.t_bbox = text_in_bbox_per_axis(
|
||||||
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
tk,
|
||||||
|
self.horizontal_text,
|
||||||
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
|
self.vertical_text
|
||||||
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
|
)
|
||||||
|
|
||||||
self.t_bbox = t_bbox
|
|
||||||
|
|
||||||
cols, rows = zip(*self.table_bbox[tk])
|
cols, rows = zip(*self.table_bbox[tk])
|
||||||
cols, rows = list(cols), list(rows)
|
cols, rows = list(cols), list(rows)
|
||||||
|
|
@ -299,10 +297,6 @@ class Lattice(BaseParser):
|
||||||
table.record_parse_metadata(self)
|
table.record_parse_metadata(self)
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
|
||||||
table._text = _text
|
|
||||||
table._image = self.pdf_image # Reuse the image used for calc
|
table._image = self.pdf_image # Reuse the image used for calc
|
||||||
table._bbox_unscaled = self.table_bbox_unscaled
|
table._bbox_unscaled = self.table_bbox_unscaled
|
||||||
table._segments = (self.vertical_segments, self.horizontal_segments)
|
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||||
|
|
@ -321,8 +315,7 @@ class Lattice(BaseParser):
|
||||||
for table_idx, tk in enumerate(
|
for table_idx, tk in enumerate(
|
||||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||||
):
|
):
|
||||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(
|
cols, rows, v_s, h_s = self._generate_columns_and_rows(tk)
|
||||||
table_idx, tk)
|
|
||||||
table = self._generate_table(
|
table = self._generate_table(
|
||||||
table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
||||||
table._bbox = tk
|
table._bbox = tk
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import numpy as np
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..core import TextEdges
|
from ..core import TextEdges
|
||||||
from ..utils import (text_in_bbox)
|
from ..utils import (text_in_bbox, text_in_bbox_per_axis)
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(BaseParser):
|
||||||
|
|
@ -331,14 +331,11 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, table_idx, tk):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
t_bbox = {}
|
self.t_bbox = text_in_bbox_per_axis(
|
||||||
t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
|
tk,
|
||||||
t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
|
self.horizontal_text,
|
||||||
|
self.vertical_text
|
||||||
t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
|
)
|
||||||
t_bbox["vertical"].sort(key=lambda x: (x.x0, -x.y0))
|
|
||||||
|
|
||||||
self.t_bbox = t_bbox
|
|
||||||
|
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = \
|
text_x_min, text_y_min, text_x_max, text_y_max = \
|
||||||
self._text_bbox(self.t_bbox)
|
self._text_bbox(self.t_bbox)
|
||||||
|
|
@ -415,10 +412,6 @@ class Stream(BaseParser):
|
||||||
table.record_parse_metadata(self)
|
table.record_parse_metadata(self)
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
|
||||||
table._text = _text
|
|
||||||
table._bbox = self.table_bbox
|
table._bbox = self.table_bbox
|
||||||
table._segments = None
|
table._segments = None
|
||||||
table._textedges = self.textedges
|
table._textedges = self.textedges
|
||||||
|
|
@ -435,12 +428,12 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
_tables = []
|
_tables = []
|
||||||
# sort tables based on y-coord
|
# sort tables based on y-coord
|
||||||
for table_idx, tk in enumerate(
|
for table_idx, bbox in enumerate(
|
||||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
||||||
):
|
):
|
||||||
cols, rows = self._generate_columns_and_rows(table_idx, tk)
|
cols, rows = self._generate_columns_and_rows(table_idx, bbox)
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
table = self._generate_table(table_idx, cols, rows)
|
||||||
table._bbox = tk
|
table._bbox = bbox
|
||||||
_tables.append(table)
|
_tables.append(table)
|
||||||
|
|
||||||
return _tables
|
return _tables
|
||||||
|
|
|
||||||
|
|
@ -417,6 +417,35 @@ def text_in_bbox(bbox, text):
|
||||||
return t_bbox
|
return t_bbox
|
||||||
|
|
||||||
|
|
||||||
|
def text_in_bbox_per_axis(bbox, horizontal_text, vertical_text):
    """Collect the text objects lying inside a bounding box, grouped by
    text orientation.

    Each orientation is filtered independently through ``text_in_bbox`` and
    then sorted in place before being returned.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    horizontal_text : List of PDFMiner text objects.
    vertical_text : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : dict
        Dict with exactly two keys, "horizontal" and "vertical", each
        mapping to the list of text objects of that orientation found
        inside the box. Horizontal text is ordered by descending ``y0``
        then ascending ``x0``; vertical text is ordered by ascending
        ``x0`` then descending ``y0``.

    """
    in_box_horizontal = text_in_bbox(bbox, horizontal_text)
    in_box_vertical = text_in_bbox(bbox, vertical_text)

    # Horizontal text: highest y0 first, ties broken left to right.
    in_box_horizontal.sort(key=lambda obj: (-obj.y0, obj.x0))
    # Vertical text: lowest x0 first, ties broken by highest y0.
    in_box_vertical.sort(key=lambda obj: (obj.x0, -obj.y0))

    return {
        "horizontal": in_box_horizontal,
        "vertical": in_box_vertical,
    }
|
||||||
|
|
||||||
|
|
||||||
def bbox_from_text(textlines):
|
def bbox_from_text(textlines):
|
||||||
"""Returns the smallest bbox containing all the text objects passed as
|
"""Returns the smallest bbox containing all the text objects passed as
|
||||||
a parameters.
|
a parameters.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue