236 lines
9.3 KiB
Python
236 lines
9.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from ..utils import (
|
|
bboxes_overlap,
|
|
boundaries_to_split_lines,
|
|
)
|
|
|
|
import numpy as np
|
|
from .base import BaseParser
|
|
from .network import Network
|
|
from .lattice import Lattice
|
|
|
|
|
|
class Hybrid(BaseParser):
|
|
"""Defines a hybrid parser, leveraging both network and lattice parsers.
|
|
|
|
Parameters
|
|
----------
|
|
table_regions : list, optional (default: None)
|
|
List of page regions that may contain tables of the form x1,y1,x2,y2
|
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
|
in PDF coordinate space.
|
|
table_areas : list, optional (default: None)
|
|
List of table area strings of the form x1,y1,x2,y2
|
|
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
|
|
in PDF coordinate space.
|
|
columns : list, optional (default: None)
|
|
List of column x-coordinates strings where the coordinates
|
|
are comma-separated.
|
|
split_text : bool, optional (default: False)
|
|
Split text that spans across multiple cells.
|
|
flag_size : bool, optional (default: False)
|
|
Flag text based on font size. Useful to detect
|
|
super/subscripts. Adds <s></s> around flagged text.
|
|
strip_text : str, optional (default: '')
|
|
Characters that should be stripped from a string before
|
|
assigning it to a cell.
|
|
edge_tol : int, optional (default: 50)
|
|
Tolerance parameter for extending textedges vertically.
|
|
row_tol : int, optional (default: 2)
|
|
Tolerance parameter used to combine text vertically,
|
|
to generate rows.
|
|
column_tol : int, optional (default: 0)
|
|
Tolerance parameter used to combine text horizontally,
|
|
to generate columns.
|
|
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
table_regions=None,
|
|
table_areas=None,
|
|
columns=None,
|
|
flag_size=False,
|
|
split_text=False,
|
|
strip_text="",
|
|
edge_tol=None,
|
|
row_tol=2,
|
|
column_tol=0,
|
|
debug=False,
|
|
**kwargs):
|
|
super().__init__(
|
|
"hybrid",
|
|
table_regions=table_regions,
|
|
table_areas=table_areas,
|
|
flag_size=flag_size,
|
|
split_text=split_text,
|
|
strip_text=strip_text,
|
|
debug=debug,
|
|
)
|
|
self.columns = columns # Columns settings impacts the hybrid table
|
|
self.network_parser = Network(
|
|
table_regions=table_regions,
|
|
table_areas=table_areas,
|
|
columns=columns,
|
|
flag_size=flag_size,
|
|
split_text=split_text,
|
|
strip_text=strip_text,
|
|
edge_tol=edge_tol,
|
|
row_tol=row_tol,
|
|
column_tol=column_tol,
|
|
debug=debug,
|
|
)
|
|
self.lattice_parser = Lattice(
|
|
table_regions=table_regions,
|
|
table_areas=table_areas,
|
|
flag_size=flag_size,
|
|
split_text=split_text,
|
|
strip_text=strip_text,
|
|
edge_tol=edge_tol,
|
|
row_tol=row_tol,
|
|
column_tol=column_tol,
|
|
debug=debug,
|
|
)
|
|
|
|
def prepare_page_parse(self, filename, layout, dimensions,
|
|
page_idx, layout_kwargs):
|
|
super().prepare_page_parse(filename, layout, dimensions,
|
|
page_idx, layout_kwargs)
|
|
self.network_parser.prepare_page_parse(
|
|
filename, layout, dimensions, page_idx, layout_kwargs)
|
|
self.lattice_parser.prepare_page_parse(
|
|
filename, layout, dimensions, page_idx, layout_kwargs)
|
|
|
|
def _generate_columns_and_rows(self, bbox, table_idx):
|
|
parser = self.table_bbox_parses[bbox]
|
|
return parser._generate_columns_and_rows(bbox, table_idx)
|
|
|
|
def _generate_table(self, table_idx, bbox, cols, rows, **kwargs):
|
|
parser = self.table_bbox_parses[bbox]
|
|
table = parser._generate_table(table_idx, bbox, cols, rows, **kwargs)
|
|
# Because hybrid can inject extraneous splits from both lattice and
|
|
# network, remove lines / cols that are completely empty.
|
|
table.df = table.df.replace('', np.nan)
|
|
table.df = table.df.dropna(axis=0, how="all")
|
|
table.df = table.df.dropna(axis=1, how="all")
|
|
table.df = table.df.replace(np.nan, '')
|
|
table.shape = table.df.shape
|
|
return table
|
|
|
|
@staticmethod
|
|
def _augment_boundaries_with_splits(boundaries, splits, tolerance=0):
|
|
""" Augment existing boundaries using provided hard splits.
|
|
|
|
Boundaries: |---| |-| |---------|
|
|
Splits: | | | |
|
|
Augmented: |-------|-----|-------|--|
|
|
"""
|
|
idx_boundaries = len(boundaries) - 1
|
|
idx_splits = len(splits) - 1
|
|
previous_boundary = None
|
|
while True:
|
|
if idx_splits < 0:
|
|
# No more splits to incorporate, we're done
|
|
break
|
|
split = splits[idx_splits]
|
|
|
|
if idx_boundaries < 0:
|
|
# Need to insert remaining splits
|
|
new_boundary = [split, boundaries[0][0]]
|
|
boundaries.insert(0, new_boundary)
|
|
idx_splits = idx_splits - 1
|
|
else:
|
|
boundary = \
|
|
boundaries[idx_boundaries]
|
|
if boundary[1] < \
|
|
split + tolerance:
|
|
# The lattice column is further to the right of our
|
|
# col boundary. We move our left boundary to match.
|
|
boundary[1] = split
|
|
# And if there was another segment after, we make its
|
|
# right boundary match as well so that there's no gap
|
|
if previous_boundary is not None:
|
|
previous_boundary[0] = split
|
|
idx_splits = idx_splits - 1
|
|
elif boundary[0] > \
|
|
split - tolerance:
|
|
# Our boundary is fully after the split, move on
|
|
idx_boundaries = idx_boundaries - 1
|
|
previous_boundary = boundary
|
|
if idx_boundaries < 0:
|
|
# If this is the last boundary to the left, set its
|
|
# edge at the split
|
|
boundary[0] = split
|
|
idx_splits = idx_splits - 1
|
|
else:
|
|
# The split is inside our boundary: split it
|
|
new_boundary = [split, boundary[1]]
|
|
boundaries.insert(idx_boundaries + 1, new_boundary)
|
|
boundary[1] = split
|
|
previous_boundary = new_boundary
|
|
idx_splits = idx_splits - 1
|
|
return boundaries
|
|
|
|
def _merge_bbox_analysis(self, lattice_bbox, network_bbox):
|
|
""" Identify splits that were only detected by lattice or by network
|
|
"""
|
|
lattice_parse = self.lattice_parser.table_bbox_parses[lattice_bbox]
|
|
lattice_cols = lattice_parse["col_anchors"]
|
|
|
|
network_bbox_data = self.network_parser.table_bbox_parses[network_bbox]
|
|
network_cols_boundaries = network_bbox_data["cols_boundaries"]
|
|
|
|
# Favor network, but complete or adjust its columns based on the
|
|
# splits identified by lattice.
|
|
if network_cols_boundaries is None:
|
|
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
|
|
else:
|
|
network_cols_boundaries = self._augment_boundaries_with_splits(
|
|
network_cols_boundaries,
|
|
lattice_cols,
|
|
self.lattice_parser.joint_tol)
|
|
augmented_bbox = (
|
|
network_cols_boundaries[0][0],
|
|
min(lattice_bbox[1], network_bbox[1]),
|
|
network_cols_boundaries[-1][1],
|
|
max(lattice_bbox[3], network_bbox[3]),
|
|
)
|
|
network_bbox_data["cols_anchors"] = \
|
|
boundaries_to_split_lines(network_cols_boundaries)
|
|
|
|
del self.network_parser.table_bbox_parses[network_bbox]
|
|
self.network_parser.table_bbox_parses[augmented_bbox] = \
|
|
network_bbox_data
|
|
self.table_bbox_parses[augmented_bbox] = self.network_parser
|
|
|
|
def _generate_table_bbox(self):
|
|
# Collect bboxes from both parsers
|
|
self.lattice_parser._generate_table_bbox()
|
|
_lattice_bboxes = sorted(
|
|
self.lattice_parser.table_bbox_parses,
|
|
key=lambda bbox: (bbox[0], -bbox[1]))
|
|
self.network_parser._generate_table_bbox()
|
|
_network_bboxes = sorted(
|
|
self.network_parser.table_bbox_parses,
|
|
key=lambda bbox: (bbox[0], -bbox[1]))
|
|
|
|
# Merge the data from both processes
|
|
for lattice_bbox in _lattice_bboxes:
|
|
merged = False
|
|
|
|
for idx in range(len(_network_bboxes)-1, -1, -1):
|
|
network_bbox = _network_bboxes[idx]
|
|
if not bboxes_overlap(lattice_bbox, network_bbox):
|
|
continue
|
|
self._merge_bbox_analysis(lattice_bbox, network_bbox)
|
|
# network_bbox_data["cols_boundaries"]
|
|
del _network_bboxes[idx]
|
|
merged = True
|
|
if not merged:
|
|
self.table_bbox_parses[lattice_bbox] = self.lattice_parser
|
|
|
|
# Add the bboxes from network that haven't been merged
|
|
for network_bbox in _network_bboxes:
|
|
self.table_bbox_parses[network_bbox] = self.network_parser
|