Common parent TextBaseParser for Stream and Hybrid

pull/153/head
Frh 2020-04-24 15:54:58 -07:00
parent a401d33fd9
commit 18581640be
5 changed files with 87 additions and 92 deletions

View File

@ -136,7 +136,7 @@ class TextEdge(TextAlignment):
class TextAlignments(object): class TextAlignments(object):
"""Defines a dict of text edges accross alignment references. """Defines a dict of text edges across reference alignments.
""" """
def __init__(self, alignment_names): def __init__(self, alignment_names):

View File

@ -169,3 +169,62 @@ class BaseParser(object):
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text table.cells[r_idx][c_idx].text = text
return pos_errors return pos_errors
def extract_tables(self):
    """Build one table per detected table area and return them as a list.

    Returns an empty list straight away when
    ``self._document_has_no_text()`` is truthy.  Otherwise
    ``self._generate_table_bbox()`` is called to populate
    ``self.table_bbox``, whose keys are then visited in descending
    order of their second element (the y-coord, per the original
    comment).  The position in that ordering is used as the table index.

    Each parser supplies ``_generate_columns_and_rows(bbox, table_idx)``,
    which must return a 4-tuple ``(cols, rows, v_s, h_s)``, and
    ``_generate_table(table_idx, cols, rows, v_s=..., h_s=...)``.
    """
    if self._document_has_no_text():
        return []

    # Identify plausible areas within the doc where tables lie,
    # populate table_bbox keys with these areas.
    self._generate_table_bbox()

    # Highest second coordinate first.
    ordered_areas = sorted(
        self.table_bbox.keys(), key=lambda area: area[1], reverse=True
    )

    extracted = []
    for position, area in enumerate(ordered_areas):
        columns, row_bounds, vertical, horizontal = (
            self._generate_columns_and_rows(area, position)
        )
        new_table = self._generate_table(
            position, columns, row_bounds, v_s=vertical, h_s=horizontal
        )
        # Remember which area this table was extracted from.
        new_table._bbox = area
        extracted.append(new_table)
    return extracted
class TextBaseParser(BaseParser):
    """Base class for all text parsers.

    Collects the constructor arguments and attributes shared by the
    text-based parsers that derive from it (``Stream`` and ``Hybrid``).
    """

    def __init__(
        self,
        parser_id,
        table_regions=None,
        table_areas=None,
        columns=None,
        flag_size=False,
        split_text=False,
        strip_text="",
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        **kwargs
    ):
        """Initialise the state common to text-based parsers.

        Parameters
        ----------
        parser_id
            Identifier of the concrete parser (subclasses pass e.g.
            ``"stream"`` or ``"hybrid"``); forwarded unchanged to
            ``BaseParser.__init__`` as its first positional argument.
        table_regions, table_areas, split_text, strip_text, flag_size
            Forwarded as keyword arguments to ``BaseParser.__init__``.
        columns
            Stored on ``self.columns`` and then checked by
            ``self._validate_columns()``.
        edge_tol, row_tol, column_tol
            Tolerances stored as same-named attributes.
        **kwargs
            Accepted for call compatibility; not used and not forwarded.
        """
        super().__init__(
            # Forward the caller's id rather than a hard-coded "stream",
            # so every subclass is registered under its own identifier.
            parser_id,
            table_regions=table_regions,
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
        )
        self.columns = columns
        # Must run after self.columns is assigned.
        # NOTE(review): presumably raises on a columns/table_areas
        # mismatch -- confirm against _validate_columns.
        self._validate_columns()
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol
        # No text edges have been computed yet at construction time.
        self.textedges = None

View File

@ -7,7 +7,7 @@ import numpy as np
import copy import copy
import warnings import warnings
from .base import BaseParser from .base import TextBaseParser
from ..core import ( from ..core import (
TextAlignments, TextAlignments,
ALL_ALIGNMENTS, ALL_ALIGNMENTS,
@ -257,7 +257,7 @@ class TextNetworks(TextAlignments):
for align_id in self._textedges: for align_id in self._textedges:
edge_array = self._textedges[align_id] edge_array = self._textedges[align_id]
gaps = [] gaps = []
vertical = align_id in ["left", "right", "middle"] vertical = align_id in HORIZONTAL_ALIGNMENTS
sort_function = (lambda tl: tl.y0) \ sort_function = (lambda tl: tl.y0) \
if vertical \ if vertical \
else (lambda tl: tl.x0) else (lambda tl: tl.x0)
@ -491,7 +491,7 @@ class TextNetworks(TextAlignments):
) )
class Hybrid(BaseParser): class Hybrid(TextBaseParser):
"""Hybrid method of parsing looks for spaces between text """Hybrid method of parsing looks for spaces between text
to parse the table. to parse the table.
@ -548,18 +548,14 @@ class Hybrid(BaseParser):
"hybrid", "hybrid",
table_regions=table_regions, table_regions=table_regions,
table_areas=table_areas, table_areas=table_areas,
columns=columns,
flag_size=flag_size,
split_text=split_text, split_text=split_text,
strip_text=strip_text, strip_text=strip_text,
flag_size=flag_size, edge_tol=edge_tol,
debug=debug row_tol=row_tol,
column_tol=column_tol,
) )
self.columns = columns
self.textedges = None
self._validate_columns()
self.edge_tol = edge_tol
self.row_tol = row_tol
self.column_tol = column_tol
# FRHTODO: Check if needed, refactor with Stream # FRHTODO: Check if needed, refactor with Stream
@staticmethod @staticmethod
@ -832,10 +828,10 @@ class Hybrid(BaseParser):
)) ))
# FRHTODO: Check if needed, refactor with Stream # FRHTODO: Check if needed, refactor with Stream
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, bbox, table_idx):
# select elements which lie within table_bbox # select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis( self.t_bbox = text_in_bbox_per_axis(
tk, bbox,
self.horizontal_text, self.horizontal_text,
self.vertical_text self.vertical_text
) )
@ -908,7 +904,7 @@ class Hybrid(BaseParser):
cols = self._add_columns(cols, inner_text, self.row_tol) cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max) cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows return cols, rows, None, None
# FRHTODO: Check if needed, refactor with Stream # FRHTODO: Check if needed, refactor with Stream
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):
@ -922,23 +918,3 @@ class Hybrid(BaseParser):
table._textedges = self.textedges table._textedges = self.textedges
return table return table
def extract_tables(self):
    """Return the list of tables found in the document.

    An empty list is returned when ``self._document_has_no_text()`` is
    truthy.  Otherwise ``self._generate_table_bbox()`` fills
    ``self.table_bbox`` and its keys are processed in descending order
    of their second element (the y-coord, per the original comment),
    with the position in that order used as the table index.

    Here ``_generate_columns_and_rows(table_idx, bbox)`` must return a
    ``(cols, rows)`` pair, which is handed to
    ``_generate_table(table_idx, cols, rows)``.
    """
    if self._document_has_no_text():
        return []

    # Identify plausible areas within the doc where tables lie,
    # populate table_bbox keys with these areas.
    self._generate_table_bbox()

    # Highest second coordinate first.
    areas_top_down = sorted(
        self.table_bbox.keys(), key=lambda area: area[1], reverse=True
    )

    found = []
    for number, area in enumerate(areas_top_down):
        columns, row_bounds = self._generate_columns_and_rows(number, area)
        built = self._generate_table(number, columns, row_bounds)
        # Remember which area this table was extracted from.
        built._bbox = area
        found.append(built)
    return found

View File

@ -252,21 +252,21 @@ class Lattice(BaseParser):
table_bbox, vertical_segments, horizontal_segments, pdf_scalers table_bbox, vertical_segments, horizontal_segments, pdf_scalers
) )
def _generate_columns_and_rows(self, tk): def _generate_columns_and_rows(self, bbox, table_idx):
# select elements which lie within table_bbox # select elements which lie within table_bbox
v_s, h_s = segments_in_bbox( v_s, h_s = segments_in_bbox(
tk, self.vertical_segments, self.horizontal_segments bbox, self.vertical_segments, self.horizontal_segments
) )
self.t_bbox = text_in_bbox_per_axis( self.t_bbox = text_in_bbox_per_axis(
tk, bbox,
self.horizontal_text, self.horizontal_text,
self.vertical_text self.vertical_text
) )
cols, rows = zip(*self.table_bbox[tk]) cols, rows = zip(*self.table_bbox[bbox])
cols, rows = list(cols), list(rows) cols, rows = list(cols), list(rows)
cols.extend([tk[0], tk[2]]) cols.extend([bbox[0], bbox[2]])
rows.extend([tk[1], tk[3]]) rows.extend([bbox[1], bbox[3]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol) cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
rows = merge_close_lines( rows = merge_close_lines(
@ -302,22 +302,3 @@ class Lattice(BaseParser):
table._textedges = None table._textedges = None
return table return table
def extract_tables(self):
    """Return the list of tables found in the document.

    An empty list is returned when ``self._document_has_no_text()`` is
    truthy.  Otherwise ``self._generate_table_bbox()`` fills
    ``self.table_bbox`` and its keys are processed in descending order
    of their second element (the y-coord, per the original comment),
    with the position in that order used as the table index.

    Here ``_generate_columns_and_rows(key)`` must return the 4-tuple
    ``(cols, rows, v_s, h_s)``; the last two items are passed to
    ``_generate_table`` as the ``v_s`` and ``h_s`` keyword arguments.
    """
    if self._document_has_no_text():
        return []

    self._generate_table_bbox()

    # Highest second coordinate first.
    keys_top_down = sorted(
        self.table_bbox.keys(), key=lambda key: key[1], reverse=True
    )

    collected = []
    for number, key in enumerate(keys_top_down):
        columns, row_bounds, vertical, horizontal = (
            self._generate_columns_and_rows(key)
        )
        built = self._generate_table(
            number, columns, row_bounds, v_s=vertical, h_s=horizontal
        )
        # Remember which table_bbox key this table came from.
        built._bbox = key
        collected.append(built)
    return collected

View File

@ -5,7 +5,7 @@ import warnings
import numpy as np import numpy as np
from .base import BaseParser from .base import TextBaseParser
from ..core import TextEdges from ..core import TextEdges
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
@ -15,7 +15,7 @@ from ..utils import (
) )
class Stream(BaseParser): class Stream(TextBaseParser):
"""Stream method of parsing looks for spaces between text """Stream method of parsing looks for spaces between text
to parse the table. to parse the table.
@ -71,15 +71,14 @@ class Stream(BaseParser):
"stream", "stream",
table_regions=table_regions, table_regions=table_regions,
table_areas=table_areas, table_areas=table_areas,
columns=columns,
flag_size=flag_size,
split_text=split_text, split_text=split_text,
strip_text=strip_text, strip_text=strip_text,
flag_size=flag_size, edge_tol=edge_tol,
row_tol=row_tol,
column_tol=column_tol,
) )
self.columns = columns
self._validate_columns()
self.edge_tol = edge_tol
self.row_tol = row_tol
self.column_tol = column_tol
@staticmethod @staticmethod
def _group_rows(text, row_tol=2): def _group_rows(text, row_tol=2):
@ -302,10 +301,10 @@ class Stream(BaseParser):
table_bbox[bbox_from_str(area_str)] = None table_bbox[bbox_from_str(area_str)] = None
self.table_bbox = table_bbox self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, bbox, table_idx):
# select elements which lie within table_bbox # select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis( self.t_bbox = text_in_bbox_per_axis(
tk, bbox,
self.horizontal_text, self.horizontal_text,
self.vertical_text self.vertical_text
) )
@ -378,7 +377,7 @@ class Stream(BaseParser):
cols = self._add_columns(cols, inner_text, self.row_tol) cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max) cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows return cols, rows, None, None
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):
table = self._initialize_new_table(table_idx, cols, rows) table = self._initialize_new_table(table_idx, cols, rows)
@ -391,23 +390,3 @@ class Stream(BaseParser):
table._textedges = self.textedges table._textedges = self.textedges
return table return table
def extract_tables(self):
    """Return the list of tables found in the document.

    An empty list is returned when ``self._document_has_no_text()`` is
    truthy.  Otherwise ``self._generate_table_bbox()`` fills
    ``self.table_bbox`` and its keys are processed in descending order
    of their second element (the y-coord, per the original comment),
    with the position in that order used as the table index.

    Here ``_generate_columns_and_rows(table_idx, bbox)`` must return a
    ``(cols, rows)`` pair, which is handed to
    ``_generate_table(table_idx, cols, rows)``.
    """
    if self._document_has_no_text():
        return []

    # Identify plausible areas within the doc where tables lie,
    # populate table_bbox keys with these areas.
    self._generate_table_bbox()

    # Highest second coordinate first.
    regions = sorted(
        self.table_bbox.keys(), key=lambda region: region[1], reverse=True
    )

    results = []
    for ordinal, region in enumerate(regions):
        col_bounds, row_bounds = self._generate_columns_and_rows(
            ordinal, region
        )
        parsed = self._generate_table(ordinal, col_bounds, row_bounds)
        # Remember which area this table was extracted from.
        parsed._bbox = region
        results.append(parsed)
    return results