Common parent TextBaseParser for Stream and Hybrid
parent
a401d33fd9
commit
18581640be
|
|
@ -136,7 +136,7 @@ class TextEdge(TextAlignment):
|
||||||
|
|
||||||
|
|
||||||
class TextAlignments(object):
|
class TextAlignments(object):
|
||||||
"""Defines a dict of text edges accross alignment references.
|
"""Defines a dict of text edges across reference alignments.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, alignment_names):
|
def __init__(self, alignment_names):
|
||||||
|
|
|
||||||
|
|
@ -169,3 +169,62 @@ class BaseParser(object):
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
table.cells[r_idx][c_idx].text = text
|
table.cells[r_idx][c_idx].text = text
|
||||||
return pos_errors
|
return pos_errors
|
||||||
|
|
||||||
|
def extract_tables(self):
    """Build one table per detected bounding box and return them as a list.

    Returns an empty list when ``self._document_has_no_text()`` is true.
    Otherwise ``self._generate_table_bbox()`` is called to populate
    ``self.table_bbox``, and its keys are processed in descending order
    of their second element (the y-coordinate, so tables are ordered by
    vertical position). The position in that ordering is used as the
    table index.

    Each parser supplies ``_generate_columns_and_rows(bbox, table_idx)``,
    which must return a 4-tuple ``(cols, rows, v_s, h_s)``, and
    ``_generate_table(table_idx, cols, rows, v_s=..., h_s=...)``.
    """
    if self._document_has_no_text():
        return []

    # Locate the candidate table areas; they become the keys of
    # self.table_bbox.
    self._generate_table_bbox()

    # Order the areas by their y-coordinate, largest first.
    ordered_areas = sorted(
        self.table_bbox.keys(),
        key=lambda area: area[1],
        reverse=True,
    )

    extracted = []
    for position, area in enumerate(ordered_areas):
        columns, row_bounds, vertical, horizontal = \
            self._generate_columns_and_rows(area, position)
        new_table = self._generate_table(
            position,
            columns,
            row_bounds,
            v_s=vertical,
            h_s=horizontal,
        )
        # Remember which area this table was built from.
        new_table._bbox = area
        extracted.append(new_table)

    return extracted
|
||||||
|
|
||||||
|
|
||||||
|
class TextBaseParser(BaseParser):
    """Base class for all text parsers.

    Gathers the constructor arguments shared by the text-based parsers
    (Stream and Hybrid both derive from this class) so that each of them
    only has to pass its own parser id.

    Parameters
    ----------
    parser_id : str
        Identifier of the concrete parser (e.g. ``"stream"`` or
        ``"hybrid"``); forwarded unchanged to ``BaseParser``.
    table_regions, table_areas, split_text, strip_text, flag_size
        Forwarded as keyword arguments to ``BaseParser.__init__``.
    columns
        Stored as ``self.columns`` and then checked by
        ``self._validate_columns()``.
    edge_tol, row_tol, column_tol
        Stored on the instance under the same names.
    **kwargs
        Accepted so subclasses can pass extra options through; they are
        not forwarded to ``BaseParser``.
        NOTE(review): confirm whether options such as ``debug`` should be
        forwarded to the base class.
    """

    def __init__(
        self,
        parser_id,
        table_regions=None,
        table_areas=None,
        columns=None,
        flag_size=False,
        split_text=False,
        strip_text="",
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        **kwargs
    ):
        # Forward the caller's parser id instead of a hard-coded "stream",
        # otherwise every subclass (including Hybrid) would be registered
        # as a stream parser.
        super().__init__(
            parser_id,
            table_regions=table_regions,
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
        )
        self.columns = columns
        # _validate_columns is not defined in this class; presumably it is
        # inherited from BaseParser. It runs right after columns is set.
        self._validate_columns()
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol

        # No text edges are known at construction time; the attribute
        # starts out empty.
        self.textedges = None
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import numpy as np
|
||||||
import copy
|
import copy
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import TextBaseParser
|
||||||
from ..core import (
|
from ..core import (
|
||||||
TextAlignments,
|
TextAlignments,
|
||||||
ALL_ALIGNMENTS,
|
ALL_ALIGNMENTS,
|
||||||
|
|
@ -257,7 +257,7 @@ class TextNetworks(TextAlignments):
|
||||||
for align_id in self._textedges:
|
for align_id in self._textedges:
|
||||||
edge_array = self._textedges[align_id]
|
edge_array = self._textedges[align_id]
|
||||||
gaps = []
|
gaps = []
|
||||||
vertical = align_id in ["left", "right", "middle"]
|
vertical = align_id in HORIZONTAL_ALIGNMENTS
|
||||||
sort_function = (lambda tl: tl.y0) \
|
sort_function = (lambda tl: tl.y0) \
|
||||||
if vertical \
|
if vertical \
|
||||||
else (lambda tl: tl.x0)
|
else (lambda tl: tl.x0)
|
||||||
|
|
@ -491,7 +491,7 @@ class TextNetworks(TextAlignments):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class Hybrid(BaseParser):
|
class Hybrid(TextBaseParser):
|
||||||
"""Hybrid method of parsing looks for spaces between text
|
"""Hybrid method of parsing looks for spaces between text
|
||||||
to parse the table.
|
to parse the table.
|
||||||
|
|
||||||
|
|
@ -548,18 +548,14 @@ class Hybrid(BaseParser):
|
||||||
"hybrid",
|
"hybrid",
|
||||||
table_regions=table_regions,
|
table_regions=table_regions,
|
||||||
table_areas=table_areas,
|
table_areas=table_areas,
|
||||||
|
columns=columns,
|
||||||
|
flag_size=flag_size,
|
||||||
split_text=split_text,
|
split_text=split_text,
|
||||||
strip_text=strip_text,
|
strip_text=strip_text,
|
||||||
flag_size=flag_size,
|
edge_tol=edge_tol,
|
||||||
debug=debug
|
row_tol=row_tol,
|
||||||
|
column_tol=column_tol,
|
||||||
)
|
)
|
||||||
self.columns = columns
|
|
||||||
self.textedges = None
|
|
||||||
|
|
||||||
self._validate_columns()
|
|
||||||
self.edge_tol = edge_tol
|
|
||||||
self.row_tol = row_tol
|
|
||||||
self.column_tol = column_tol
|
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
# FRHTODO: Check if needed, refactor with Stream
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -832,10 +828,10 @@ class Hybrid(BaseParser):
|
||||||
))
|
))
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
# FRHTODO: Check if needed, refactor with Stream
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
self.t_bbox = text_in_bbox_per_axis(
|
self.t_bbox = text_in_bbox_per_axis(
|
||||||
tk,
|
bbox,
|
||||||
self.horizontal_text,
|
self.horizontal_text,
|
||||||
self.vertical_text
|
self.vertical_text
|
||||||
)
|
)
|
||||||
|
|
@ -908,7 +904,7 @@ class Hybrid(BaseParser):
|
||||||
cols = self._add_columns(cols, inner_text, self.row_tol)
|
cols = self._add_columns(cols, inner_text, self.row_tol)
|
||||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
||||||
return cols, rows
|
return cols, rows, None, None
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
# FRHTODO: Check if needed, refactor with Stream
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||||
|
|
@ -922,23 +918,3 @@ class Hybrid(BaseParser):
|
||||||
table._textedges = self.textedges
|
table._textedges = self.textedges
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self):
|
|
||||||
if self._document_has_no_text():
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Identify plausible areas within the doc where tables lie,
|
|
||||||
# populate table_bbox keys with these areas.
|
|
||||||
self._generate_table_bbox()
|
|
||||||
|
|
||||||
_tables = []
|
|
||||||
# sort tables based on y-coord
|
|
||||||
for table_idx, bbox in enumerate(
|
|
||||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
|
||||||
):
|
|
||||||
cols, rows = self._generate_columns_and_rows(table_idx, bbox)
|
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
|
||||||
table._bbox = bbox
|
|
||||||
_tables.append(table)
|
|
||||||
|
|
||||||
return _tables
|
|
||||||
|
|
|
||||||
|
|
@ -252,21 +252,21 @@ class Lattice(BaseParser):
|
||||||
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
|
||||||
)
|
)
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, tk):
|
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
v_s, h_s = segments_in_bbox(
|
v_s, h_s = segments_in_bbox(
|
||||||
tk, self.vertical_segments, self.horizontal_segments
|
bbox, self.vertical_segments, self.horizontal_segments
|
||||||
)
|
)
|
||||||
self.t_bbox = text_in_bbox_per_axis(
|
self.t_bbox = text_in_bbox_per_axis(
|
||||||
tk,
|
bbox,
|
||||||
self.horizontal_text,
|
self.horizontal_text,
|
||||||
self.vertical_text
|
self.vertical_text
|
||||||
)
|
)
|
||||||
|
|
||||||
cols, rows = zip(*self.table_bbox[tk])
|
cols, rows = zip(*self.table_bbox[bbox])
|
||||||
cols, rows = list(cols), list(rows)
|
cols, rows = list(cols), list(rows)
|
||||||
cols.extend([tk[0], tk[2]])
|
cols.extend([bbox[0], bbox[2]])
|
||||||
rows.extend([tk[1], tk[3]])
|
rows.extend([bbox[1], bbox[3]])
|
||||||
# sort horizontal and vertical segments
|
# sort horizontal and vertical segments
|
||||||
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
|
||||||
rows = merge_close_lines(
|
rows = merge_close_lines(
|
||||||
|
|
@ -302,22 +302,3 @@ class Lattice(BaseParser):
|
||||||
table._textedges = None
|
table._textedges = None
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self):
|
|
||||||
if self._document_has_no_text():
|
|
||||||
return []
|
|
||||||
|
|
||||||
self._generate_table_bbox()
|
|
||||||
|
|
||||||
_tables = []
|
|
||||||
# sort tables based on y-coord
|
|
||||||
for table_idx, tk in enumerate(
|
|
||||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
|
||||||
):
|
|
||||||
cols, rows, v_s, h_s = self._generate_columns_and_rows(tk)
|
|
||||||
table = self._generate_table(
|
|
||||||
table_idx, cols, rows, v_s=v_s, h_s=h_s)
|
|
||||||
table._bbox = tk
|
|
||||||
_tables.append(table)
|
|
||||||
|
|
||||||
return _tables
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import TextBaseParser
|
||||||
from ..core import TextEdges
|
from ..core import TextEdges
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
bbox_from_str,
|
bbox_from_str,
|
||||||
|
|
@ -15,7 +15,7 @@ from ..utils import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class Stream(BaseParser):
|
class Stream(TextBaseParser):
|
||||||
"""Stream method of parsing looks for spaces between text
|
"""Stream method of parsing looks for spaces between text
|
||||||
to parse the table.
|
to parse the table.
|
||||||
|
|
||||||
|
|
@ -71,15 +71,14 @@ class Stream(BaseParser):
|
||||||
"stream",
|
"stream",
|
||||||
table_regions=table_regions,
|
table_regions=table_regions,
|
||||||
table_areas=table_areas,
|
table_areas=table_areas,
|
||||||
|
columns=columns,
|
||||||
|
flag_size=flag_size,
|
||||||
split_text=split_text,
|
split_text=split_text,
|
||||||
strip_text=strip_text,
|
strip_text=strip_text,
|
||||||
flag_size=flag_size,
|
edge_tol=edge_tol,
|
||||||
|
row_tol=row_tol,
|
||||||
|
column_tol=column_tol,
|
||||||
)
|
)
|
||||||
self.columns = columns
|
|
||||||
self._validate_columns()
|
|
||||||
self.edge_tol = edge_tol
|
|
||||||
self.row_tol = row_tol
|
|
||||||
self.column_tol = column_tol
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _group_rows(text, row_tol=2):
|
def _group_rows(text, row_tol=2):
|
||||||
|
|
@ -302,10 +301,10 @@ class Stream(BaseParser):
|
||||||
table_bbox[bbox_from_str(area_str)] = None
|
table_bbox[bbox_from_str(area_str)] = None
|
||||||
self.table_bbox = table_bbox
|
self.table_bbox = table_bbox
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, table_idx, tk):
|
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||||
# select elements which lie within table_bbox
|
# select elements which lie within table_bbox
|
||||||
self.t_bbox = text_in_bbox_per_axis(
|
self.t_bbox = text_in_bbox_per_axis(
|
||||||
tk,
|
bbox,
|
||||||
self.horizontal_text,
|
self.horizontal_text,
|
||||||
self.vertical_text
|
self.vertical_text
|
||||||
)
|
)
|
||||||
|
|
@ -378,7 +377,7 @@ class Stream(BaseParser):
|
||||||
cols = self._add_columns(cols, inner_text, self.row_tol)
|
cols = self._add_columns(cols, inner_text, self.row_tol)
|
||||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
||||||
return cols, rows
|
return cols, rows, None, None
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||||
table = self._initialize_new_table(table_idx, cols, rows)
|
table = self._initialize_new_table(table_idx, cols, rows)
|
||||||
|
|
@ -391,23 +390,3 @@ class Stream(BaseParser):
|
||||||
table._textedges = self.textedges
|
table._textedges = self.textedges
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def extract_tables(self):
|
|
||||||
if self._document_has_no_text():
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Identify plausible areas within the doc where tables lie,
|
|
||||||
# populate table_bbox keys with these areas.
|
|
||||||
self._generate_table_bbox()
|
|
||||||
|
|
||||||
_tables = []
|
|
||||||
# sort tables based on y-coord
|
|
||||||
for table_idx, bbox in enumerate(
|
|
||||||
sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
|
|
||||||
):
|
|
||||||
cols, rows = self._generate_columns_and_rows(table_idx, bbox)
|
|
||||||
table = self._generate_table(table_idx, cols, rows)
|
|
||||||
table._bbox = bbox
|
|
||||||
_tables.append(table)
|
|
||||||
|
|
||||||
return _tables
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue