Further refactor

Move common parse error stats computation to base parser
Move copy_spanning_text logic to the table
pull/153/head
Frh 2020-04-19 13:28:17 -07:00
parent 583868756a
commit cff7a9698b
4 changed files with 41 additions and 72 deletions

View File

@ -419,6 +419,12 @@ class Table(object):
self.flavor = parser.id self.flavor = parser.id
self.filename = parser.filename self.filename = parser.filename
self.debug_info = parser.debug_info self.debug_info = parser.debug_info
pos_errors = parser.compute_parse_errors(self)
self.accuracy = compute_accuracy([[100, pos_errors]])
if parser.copy_text is not None:
self.copy_spanning_text(parser.copy_text)
data = self.data data = self.data
self.df = pd.DataFrame(data) self.df = pd.DataFrame(data)
self.shape = self.df.shape self.shape = self.df.shape
@ -712,6 +718,37 @@ class Table(object):
conn.commit() conn.commit()
conn.close() conn.close()
def copy_spanning_text(self, copy_text=None):
    """Copy text into empty spanning cells from their neighbours.

    A cell is filled only when its own text is blank, it is flagged as
    spanning in the requested direction, and it has no edge separating
    it from the neighbour it copies from.

    Parameters
    ----------
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns. Directions are
        applied in the order given. ``None`` or an empty list leaves
        the table unchanged.

    Returns
    -------
    t : camelot.core.Table
        This table (modified in place), so calls can be chained.

    """
    # The default must be a no-op: iterating None would raise TypeError.
    for direction in copy_text or ():
        if direction == "h":
            for row in self.cells:
                for j, cell in enumerate(row):
                    # j > 0: the first column has no left neighbour, and
                    # row[-1] would silently wrap to the last column.
                    if (
                        j > 0
                        and cell.text.strip() == ""
                        and cell.hspan
                        and not cell.left
                    ):
                        cell.text = row[j - 1].text
        elif direction == "v":
            for i, row in enumerate(self.cells):
                # The first row has no neighbour above; cells[-1] would
                # silently wrap to the last row.
                if i == 0:
                    continue
                above = self.cells[i - 1]
                for j, cell in enumerate(row):
                    if (
                        cell.text.strip() == ""
                        and cell.vspan
                        and not cell.top
                    ):
                        cell.text = above[j].text
    return self
class TableList(object): class TableList(object):
"""Defines a list of camelot.core.Table objects. Each table can """Defines a list of camelot.core.Table objects. Each table can

View File

@ -16,6 +16,7 @@ class BaseParser(object):
parser_id, parser_id,
table_regions=None, table_regions=None,
table_areas=None, table_areas=None,
copy_text=None,
split_text=False, split_text=False,
strip_text="", strip_text="",
shift_text=None, shift_text=None,
@ -25,6 +26,7 @@ class BaseParser(object):
self.table_regions = table_regions self.table_regions = table_regions
self.table_areas = table_areas self.table_areas = table_areas
self.copy_text = copy_text
self.split_text = split_text self.split_text = split_text
self.strip_text = strip_text self.strip_text = strip_text
self.shift_text = shift_text self.shift_text = shift_text
@ -86,7 +88,7 @@ class BaseParser(object):
""" """
return idx return idx
def _compute_parse_errors(self, table): def compute_parse_errors(self, table):
pos_errors = [] pos_errors = []
# TODO: have a single list in place of two directional ones? # TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL # sorted on x-coordinate based on reading order i.e. LTR or RTL

View File

@ -120,12 +120,12 @@ class Lattice(BaseParser):
table_areas=table_areas, table_areas=table_areas,
split_text=split_text, split_text=split_text,
strip_text=strip_text, strip_text=strip_text,
copy_text=copy_text,
shift_text=shift_text or ["l", "t"], shift_text=shift_text or ["l", "t"],
flag_size=flag_size, flag_size=flag_size,
) )
self.process_background = process_background self.process_background = process_background
self.line_scale = line_scale self.line_scale = line_scale
self.copy_text = copy_text
self.line_tol = line_tol self.line_tol = line_tol
self.joint_tol = joint_tol self.joint_tol = joint_tol
self.threshold_blocksize = threshold_blocksize self.threshold_blocksize = threshold_blocksize
@ -180,40 +180,6 @@ class Lattice(BaseParser):
indices.append((r_idx, c_idx, text)) indices.append((r_idx, c_idx, text))
return indices return indices
# NOTE: removed side of the diff; the same logic is added to the Table class
# as ``copy_spanning_text`` in this commit.
@staticmethod
def _copy_spanning_text(t, copy_text=None):
"""Copies over text in empty spanning cells.

The table is modified in place: a cell whose stripped text is empty
receives the text of its left neighbour ('h') or of the cell above
it ('v') when it is flagged as spanning in that direction and has
no left/top edge.

Parameters
----------
t : camelot.core.Table
copy_text : list, optional (default: None)
{'h', 'v'}
Select one or more strings from above and pass them as a list
to specify the direction in which text should be copied over
when a cell spans multiple rows or columns.
The value is iterated directly, so the caller is expected to
skip the call when it is None.

Returns
-------
t : camelot.core.Table
The same table object that was passed in.

"""
# Directions are applied one after another, in the order given.
for f in copy_text:
if f == "h":
# Horizontal: fill from the previous column of the same row.
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].text.strip() == "":
if t.cells[i][j].hspan and not t.cells[i][j].left:
t.cells[i][j].text = t.cells[i][j - 1].text
elif f == "v":
# Vertical: fill from the same column of the previous row.
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].text.strip() == "":
if t.cells[i][j].vspan and not t.cells[i][j].top:
t.cells[i][j].text = t.cells[i - 1][j].text
return t
def _generate_table_bbox(self): def _generate_table_bbox(self):
def scale_areas(areas): def scale_areas(areas):
scaled_areas = [] scaled_areas = []
@ -342,37 +308,7 @@ class Lattice(BaseParser):
# set spanning cells to True # set spanning cells to True
table = table.set_span() table = table.set_span()
pos_errors = []
# TODO: have a single list in place of two directional ones?
# sorted on x-coordinate based on reading order i.e. LTR or RTL
for direction in ["vertical", "horizontal"]:
for t in self.t_bbox[direction]:
indices, error = get_table_index(
table,
t,
direction,
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
indices = Lattice._reduce_index(
table, indices, shift_text=self.shift_text
)
for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].text = text
# FRHTODO
accuracy = compute_accuracy([[100, pos_errors]])
if self.copy_text is not None:
table = Lattice._copy_spanning_text(
table,
copy_text=self.copy_text
)
table.record_parse_metadata(self) table.record_parse_metadata(self)
table.accuracy = accuracy
# for plotting # for plotting
_text = [] _text = []

View File

@ -419,14 +419,8 @@ class Stream(BaseParser):
def _generate_table(self, table_idx, cols, rows, **kwargs): def _generate_table(self, table_idx, cols, rows, **kwargs):
table = self._initialize_new_table(table_idx, cols, rows) table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges() table = table.set_all_edges()
pos_errors = self._compute_parse_errors(table)
accuracy = compute_accuracy([[100, pos_errors]])
table.record_parse_metadata(self) table.record_parse_metadata(self)
table.accuracy = accuracy
# for plotting # for plotting
_text = [] _text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])