Further refactor
Move common parse error stats computation to the base parser.
Move copy_spanning_text logic to the table.
pull/153/head
parent
583868756a
commit
cff7a9698b
|
|
@ -419,6 +419,12 @@ class Table(object):
|
|||
self.flavor = parser.id
|
||||
self.filename = parser.filename
|
||||
self.debug_info = parser.debug_info
|
||||
pos_errors = parser.compute_parse_errors(self)
|
||||
self.accuracy = compute_accuracy([[100, pos_errors]])
|
||||
|
||||
if parser.copy_text is not None:
|
||||
self.copy_spanning_text(parser.copy_text)
|
||||
|
||||
data = self.data
|
||||
self.df = pd.DataFrame(data)
|
||||
self.shape = self.df.shape
|
||||
|
|
@ -712,6 +718,37 @@ class Table(object):
|
|||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def copy_spanning_text(self, copy_text=None):
    """Copies over text in empty spanning cells.

    For every cell whose stripped text is empty, the text of a
    neighbouring cell is copied in: the left neighbour for "h" when the
    cell spans horizontally and has no left edge, or the upper neighbour
    for "v" when the cell spans vertically and has no top edge.
    Directions are applied in the order they appear in ``copy_text``;
    any other value is ignored.

    Parameters
    ----------
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns.
        When None (or empty), nothing is copied.

    Returns
    -------
    t : camelot.core.Table
        This table; its cells are modified in place.

    """
    # The default of None means "nothing to copy"; iterating None
    # directly would raise TypeError.
    for f in copy_text or ():
        if f == "h":
            for row in self.cells:
                for j, cell in enumerate(row):
                    if cell.text.strip() != "":
                        continue
                    if cell.hspan and not cell.left:
                        # NOTE(review): for j == 0 this index wraps to the
                        # last column; presumably a first-column cell
                        # always has a left edge -- confirm.
                        cell.text = row[j - 1].text
        elif f == "v":
            for i, row in enumerate(self.cells):
                for j, cell in enumerate(row):
                    if cell.text.strip() != "":
                        continue
                    if cell.vspan and not cell.top:
                        # NOTE(review): for i == 0 this index wraps to the
                        # last row; presumably a first-row cell always
                        # has a top edge -- confirm.
                        cell.text = self.cells[i - 1][j].text
    return self
|
||||
|
||||
|
||||
class TableList(object):
|
||||
"""Defines a list of camelot.core.Table objects. Each table can
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ class BaseParser(object):
|
|||
parser_id,
|
||||
table_regions=None,
|
||||
table_areas=None,
|
||||
copy_text=None,
|
||||
split_text=False,
|
||||
strip_text="",
|
||||
shift_text=None,
|
||||
|
|
@ -25,6 +26,7 @@ class BaseParser(object):
|
|||
self.table_regions = table_regions
|
||||
self.table_areas = table_areas
|
||||
|
||||
self.copy_text = copy_text
|
||||
self.split_text = split_text
|
||||
self.strip_text = strip_text
|
||||
self.shift_text = shift_text
|
||||
|
|
@ -86,7 +88,7 @@ class BaseParser(object):
|
|||
"""
|
||||
return idx
|
||||
|
||||
def _compute_parse_errors(self, table):
|
||||
def compute_parse_errors(self, table):
|
||||
pos_errors = []
|
||||
# TODO: have a single list in place of two directional ones?
|
||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||
|
|
|
|||
|
|
@ -120,12 +120,12 @@ class Lattice(BaseParser):
|
|||
table_areas=table_areas,
|
||||
split_text=split_text,
|
||||
strip_text=strip_text,
|
||||
copy_text=copy_text,
|
||||
shift_text=shift_text or ["l", "t"],
|
||||
flag_size=flag_size,
|
||||
)
|
||||
self.process_background = process_background
|
||||
self.line_scale = line_scale
|
||||
self.copy_text = copy_text
|
||||
self.line_tol = line_tol
|
||||
self.joint_tol = joint_tol
|
||||
self.threshold_blocksize = threshold_blocksize
|
||||
|
|
@ -180,40 +180,6 @@ class Lattice(BaseParser):
|
|||
indices.append((r_idx, c_idx, text))
|
||||
return indices
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _copy_spanning_text(t, copy_text=None):
|
||||
"""Copies over text in empty spanning cells.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t : camelot.core.Table
|
||||
copy_text : list, optional (default: None)
|
||||
{'h', 'v'}
|
||||
Select one or more strings from above and pass them as a list
|
||||
to specify the direction in which text should be copied over
|
||||
when a cell spans multiple rows or columns.
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : camelot.core.Table
|
||||
|
||||
"""
|
||||
for f in copy_text:
|
||||
if f == "h":
|
||||
for i in range(len(t.cells)):
|
||||
for j in range(len(t.cells[i])):
|
||||
if t.cells[i][j].text.strip() == "":
|
||||
if t.cells[i][j].hspan and not t.cells[i][j].left:
|
||||
t.cells[i][j].text = t.cells[i][j - 1].text
|
||||
elif f == "v":
|
||||
for i in range(len(t.cells)):
|
||||
for j in range(len(t.cells[i])):
|
||||
if t.cells[i][j].text.strip() == "":
|
||||
if t.cells[i][j].vspan and not t.cells[i][j].top:
|
||||
t.cells[i][j].text = t.cells[i - 1][j].text
|
||||
return t
|
||||
|
||||
def _generate_table_bbox(self):
|
||||
def scale_areas(areas):
|
||||
scaled_areas = []
|
||||
|
|
@ -342,37 +308,7 @@ class Lattice(BaseParser):
|
|||
# set spanning cells to True
|
||||
table = table.set_span()
|
||||
|
||||
pos_errors = []
|
||||
# TODO: have a single list in place of two directional ones?
|
||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||
for direction in ["vertical", "horizontal"]:
|
||||
for t in self.t_bbox[direction]:
|
||||
indices, error = get_table_index(
|
||||
table,
|
||||
t,
|
||||
direction,
|
||||
split_text=self.split_text,
|
||||
flag_size=self.flag_size,
|
||||
strip_text=self.strip_text,
|
||||
)
|
||||
if indices[:2] != (-1, -1):
|
||||
pos_errors.append(error)
|
||||
indices = Lattice._reduce_index(
|
||||
table, indices, shift_text=self.shift_text
|
||||
)
|
||||
for r_idx, c_idx, text in indices:
|
||||
table.cells[r_idx][c_idx].text = text
|
||||
# FRHTODO
|
||||
accuracy = compute_accuracy([[100, pos_errors]])
|
||||
|
||||
if self.copy_text is not None:
|
||||
table = Lattice._copy_spanning_text(
|
||||
table,
|
||||
copy_text=self.copy_text
|
||||
)
|
||||
|
||||
table.record_parse_metadata(self)
|
||||
table.accuracy = accuracy
|
||||
|
||||
# for plotting
|
||||
_text = []
|
||||
|
|
|
|||
|
|
@ -419,14 +419,8 @@ class Stream(BaseParser):
|
|||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||
table = self._initialize_new_table(table_idx, cols, rows)
|
||||
table = table.set_all_edges()
|
||||
|
||||
pos_errors = self._compute_parse_errors(table)
|
||||
accuracy = compute_accuracy([[100, pos_errors]])
|
||||
|
||||
table.record_parse_metadata(self)
|
||||
|
||||
table.accuracy = accuracy
|
||||
|
||||
# for plotting
|
||||
_text = []
|
||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||
|
|
|
|||
Loading…
Reference in New Issue