Further refactor
Move common parse error stats computation to the base parser.
Move copy_spanning_text logic to the table.
pull/153/head
parent
37483ca202
commit
ff2ce6f47c
|
|
@ -419,6 +419,12 @@ class Table(object):
|
||||||
self.flavor = parser.id
|
self.flavor = parser.id
|
||||||
self.filename = parser.filename
|
self.filename = parser.filename
|
||||||
self.debug_info = parser.debug_info
|
self.debug_info = parser.debug_info
|
||||||
|
pos_errors = parser.compute_parse_errors(self)
|
||||||
|
self.accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
|
if parser.copy_text is not None:
|
||||||
|
self.copy_spanning_text(parser.copy_text)
|
||||||
|
|
||||||
data = self.data
|
data = self.data
|
||||||
self.df = pd.DataFrame(data)
|
self.df = pd.DataFrame(data)
|
||||||
self.shape = self.df.shape
|
self.shape = self.df.shape
|
||||||
|
|
@ -712,6 +718,37 @@ class Table(object):
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
def copy_spanning_text(self, copy_text=None):
    """Copy text into empty spanning cells from the adjacent cell.

    For every direction listed in ``copy_text`` the cells are visited
    row by row, left to right.  A cell whose text is empty (after
    stripping whitespace) and which is part of a span receives the text
    of its neighbour: the cell to its left for ``"h"``, the cell above
    it for ``"v"``.

    Parameters
    ----------
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns.  ``None`` or an
        empty list leaves the table unchanged.  Entries other than
        ``"h"`` and ``"v"`` are ignored.

    Returns
    -------
    self
        The same object, with ``cells`` modified in place.

    """
    # The default is None; iterating over it would raise TypeError.
    if not copy_text:
        return self

    for direction in copy_text:
        if direction not in ("h", "v"):
            continue
        # Row-major order matters: a freshly filled cell may in turn be
        # the source for the next cell of the same span.
        for i, row in enumerate(self.cells):
            for j, cell in enumerate(row):
                if cell.text.strip() != "":
                    continue
                if direction == "h":
                    # j > 0: a first-column cell has no left neighbour;
                    # index -1 would silently read the last column.
                    if cell.hspan and not cell.left and j > 0:
                        cell.text = row[j - 1].text
                # i > 0: same guard for the first row.
                elif cell.vspan and not cell.top and i > 0:
                    cell.text = self.cells[i - 1][j].text
    return self
|
||||||
|
|
||||||
|
|
||||||
class TableList(object):
|
class TableList(object):
|
||||||
"""Defines a list of camelot.core.Table objects. Each table can
|
"""Defines a list of camelot.core.Table objects. Each table can
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ class BaseParser(object):
|
||||||
parser_id,
|
parser_id,
|
||||||
table_regions=None,
|
table_regions=None,
|
||||||
table_areas=None,
|
table_areas=None,
|
||||||
|
copy_text=None,
|
||||||
split_text=False,
|
split_text=False,
|
||||||
strip_text="",
|
strip_text="",
|
||||||
shift_text=None,
|
shift_text=None,
|
||||||
|
|
@ -25,6 +26,7 @@ class BaseParser(object):
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
|
|
||||||
|
self.copy_text = copy_text
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
self.strip_text = strip_text
|
self.strip_text = strip_text
|
||||||
self.shift_text = shift_text
|
self.shift_text = shift_text
|
||||||
|
|
@ -86,7 +88,7 @@ class BaseParser(object):
|
||||||
"""
|
"""
|
||||||
return idx
|
return idx
|
||||||
|
|
||||||
def _compute_parse_errors(self, table):
|
def compute_parse_errors(self, table):
|
||||||
pos_errors = []
|
pos_errors = []
|
||||||
# TODO: have a single list in place of two directional ones?
|
# TODO: have a single list in place of two directional ones?
|
||||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
||||||
|
|
|
||||||
|
|
@ -120,12 +120,12 @@ class Lattice(BaseParser):
|
||||||
table_areas=table_areas,
|
table_areas=table_areas,
|
||||||
split_text=split_text,
|
split_text=split_text,
|
||||||
strip_text=strip_text,
|
strip_text=strip_text,
|
||||||
|
copy_text=copy_text,
|
||||||
shift_text=shift_text or ["l", "t"],
|
shift_text=shift_text or ["l", "t"],
|
||||||
flag_size=flag_size,
|
flag_size=flag_size,
|
||||||
)
|
)
|
||||||
self.process_background = process_background
|
self.process_background = process_background
|
||||||
self.line_scale = line_scale
|
self.line_scale = line_scale
|
||||||
self.copy_text = copy_text
|
|
||||||
self.line_tol = line_tol
|
self.line_tol = line_tol
|
||||||
self.joint_tol = joint_tol
|
self.joint_tol = joint_tol
|
||||||
self.threshold_blocksize = threshold_blocksize
|
self.threshold_blocksize = threshold_blocksize
|
||||||
|
|
@ -180,40 +180,6 @@ class Lattice(BaseParser):
|
||||||
indices.append((r_idx, c_idx, text))
|
indices.append((r_idx, c_idx, text))
|
||||||
return indices
|
return indices
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _copy_spanning_text(t, copy_text=None):
|
|
||||||
"""Copies over text in empty spanning cells.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t : camelot.core.Table
|
|
||||||
copy_text : list, optional (default: None)
|
|
||||||
{'h', 'v'}
|
|
||||||
Select one or more strings from above and pass them as a list
|
|
||||||
to specify the direction in which text should be copied over
|
|
||||||
when a cell spans multiple rows or columns.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
t : camelot.core.Table
|
|
||||||
|
|
||||||
"""
|
|
||||||
for f in copy_text:
|
|
||||||
if f == "h":
|
|
||||||
for i in range(len(t.cells)):
|
|
||||||
for j in range(len(t.cells[i])):
|
|
||||||
if t.cells[i][j].text.strip() == "":
|
|
||||||
if t.cells[i][j].hspan and not t.cells[i][j].left:
|
|
||||||
t.cells[i][j].text = t.cells[i][j - 1].text
|
|
||||||
elif f == "v":
|
|
||||||
for i in range(len(t.cells)):
|
|
||||||
for j in range(len(t.cells[i])):
|
|
||||||
if t.cells[i][j].text.strip() == "":
|
|
||||||
if t.cells[i][j].vspan and not t.cells[i][j].top:
|
|
||||||
t.cells[i][j].text = t.cells[i - 1][j].text
|
|
||||||
return t
|
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
def scale_areas(areas):
|
def scale_areas(areas):
|
||||||
scaled_areas = []
|
scaled_areas = []
|
||||||
|
|
@ -342,37 +308,7 @@ class Lattice(BaseParser):
|
||||||
# set spanning cells to True
|
# set spanning cells to True
|
||||||
table = table.set_span()
|
table = table.set_span()
|
||||||
|
|
||||||
pos_errors = []
|
|
||||||
# TODO: have a single list in place of two directional ones?
|
|
||||||
# sorted on x-coordinate based on reading order i.e. LTR or RTL
|
|
||||||
for direction in ["vertical", "horizontal"]:
|
|
||||||
for t in self.t_bbox[direction]:
|
|
||||||
indices, error = get_table_index(
|
|
||||||
table,
|
|
||||||
t,
|
|
||||||
direction,
|
|
||||||
split_text=self.split_text,
|
|
||||||
flag_size=self.flag_size,
|
|
||||||
strip_text=self.strip_text,
|
|
||||||
)
|
|
||||||
if indices[:2] != (-1, -1):
|
|
||||||
pos_errors.append(error)
|
|
||||||
indices = Lattice._reduce_index(
|
|
||||||
table, indices, shift_text=self.shift_text
|
|
||||||
)
|
|
||||||
for r_idx, c_idx, text in indices:
|
|
||||||
table.cells[r_idx][c_idx].text = text
|
|
||||||
# FRHTODO
|
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
|
||||||
|
|
||||||
if self.copy_text is not None:
|
|
||||||
table = Lattice._copy_spanning_text(
|
|
||||||
table,
|
|
||||||
copy_text=self.copy_text
|
|
||||||
)
|
|
||||||
|
|
||||||
table.record_parse_metadata(self)
|
table.record_parse_metadata(self)
|
||||||
table.accuracy = accuracy
|
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
_text = []
|
||||||
|
|
|
||||||
|
|
@ -419,14 +419,8 @@ class Stream(BaseParser):
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||||
table = self._initialize_new_table(table_idx, cols, rows)
|
table = self._initialize_new_table(table_idx, cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
|
|
||||||
pos_errors = self._compute_parse_errors(table)
|
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
|
||||||
|
|
||||||
table.record_parse_metadata(self)
|
table.record_parse_metadata(self)
|
||||||
|
|
||||||
table.accuracy = accuracy
|
|
||||||
|
|
||||||
# for plotting
|
# for plotting
|
||||||
_text = []
|
_text = []
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue