Add edge close tolerance
parent
e89e147b5c
commit
e0090fbb0a
|
|
@ -133,6 +133,8 @@ def lattice(c, *args, **kwargs):
|
||||||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||||
@click.option('-C', '--columns', default=[], multiple=True,
|
@click.option('-C', '--columns', default=[], multiple=True,
|
||||||
help='X coordinates of column separators.')
|
help='X coordinates of column separators.')
|
||||||
|
@click.option('-e', '--edge_close_tol', default=50, help='Tolerance parameter'
|
||||||
|
' for extending textedges vertically.')
|
||||||
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
|
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
|
||||||
' used to combine text vertically, to generate rows.')
|
' used to combine text vertically, to generate rows.')
|
||||||
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
||||||
|
|
|
||||||
|
|
@ -13,8 +13,6 @@ import pandas as pd
|
||||||
# minimum number of vertical textline intersections for a textedge
|
# minimum number of vertical textline intersections for a textedge
|
||||||
# to be considered valid
|
# to be considered valid
|
||||||
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
||||||
# y coordinate tolerance for extending textedge
|
|
||||||
TEXTEDGE_EXTEND_TOLERANCE = 50
|
|
||||||
# padding added to table area on the left, right and bottom
|
# padding added to table area on the left, right and bottom
|
||||||
TABLE_AREA_PADDING = 10
|
TABLE_AREA_PADDING = 10
|
||||||
|
|
||||||
|
|
@ -55,11 +53,11 @@ class TextEdge(object):
|
||||||
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
|
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
|
||||||
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
||||||
|
|
||||||
def update_coords(self, x, y0):
|
def update_coords(self, x, y0, edge_close_tol=50):
|
||||||
"""Updates the text edge's x and bottom y coordinates and sets
|
"""Updates the text edge's x and bottom y coordinates and sets
|
||||||
the is_valid attribute.
|
the is_valid attribute.
|
||||||
"""
|
"""
|
||||||
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
|
if np.isclose(self.y0, y0, atol=edge_close_tol):
|
||||||
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
self.intersections += 1
|
self.intersections += 1
|
||||||
|
|
@ -106,7 +104,7 @@ class TextEdges(object):
|
||||||
te = TextEdge(x, y0, y1, align=align)
|
te = TextEdge(x, y0, y1, align=align)
|
||||||
self._textedges[align].append(te)
|
self._textedges[align].append(te)
|
||||||
|
|
||||||
def update(self, textline):
|
def update(self, textline, edge_close_tol=50):
|
||||||
"""Updates an existing text edge in the current dict.
|
"""Updates an existing text edge in the current dict.
|
||||||
"""
|
"""
|
||||||
for align in ['left', 'right', 'middle']:
|
for align in ['left', 'right', 'middle']:
|
||||||
|
|
@ -115,15 +113,16 @@ class TextEdges(object):
|
||||||
if idx is None:
|
if idx is None:
|
||||||
self.add(textline, align)
|
self.add(textline, align)
|
||||||
else:
|
else:
|
||||||
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
self._textedges[align][idx].update_coords(
|
||||||
|
x_coord, textline.y0, edge_close_tol=edge_close_tol)
|
||||||
|
|
||||||
def generate(self, textlines):
|
def generate(self, textlines, edge_close_tol=50):
|
||||||
"""Generates the text edges dict based on horizontal text
|
"""Generates the text edges dict based on horizontal text
|
||||||
rows.
|
rows.
|
||||||
"""
|
"""
|
||||||
for tl in textlines:
|
for tl in textlines:
|
||||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||||
self.update(tl)
|
self.update(tl, edge_close_tol=edge_close_tol)
|
||||||
|
|
||||||
def get_relevant(self):
|
def get_relevant(self):
|
||||||
"""Returns the list of relevant text edges (all share the same
|
"""Returns the list of relevant text edges (all share the same
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,8 @@ class Stream(BaseParser):
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Flag text based on font size. Useful to detect
|
Flag text based on font size. Useful to detect
|
||||||
super/subscripts. Adds <s></s> around flagged text.
|
super/subscripts. Adds <s></s> around flagged text.
|
||||||
|
edge_close_tol : int, optional (default: 50)
|
||||||
|
Tolerance parameter for extending textedges vertically.
|
||||||
row_close_tol : int, optional (default: 2)
|
row_close_tol : int, optional (default: 2)
|
||||||
Tolerance parameter used to combine text vertically,
|
Tolerance parameter used to combine text vertically,
|
||||||
to generate rows.
|
to generate rows.
|
||||||
|
|
@ -47,12 +49,14 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_areas=None, columns=None, split_text=False,
|
def __init__(self, table_areas=None, columns=None, split_text=False,
|
||||||
flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs):
|
flag_size=False, edge_close_tol=50, row_close_tol=2,
|
||||||
|
col_close_tol=0, **kwargs):
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
|
self.edge_close_tol = edge_close_tol
|
||||||
self.row_close_tol = row_close_tol
|
self.row_close_tol = row_close_tol
|
||||||
self.col_close_tol = col_close_tol
|
self.col_close_tol = col_close_tol
|
||||||
|
|
||||||
|
|
@ -248,13 +252,12 @@ class Stream(BaseParser):
|
||||||
Assumes that tables are situated relatively far apart
|
Assumes that tables are situated relatively far apart
|
||||||
vertically.
|
vertically.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# TODO: add support for arabic text #141
|
# TODO: add support for arabic text #141
|
||||||
# sort textlines in reading order
|
# sort textlines in reading order
|
||||||
textlines.sort(key=lambda x: (-x.y0, x.x0))
|
textlines.sort(key=lambda x: (-x.y0, x.x0))
|
||||||
textedges = TextEdges()
|
textedges = TextEdges()
|
||||||
# generate left, middle and right textedges
|
# generate left, middle and right textedges
|
||||||
textedges.generate(textlines)
|
textedges.generate(textlines, edge_close_tol=self.edge_close_tol)
|
||||||
# select relevant edges
|
# select relevant edges
|
||||||
relevant_textedges = textedges.get_relevant()
|
relevant_textedges = textedges.get_relevant()
|
||||||
self.textedges.extend(relevant_textedges)
|
self.textedges.extend(relevant_textedges)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue