Add edge close tolerance
parent
e89e147b5c
commit
e0090fbb0a
|
|
@ -133,6 +133,8 @@ def lattice(c, *args, **kwargs):
|
|||
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
|
||||
@click.option('-C', '--columns', default=[], multiple=True,
|
||||
help='X coordinates of column separators.')
|
||||
@click.option('-e', '--edge_close_tol', default=50, help='Tolerance parameter'
|
||||
' for extending textedges vertically.')
|
||||
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
|
||||
' used to combine text vertically, to generate rows.')
|
||||
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
|
||||
|
|
|
|||
|
|
@ -13,8 +13,6 @@ import pandas as pd
|
|||
# minimum number of vertical textline intersections for a textedge
|
||||
# to be considered valid
|
||||
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
||||
# y coordinate tolerance for extending textedge
|
||||
TEXTEDGE_EXTEND_TOLERANCE = 50
|
||||
# padding added to table area on the left, right and bottom
|
||||
TABLE_AREA_PADDING = 10
|
||||
|
||||
|
|
@ -55,11 +53,11 @@ class TextEdge(object):
|
|||
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
|
||||
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
||||
|
||||
def update_coords(self, x, y0):
|
||||
def update_coords(self, x, y0, edge_close_tol=50):
|
||||
"""Updates the text edge's x and bottom y coordinates and sets
|
||||
the is_valid attribute.
|
||||
"""
|
||||
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
|
||||
if np.isclose(self.y0, y0, atol=edge_close_tol):
|
||||
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
||||
self.y0 = y0
|
||||
self.intersections += 1
|
||||
|
|
@ -106,7 +104,7 @@ class TextEdges(object):
|
|||
te = TextEdge(x, y0, y1, align=align)
|
||||
self._textedges[align].append(te)
|
||||
|
||||
def update(self, textline):
|
||||
def update(self, textline, edge_close_tol=50):
|
||||
"""Updates an existing text edge in the current dict.
|
||||
"""
|
||||
for align in ['left', 'right', 'middle']:
|
||||
|
|
@ -115,15 +113,16 @@ class TextEdges(object):
|
|||
if idx is None:
|
||||
self.add(textline, align)
|
||||
else:
|
||||
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
||||
self._textedges[align][idx].update_coords(
|
||||
x_coord, textline.y0, edge_close_tol=edge_close_tol)
|
||||
|
||||
def generate(self, textlines):
|
||||
def generate(self, textlines, edge_close_tol=50):
|
||||
"""Generates the text edges dict based on horizontal text
|
||||
rows.
|
||||
"""
|
||||
for tl in textlines:
|
||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||
self.update(tl)
|
||||
self.update(tl, edge_close_tol=edge_close_tol)
|
||||
|
||||
def get_relevant(self):
|
||||
"""Returns the list of relevant text edges (all share the same
|
||||
|
|
|
|||
|
|
@ -38,6 +38,8 @@ class Stream(BaseParser):
|
|||
flag_size : bool, optional (default: False)
|
||||
Flag text based on font size. Useful to detect
|
||||
super/subscripts. Adds <s></s> around flagged text.
|
||||
edge_close_tol : int, optional (default: 50)
|
||||
Tolerance parameter for extending textedges vertically.
|
||||
row_close_tol : int, optional (default: 2)
|
||||
Tolerance parameter used to combine text vertically,
|
||||
to generate rows.
|
||||
|
|
@ -47,12 +49,14 @@ class Stream(BaseParser):
|
|||
|
||||
"""
|
||||
def __init__(self, table_areas=None, columns=None, split_text=False,
|
||||
flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs):
|
||||
flag_size=False, edge_close_tol=50, row_close_tol=2,
|
||||
col_close_tol=0, **kwargs):
|
||||
self.table_areas = table_areas
|
||||
self.columns = columns
|
||||
self._validate_columns()
|
||||
self.split_text = split_text
|
||||
self.flag_size = flag_size
|
||||
self.edge_close_tol = edge_close_tol
|
||||
self.row_close_tol = row_close_tol
|
||||
self.col_close_tol = col_close_tol
|
||||
|
||||
|
|
@ -248,13 +252,12 @@ class Stream(BaseParser):
|
|||
Assumes that tables are situated relatively far apart
|
||||
vertically.
|
||||
"""
|
||||
|
||||
# TODO: add support for arabic text #141
|
||||
# sort textlines in reading order
|
||||
textlines.sort(key=lambda x: (-x.y0, x.x0))
|
||||
textedges = TextEdges()
|
||||
# generate left, middle and right textedges
|
||||
textedges.generate(textlines)
|
||||
textedges.generate(textlines, edge_close_tol=self.edge_close_tol)
|
||||
# select relevant edges
|
||||
relevant_textedges = textedges.get_relevant()
|
||||
self.textedges.extend(relevant_textedges)
|
||||
|
|
|
|||
Loading…
Reference in New Issue