Add edge close tolerance

pull/2/head
Vinayak Mehta 2018-12-20 13:58:54 +05:30
parent e89e147b5c
commit e0090fbb0a
3 changed files with 15 additions and 11 deletions

View File

@@ -133,6 +133,8 @@ def lattice(c, *args, **kwargs):
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True,
help='X coordinates of column separators.')
@click.option('-e', '--edge_close_tol', default=50, help='Tolerance parameter'
' for extending textedges vertically.')
@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
' used to combine text vertically, to generate rows.')
@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'

View File

@@ -13,8 +13,6 @@ import pandas as pd
# minimum number of vertical textline intersections for a textedge
# to be considered valid
TEXTEDGE_REQUIRED_ELEMENTS = 4
# y coordinate tolerance for extending textedge
TEXTEDGE_EXTEND_TOLERANCE = 50
# padding added to table area on the left, right and bottom
TABLE_AREA_PADDING = 10
@@ -55,11 +53,11 @@ class TextEdge(object):
return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
def update_coords(self, x, y0):
def update_coords(self, x, y0, edge_close_tol=50):
"""Updates the text edge's x and bottom y coordinates and sets
the is_valid attribute.
"""
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
if np.isclose(self.y0, y0, atol=edge_close_tol):
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
self.y0 = y0
self.intersections += 1
@@ -106,7 +104,7 @@ class TextEdges(object):
te = TextEdge(x, y0, y1, align=align)
self._textedges[align].append(te)
def update(self, textline):
def update(self, textline, edge_close_tol=50):
"""Updates an existing text edge in the current dict.
"""
for align in ['left', 'right', 'middle']:
@@ -115,15 +113,16 @@ class TextEdges(object):
if idx is None:
self.add(textline, align)
else:
self._textedges[align][idx].update_coords(x_coord, textline.y0)
self._textedges[align][idx].update_coords(
x_coord, textline.y0, edge_close_tol=edge_close_tol)
def generate(self, textlines):
def generate(self, textlines, edge_close_tol=50):
"""Generates the text edges dict based on horizontal text
rows.
"""
for tl in textlines:
if len(tl.get_text().strip()) > 1: # TODO: hacky
self.update(tl)
self.update(tl, edge_close_tol=edge_close_tol)
def get_relevant(self):
"""Returns the list of relevant text edges (all share the same

View File

@@ -38,6 +38,8 @@ class Stream(BaseParser):
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
edge_close_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
row_close_tol : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
to generate rows.
@@ -47,12 +49,14 @@ class Stream(BaseParser):
"""
def __init__(self, table_areas=None, columns=None, split_text=False,
flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs):
flag_size=False, edge_close_tol=50, row_close_tol=2,
col_close_tol=0, **kwargs):
self.table_areas = table_areas
self.columns = columns
self._validate_columns()
self.split_text = split_text
self.flag_size = flag_size
self.edge_close_tol = edge_close_tol
self.row_close_tol = row_close_tol
self.col_close_tol = col_close_tol
@@ -248,13 +252,12 @@ class Stream(BaseParser):
Assumes that tables are situated relatively far apart
vertically.
"""
# TODO: add support for arabic text #141
# sort textlines in reading order
textlines.sort(key=lambda x: (-x.y0, x.x0))
textedges = TextEdges()
# generate left, middle and right textedges
textedges.generate(textlines)
textedges.generate(textlines, edge_close_tol=self.edge_close_tol)
# select relevant edges
relevant_textedges = textedges.get_relevant()
self.textedges.extend(relevant_textedges)