From e0090fbb0a350db970ce5f81c35a2e0eb546d2c8 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 20 Dec 2018 13:58:54 +0530 Subject: [PATCH] Add edge close tolerance --- camelot/cli.py | 2 ++ camelot/core.py | 15 +++++++-------- camelot/parsers/stream.py | 9 ++++++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index e978a3c..b0832f4 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -133,6 +133,8 @@ def lattice(c, *args, **kwargs): ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') @click.option('-C', '--columns', default=[], multiple=True, help='X coordinates of column separators.') +@click.option('-e', '--edge_close_tol', default=50, help='Tolerance parameter' + ' for extending textedges vertically.') @click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter' ' used to combine text vertically, to generate rows.') @click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' diff --git a/camelot/core.py b/camelot/core.py index ac63e54..99268e1 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -13,8 +13,6 @@ import pandas as pd # minimum number of vertical textline intersections for a textedge # to be considered valid TEXTEDGE_REQUIRED_ELEMENTS = 4 -# y coordinate tolerance for extending textedge -TEXTEDGE_EXTEND_TOLERANCE = 50 # padding added to table area on the left, right and bottom TABLE_AREA_PADDING = 10 @@ -55,11 +53,11 @@ class TextEdge(object): return ''.format( round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) - def update_coords(self, x, y0): + def update_coords(self, x, y0, edge_close_tol=50): """Updates the text edge's x and bottom y coordinates and sets the is_valid attribute. """ - if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE): + if np.isclose(self.y0, y0, atol=edge_close_tol): self.x = (self.intersections * self.x + x) / float(self.intersections + 1) self.y0 = y0 self.intersections += 1 @@ -106,7 +104,7 @@ class TextEdges(object): te = TextEdge(x, y0, y1, align=align) self._textedges[align].append(te) - def update(self, textline): + def update(self, textline, edge_close_tol=50): """Updates an existing text edge in the current dict. """ for align in ['left', 'right', 'middle']: @@ -115,15 +113,16 @@ class TextEdges(object): if idx is None: self.add(textline, align) else: - self._textedges[align][idx].update_coords(x_coord, textline.y0) + self._textedges[align][idx].update_coords( + x_coord, textline.y0, edge_close_tol=edge_close_tol) - def generate(self, textlines): + def generate(self, textlines, edge_close_tol=50): """Generates the text edges dict based on horizontal text rows. """ for tl in textlines: if len(tl.get_text().strip()) > 1: # TODO: hacky - self.update(tl) + self.update(tl, edge_close_tol=edge_close_tol) def get_relevant(self): """Returns the list of relevant text edges (all share the same diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 5ebd2df..1f5a856 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -38,6 +38,8 @@ class Stream(BaseParser): flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds around flagged text. + edge_close_tol : int, optional (default: 50) + Tolerance parameter for extending textedges vertically. row_close_tol : int, optional (default: 2) Tolerance parameter used to combine text vertically, to generate rows. @@ -47,12 +49,14 @@ class Stream(BaseParser): """ def __init__(self, table_areas=None, columns=None, split_text=False, - flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs): + flag_size=False, edge_close_tol=50, row_close_tol=2, + col_close_tol=0, **kwargs): self.table_areas = table_areas self.columns = columns self._validate_columns() self.split_text = split_text self.flag_size = flag_size + self.edge_close_tol = edge_close_tol self.row_close_tol = row_close_tol self.col_close_tol = col_close_tol @@ -248,13 +252,12 @@ class Stream(BaseParser): Assumes that tables are situated relatively far apart vertically. """ - # TODO: add support for arabic text #141 # sort textlines in reading order textlines.sort(key=lambda x: (-x.y0, x.x0)) textedges = TextEdges() # generate left, middle and right textedges - textedges.generate(textlines) + textedges.generate(textlines, edge_close_tol=self.edge_close_tol) # select relevant edges relevant_textedges = textedges.get_relevant() self.textedges.extend(relevant_textedges)