Add strip_text

2018-12-20 16:32:16 +05:30 · 2018-12-20 16:32:16 +05:30 · f6aa21c31f
parent a38d52c7b2
commit f6aa21c31f
4 changed files with 37 additions and 12 deletions
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config)
              help='Split text that spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
              ' font size. Useful to detect super/subscripts.')
+@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
+              ' assigning it to a cell.')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
              help='PDFMiner char_margin, line_margin and word_margin.')
@click.pass_context
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -47,6 +47,9 @@ class Lattice(BaseParser):
        Direction in which text in a spanning cell will flow.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
@ -74,7 +77,7 @@ class Lattice(BaseParser):
    """
    def __init__(self, table_areas=None, process_background=False,
                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
-                 split_text=False, flag_size=False, line_close_tol=2,
+                 split_text=False, flag_size=False, strip_text='', line_close_tol=2,
                 joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
                 iterations=0, resolution=300, **kwargs):
        self.table_areas = table_areas
@ -84,6 +87,7 @@ class Lattice(BaseParser):
        self.shift_text = shift_text
        self.split_text = split_text
        self.flag_size = flag_size
+        self.strip_text = strip_text
        self.line_close_tol = line_close_tol
        self.joint_close_tol = joint_close_tol
        self.threshold_blocksize = threshold_blocksize
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -35,6 +35,9 @@ class Stream(BaseParser):
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
@ -49,13 +52,14 @@ class Stream(BaseParser):

    """
    def __init__(self, table_areas=None, columns=None, split_text=False,
-                 flag_size=False, edge_close_tol=50, row_close_tol=2,
+                 flag_size=False, strip_text='', edge_close_tol=50, row_close_tol=2,
                 col_close_tol=0, **kwargs):
        self.table_areas = table_areas
        self.columns = columns
        self._validate_columns()
        self.split_text = split_text
        self.flag_size = flag_size
+        self.strip_text = strip_text
        self.edge_close_tol = edge_close_tol
        self.row_close_tol = row_close_tol
        self.col_close_tol = col_close_tol
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2):
    return ret


-def flag_font_size(textline, direction):
+# TODO: combine the following functions into a TextProcessor class that
+# applies the corresponding transformations sequentially
+# (inspired by sklearn.pipeline.Pipeline)
+
+
+def flag_font_size(textline, direction, strip_text=''):
    """Flags super/subscripts in text by enclosing them with <s></s>.
    May give false positives.

@ -319,6 +324,9 @@ def flag_font_size(textline, direction):
        List of PDFMiner LTChar objects.
    direction : string
        Direction of the PDFMiner LTTextLine object.
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.

    Returns
    -------
@ -344,13 +352,13 @@ def flag_font_size(textline, direction):
                fchars = [t[0] for t in chars]
                if ''.join(fchars).strip():
                    flist.append(''.join(fchars))
-        fstring = ''.join(flist)
+        fstring = ''.join(flist).strip(strip_text)
    else:
-        fstring = ''.join([t.get_text() for t in textline])
+        fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
    return fstring


-def split_textline(table, textline, direction, flag_size=False):
+def split_textline(table, textline, direction, flag_size=False, strip_text=''):
    """Splits PDFMiner LTTextLine into substrings if it spans across
    multiple rows/columns.

@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False):
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts.)
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.

    Returns
    -------
@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False):
    grouped_chars = []
    for key, chars in groupby(cut_text, itemgetter(0, 1)):
        if flag_size:
-            grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction)))
+            grouped_chars.append((key[0], key[1],
+                flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
        else:
            gchars = [t[2].get_text() for t in chars]
-            grouped_chars.append((key[0], key[1], ''.join(gchars)))
+            grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
    return grouped_chars


-def get_table_index(table, t, direction, split_text=False, flag_size=False):
+def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
    """Gets indices of the table cell where given text object lies by
    comparing their y and x-coordinates.

@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts)
+    strip_text : str, optional (default: '')
+        Characters that should be stripped from a string before
+        assigning it to a cell.

    Returns
    -------
@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
    error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea

    if split_text:
-        return split_textline(table, t, direction, flag_size=flag_size), error
+        return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
    else:
        if flag_size:
-            return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error
+            return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
        else:
-            return [(r_idx, c_idx, t.get_text())], error
+            return [(r_idx, c_idx, t.get_text().strip(strip_text))], error


 def compute_accuracy(error_weights):