diff --git a/camelot/cli.py b/camelot/cli.py index 6c1b933..d83f2e1 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config) help='Split text that spans across multiple cells.') @click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on' ' font size. Useful to detect super/subscripts.') +@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before' + ' assigning it to a cell.') @click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1), help='PDFMiner char_margin, line_margin and word_margin.') @click.pass_context diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index cfbbcda..83b876b 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -47,6 +47,9 @@ class Lattice(BaseParser): Direction in which text in a spanning cell will flow. split_text : bool, optional (default: False) Split text that spans across multiple cells. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds <s></s> around flagged text. 
@@ -74,7 +77,7 @@ class Lattice(BaseParser): """ def __init__(self, table_areas=None, process_background=False, line_size_scaling=15, copy_text=None, shift_text=['l', 't'], - split_text=False, flag_size=False, line_close_tol=2, + split_text=False, flag_size=False, strip_text='', line_close_tol=2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, iterations=0, resolution=300, **kwargs): self.table_areas = table_areas @@ -84,6 +87,7 @@ class Lattice(BaseParser): self.shift_text = shift_text self.split_text = split_text self.flag_size = flag_size + self.strip_text = strip_text self.line_close_tol = line_close_tol self.joint_close_tol = joint_close_tol self.threshold_blocksize = threshold_blocksize diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index a7c5af4..45318f7 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -35,6 +35,9 @@ class Stream(BaseParser): are comma-separated. split_text : bool, optional (default: False) Split text that spans across multiple cells. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds <s></s> around flagged text. 
@@ -49,13 +52,14 @@ class Stream(BaseParser): """ def __init__(self, table_areas=None, columns=None, split_text=False, - flag_size=False, edge_close_tol=50, row_close_tol=2, + flag_size=False, strip_text='', edge_close_tol=50, row_close_tol=2, col_close_tol=0, **kwargs): self.table_areas = table_areas self.columns = columns self._validate_columns() self.split_text = split_text self.flag_size = flag_size + self.strip_text = strip_text self.edge_close_tol = edge_close_tol self.row_close_tol = row_close_tol self.col_close_tol = col_close_tol diff --git a/camelot/utils.py b/camelot/utils.py index c38884f..f2c0bae 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2): return ret -def flag_font_size(textline, direction): +# TODO: combine the following functions into a TextProcessor class which +# applies corresponding transformations sequentially +# (inspired from sklearn.pipeline.Pipeline) + + +def flag_font_size(textline, direction, strip_text=''): """Flags super/subscripts in text by enclosing them with <s></s>. May give false positives. @@ -319,6 +324,9 @@ def flag_font_size(textline, direction): List of PDFMiner LTChar objects. direction : string Direction of the PDFMiner LTTextLine object. + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. 
Returns ------- @@ -344,13 +352,13 @@ def flag_font_size(textline, direction): fchars = [t[0] for t in chars] if ''.join(fchars).strip(): flist.append(''.join(fchars)) - fstring = ''.join(flist) + fstring = ''.join(flist).strip(strip_text) else: - fstring = ''.join([t.get_text() for t in textline]) + fstring = ''.join([t.get_text() for t in textline]).strip(strip_text) return fstring -def split_textline(table, textline, direction, flag_size=False): +def split_textline(table, textline, direction, flag_size=False, strip_text=''): """Splits PDFMiner LTTextLine into substrings if it spans across multiple rows/columns. @@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False): Whether or not to highlight a substring using <s></s> if its size is different from rest of the string. (Useful for super and subscripts.) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. Returns ------- @@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False): grouped_chars = [] for key, chars in groupby(cut_text, itemgetter(0, 1)): if flag_size: - grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction))) + grouped_chars.append((key[0], key[1], + flag_font_size([t[2] for t in chars], direction, strip_text=strip_text))) else: gchars = [t[2].get_text() for t in chars] - grouped_chars.append((key[0], key[1], ''.join(gchars))) + grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text))) return grouped_chars -def get_table_index(table, t, direction, split_text=False, flag_size=False): +def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',): """Gets indices of the table cell where given text object lies by comparing their y and x-coordinates. 
@@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False): Whether or not to highlight a substring using <s></s> if its size is different from rest of the string. (Useful for super and subscripts) + strip_text : str, optional (default: '') + Characters that should be stripped from a string before + assigning it to a cell. Returns ------- @@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False): error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea if split_text: - return split_textline(table, t, direction, flag_size=flag_size), error + return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error else: if flag_size: - return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error + return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error else: - return [(r_idx, c_idx, t.get_text())], error + return [(r_idx, c_idx, t.get_text().strip(strip_text))], error def compute_accuracy(error_weights):