Add strip_text

pull/2/head
Vinayak Mehta 2018-12-20 16:32:16 +05:30
parent a38d52c7b2
commit f6aa21c31f
4 changed files with 37 additions and 12 deletions

View File

@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config)
help='Split text that spans across multiple cells.')
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
' font size. Useful to detect super/subscripts.')
@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
' assigning it to a cell.')
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
help='PDFMiner char_margin, line_margin and word_margin.')
@click.pass_context

View File

@ -47,6 +47,9 @@ class Lattice(BaseParser):
Direction in which text in a spanning cell will flow.
split_text : bool, optional (default: False)
Split text that spans across multiple cells.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
@ -74,7 +77,7 @@ class Lattice(BaseParser):
"""
def __init__(self, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, line_close_tol=2,
split_text=False, flag_size=False, strip_text='', line_close_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, resolution=300, **kwargs):
self.table_areas = table_areas
@ -84,6 +87,7 @@ class Lattice(BaseParser):
self.shift_text = shift_text
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.line_close_tol = line_close_tol
self.joint_close_tol = joint_close_tol
self.threshold_blocksize = threshold_blocksize

View File

@ -35,6 +35,9 @@ class Stream(BaseParser):
are comma-separated.
split_text : bool, optional (default: False)
Split text that spans across multiple cells.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
@ -49,13 +52,14 @@ class Stream(BaseParser):
"""
def __init__(self, table_areas=None, columns=None, split_text=False,
flag_size=False, edge_close_tol=50, row_close_tol=2,
flag_size=False, strip_text='', edge_close_tol=50, row_close_tol=2,
col_close_tol=0, **kwargs):
self.table_areas = table_areas
self.columns = columns
self._validate_columns()
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.edge_close_tol = edge_close_tol
self.row_close_tol = row_close_tol
self.col_close_tol = col_close_tol

View File

@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2):
return ret
def flag_font_size(textline, direction):
# TODO: combine the following functions into a TextProcessor class which
# applies the corresponding transformations sequentially
# (inspired by sklearn.pipeline.Pipeline)
def flag_font_size(textline, direction, strip_text=''):
"""Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives.
@ -319,6 +324,9 @@ def flag_font_size(textline, direction):
List of PDFMiner LTChar objects.
direction : string
Direction of the PDFMiner LTTextLine object.
strip_text : str, optional (default: '')
Characters that should be stripped from both ends of the
resulting string before assigning it to a cell.
Returns
-------
@ -344,13 +352,13 @@ def flag_font_size(textline, direction):
fchars = [t[0] for t in chars]
if ''.join(fchars).strip():
flist.append(''.join(fchars))
fstring = ''.join(flist)
fstring = ''.join(flist).strip(strip_text)
else:
fstring = ''.join([t.get_text() for t in textline])
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
return fstring
def split_textline(table, textline, direction, flag_size=False):
def split_textline(table, textline, direction, flag_size=False, strip_text=''):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.
@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False):
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts.)
strip_text : str, optional (default: '')
Characters that should be stripped from both ends of each
substring before assigning it to a cell.
Returns
-------
@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False):
grouped_chars = []
for key, chars in groupby(cut_text, itemgetter(0, 1)):
if flag_size:
grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction)))
grouped_chars.append((key[0], key[1],
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append((key[0], key[1], ''.join(gchars)))
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
return grouped_chars
def get_table_index(table, t, direction, split_text=False, flag_size=False):
def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
"""Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts)
strip_text : str, optional (default: '')
Characters that should be stripped from both ends of the text
before assigning it to a cell.
Returns
-------
@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
if split_text:
return split_textline(table, t, direction, flag_size=flag_size), error
return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
else:
if flag_size:
return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
else:
return [(r_idx, c_idx, t.get_text())], error
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
def compute_accuracy(error_weights):