Add strip_text
parent
a38d52c7b2
commit
f6aa21c31f
|
|
@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config)
|
|||
help='Split text that spans across multiple cells.')
|
||||
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
|
||||
' font size. Useful to detect super/subscripts.')
|
||||
@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
|
||||
' assigning it to a cell.')
|
||||
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
||||
help='PDFMiner char_margin, line_margin and word_margin.')
|
||||
@click.pass_context
|
||||
|
|
|
|||
|
|
@ -47,6 +47,9 @@ class Lattice(BaseParser):
|
|||
Direction in which text in a spanning cell will flow.
|
||||
split_text : bool, optional (default: False)
|
||||
Split text that spans across multiple cells.
|
||||
strip_text : str, optional (default: '')
|
||||
Characters that should be stripped from a string before
|
||||
assigning it to a cell.
|
||||
flag_size : bool, optional (default: False)
|
||||
Flag text based on font size. Useful to detect
|
||||
super/subscripts. Adds <s></s> around flagged text.
|
||||
|
|
@ -74,7 +77,7 @@ class Lattice(BaseParser):
|
|||
"""
|
||||
def __init__(self, table_areas=None, process_background=False,
|
||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||
split_text=False, flag_size=False, line_close_tol=2,
|
||||
split_text=False, flag_size=False, strip_text='', line_close_tol=2,
|
||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||
iterations=0, resolution=300, **kwargs):
|
||||
self.table_areas = table_areas
|
||||
|
|
@ -84,6 +87,7 @@ class Lattice(BaseParser):
|
|||
self.shift_text = shift_text
|
||||
self.split_text = split_text
|
||||
self.flag_size = flag_size
|
||||
self.strip_text = strip_text
|
||||
self.line_close_tol = line_close_tol
|
||||
self.joint_close_tol = joint_close_tol
|
||||
self.threshold_blocksize = threshold_blocksize
|
||||
|
|
|
|||
|
|
@ -35,6 +35,9 @@ class Stream(BaseParser):
|
|||
are comma-separated.
|
||||
split_text : bool, optional (default: False)
|
||||
Split text that spans across multiple cells.
|
||||
strip_text : str, optional (default: '')
|
||||
Characters that should be stripped from a string before
|
||||
assigning it to a cell.
|
||||
flag_size : bool, optional (default: False)
|
||||
Flag text based on font size. Useful to detect
|
||||
super/subscripts. Adds <s></s> around flagged text.
|
||||
|
|
@ -49,13 +52,14 @@ class Stream(BaseParser):
|
|||
|
||||
"""
|
||||
def __init__(self, table_areas=None, columns=None, split_text=False,
|
||||
flag_size=False, edge_close_tol=50, row_close_tol=2,
|
||||
flag_size=False, strip_text='', edge_close_tol=50, row_close_tol=2,
|
||||
col_close_tol=0, **kwargs):
|
||||
self.table_areas = table_areas
|
||||
self.columns = columns
|
||||
self._validate_columns()
|
||||
self.split_text = split_text
|
||||
self.flag_size = flag_size
|
||||
self.strip_text = strip_text
|
||||
self.edge_close_tol = edge_close_tol
|
||||
self.row_close_tol = row_close_tol
|
||||
self.col_close_tol = col_close_tol
|
||||
|
|
|
|||
|
|
@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2):
|
|||
return ret
|
||||
|
||||
|
||||
def flag_font_size(textline, direction):
|
||||
# TODO: combine the following functions into a TextProcessor class which
|
||||
# applies corresponding transformations sequentially
|
||||
# (inspired by sklearn.pipeline.Pipeline)
|
||||
|
||||
|
||||
def flag_font_size(textline, direction, strip_text=''):
|
||||
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
||||
May give false positives.
|
||||
|
||||
|
|
@ -319,6 +324,9 @@ def flag_font_size(textline, direction):
|
|||
List of PDFMiner LTChar objects.
|
||||
direction : string
|
||||
Direction of the PDFMiner LTTextLine object.
|
||||
strip_text : str, optional (default: '')
|
||||
Characters that should be stripped from a string before
|
||||
assigning it to a cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -344,13 +352,13 @@ def flag_font_size(textline, direction):
|
|||
fchars = [t[0] for t in chars]
|
||||
if ''.join(fchars).strip():
|
||||
flist.append(''.join(fchars))
|
||||
fstring = ''.join(flist)
|
||||
fstring = ''.join(flist).strip(strip_text)
|
||||
else:
|
||||
fstring = ''.join([t.get_text() for t in textline])
|
||||
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
|
||||
return fstring
|
||||
|
||||
|
||||
def split_textline(table, textline, direction, flag_size=False):
|
||||
def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||
multiple rows/columns.
|
||||
|
||||
|
|
@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False):
|
|||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts.)
|
||||
strip_text : str, optional (default: '')
|
||||
Characters that should be stripped from a string before
|
||||
assigning it to a cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False):
|
|||
grouped_chars = []
|
||||
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
||||
if flag_size:
|
||||
grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction)))
|
||||
grouped_chars.append((key[0], key[1],
|
||||
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
|
||||
else:
|
||||
gchars = [t[2].get_text() for t in chars]
|
||||
grouped_chars.append((key[0], key[1], ''.join(gchars)))
|
||||
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
|
||||
return grouped_chars
|
||||
|
||||
|
||||
def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||
def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
|
||||
"""Gets indices of the table cell where given text object lies by
|
||||
comparing their y and x-coordinates.
|
||||
|
||||
|
|
@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
|||
Whether or not to highlight a substring using <s></s>
|
||||
if its size is different from rest of the string. (Useful for
|
||||
super and subscripts.)
|
||||
strip_text : str, optional (default: '')
|
||||
Characters that should be stripped from a string before
|
||||
assigning it to a cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
|
@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
|||
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
||||
|
||||
if split_text:
|
||||
return split_textline(table, t, direction, flag_size=flag_size), error
|
||||
return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
|
||||
else:
|
||||
if flag_size:
|
||||
return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error
|
||||
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
|
||||
else:
|
||||
return [(r_idx, c_idx, t.get_text())], error
|
||||
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
|
||||
|
||||
|
||||
def compute_accuracy(error_weights):
|
||||
|
|
|
|||
Loading…
Reference in New Issue