Add strip_text
parent
a38d52c7b2
commit
f6aa21c31f
|
|
@ -43,6 +43,8 @@ pass_config = click.make_pass_decorator(Config)
|
||||||
help='Split text that spans across multiple cells.')
|
help='Split text that spans across multiple cells.')
|
||||||
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
|
@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
|
||||||
' font size. Useful to detect super/subscripts.')
|
' font size. Useful to detect super/subscripts.')
|
||||||
|
@click.option('-strip', '--strip_text', help='Characters that should be stripped from a string before'
|
||||||
|
' assigning it to a cell.')
|
||||||
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
|
||||||
help='PDFMiner char_margin, line_margin and word_margin.')
|
help='PDFMiner char_margin, line_margin and word_margin.')
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,9 @@ class Lattice(BaseParser):
|
||||||
Direction in which text in a spanning cell will flow.
|
Direction in which text in a spanning cell will flow.
|
||||||
split_text : bool, optional (default: False)
|
split_text : bool, optional (default: False)
|
||||||
Split text that spans across multiple cells.
|
Split text that spans across multiple cells.
|
||||||
|
strip_text : str, optional (default: '')
|
||||||
|
Characters that should be stripped from a string before
|
||||||
|
assigning it to a cell.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Flag text based on font size. Useful to detect
|
Flag text based on font size. Useful to detect
|
||||||
super/subscripts. Adds <s></s> around flagged text.
|
super/subscripts. Adds <s></s> around flagged text.
|
||||||
|
|
@ -74,7 +77,7 @@ class Lattice(BaseParser):
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_areas=None, process_background=False,
|
def __init__(self, table_areas=None, process_background=False,
|
||||||
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
|
||||||
split_text=False, flag_size=False, line_close_tol=2,
|
split_text=False, flag_size=False, strip_text='', line_close_tol=2,
|
||||||
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
|
||||||
iterations=0, resolution=300, **kwargs):
|
iterations=0, resolution=300, **kwargs):
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
|
|
@ -84,6 +87,7 @@ class Lattice(BaseParser):
|
||||||
self.shift_text = shift_text
|
self.shift_text = shift_text
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
|
self.strip_text = strip_text
|
||||||
self.line_close_tol = line_close_tol
|
self.line_close_tol = line_close_tol
|
||||||
self.joint_close_tol = joint_close_tol
|
self.joint_close_tol = joint_close_tol
|
||||||
self.threshold_blocksize = threshold_blocksize
|
self.threshold_blocksize = threshold_blocksize
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,9 @@ class Stream(BaseParser):
|
||||||
are comma-separated.
|
are comma-separated.
|
||||||
split_text : bool, optional (default: False)
|
split_text : bool, optional (default: False)
|
||||||
Split text that spans across multiple cells.
|
Split text that spans across multiple cells.
|
||||||
|
strip_text : str, optional (default: '')
|
||||||
|
Characters that should be stripped from a string before
|
||||||
|
assigning it to a cell.
|
||||||
flag_size : bool, optional (default: False)
|
flag_size : bool, optional (default: False)
|
||||||
Flag text based on font size. Useful to detect
|
Flag text based on font size. Useful to detect
|
||||||
super/subscripts. Adds <s></s> around flagged text.
|
super/subscripts. Adds <s></s> around flagged text.
|
||||||
|
|
@ -49,13 +52,14 @@ class Stream(BaseParser):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_areas=None, columns=None, split_text=False,
|
def __init__(self, table_areas=None, columns=None, split_text=False,
|
||||||
flag_size=False, edge_close_tol=50, row_close_tol=2,
|
flag_size=False, strip_text='', edge_close_tol=50, row_close_tol=2,
|
||||||
col_close_tol=0, **kwargs):
|
col_close_tol=0, **kwargs):
|
||||||
self.table_areas = table_areas
|
self.table_areas = table_areas
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
self.flag_size = flag_size
|
self.flag_size = flag_size
|
||||||
|
self.strip_text = strip_text
|
||||||
self.edge_close_tol = edge_close_tol
|
self.edge_close_tol = edge_close_tol
|
||||||
self.row_close_tol = row_close_tol
|
self.row_close_tol = row_close_tol
|
||||||
self.col_close_tol = col_close_tol
|
self.col_close_tol = col_close_tol
|
||||||
|
|
|
||||||
|
|
@ -309,7 +309,12 @@ def merge_close_lines(ar, line_close_tol=2):
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def flag_font_size(textline, direction):
|
# TODO: combine the following functions into a TextProcessor class which
|
||||||
|
# applies corresponding transformations sequentially
|
||||||
|
# (inspired by sklearn.pipeline.Pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
def flag_font_size(textline, direction, strip_text=''):
|
||||||
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
"""Flags super/subscripts in text by enclosing them with <s></s>.
|
||||||
May give false positives.
|
May give false positives.
|
||||||
|
|
||||||
|
|
@ -319,6 +324,9 @@ def flag_font_size(textline, direction):
|
||||||
List of PDFMiner LTChar objects.
|
List of PDFMiner LTChar objects.
|
||||||
direction : string
|
direction : string
|
||||||
Direction of the PDFMiner LTTextLine object.
|
Direction of the PDFMiner LTTextLine object.
|
||||||
|
strip_text : str, optional (default: '')
|
||||||
|
Characters that should be stripped from a string before
|
||||||
|
assigning it to a cell.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -344,13 +352,13 @@ def flag_font_size(textline, direction):
|
||||||
fchars = [t[0] for t in chars]
|
fchars = [t[0] for t in chars]
|
||||||
if ''.join(fchars).strip():
|
if ''.join(fchars).strip():
|
||||||
flist.append(''.join(fchars))
|
flist.append(''.join(fchars))
|
||||||
fstring = ''.join(flist)
|
fstring = ''.join(flist).strip(strip_text)
|
||||||
else:
|
else:
|
||||||
fstring = ''.join([t.get_text() for t in textline])
|
fstring = ''.join([t.get_text() for t in textline]).strip(strip_text)
|
||||||
return fstring
|
return fstring
|
||||||
|
|
||||||
|
|
||||||
def split_textline(table, textline, direction, flag_size=False):
|
def split_textline(table, textline, direction, flag_size=False, strip_text=''):
|
||||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||||
multiple rows/columns.
|
multiple rows/columns.
|
||||||
|
|
||||||
|
|
@ -365,6 +373,9 @@ def split_textline(table, textline, direction, flag_size=False):
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string. (Useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts.)
|
super and subscripts.)
|
||||||
|
strip_text : str, optional (default: '')
|
||||||
|
Characters that should be stripped from a string before
|
||||||
|
assigning it to a cell.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -416,14 +427,15 @@ def split_textline(table, textline, direction, flag_size=False):
|
||||||
grouped_chars = []
|
grouped_chars = []
|
||||||
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
||||||
if flag_size:
|
if flag_size:
|
||||||
grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction)))
|
grouped_chars.append((key[0], key[1],
|
||||||
|
flag_font_size([t[2] for t in chars], direction, strip_text=strip_text)))
|
||||||
else:
|
else:
|
||||||
gchars = [t[2].get_text() for t in chars]
|
gchars = [t[2].get_text() for t in chars]
|
||||||
grouped_chars.append((key[0], key[1], ''.join(gchars)))
|
grouped_chars.append((key[0], key[1], ''.join(gchars).strip(strip_text)))
|
||||||
return grouped_chars
|
return grouped_chars
|
||||||
|
|
||||||
|
|
||||||
def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
def get_table_index(table, t, direction, split_text=False, flag_size=False, strip_text='',):
|
||||||
"""Gets indices of the table cell where given text object lies by
|
"""Gets indices of the table cell where given text object lies by
|
||||||
comparing their y and x-coordinates.
|
comparing their y and x-coordinates.
|
||||||
|
|
||||||
|
|
@ -441,6 +453,9 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||||
Whether or not to highlight a substring using <s></s>
|
Whether or not to highlight a substring using <s></s>
|
||||||
if its size is different from rest of the string. (Useful for
|
if its size is different from rest of the string. (Useful for
|
||||||
super and subscripts)
|
super and subscripts)
|
||||||
|
strip_text : str, optional (default: '')
|
||||||
|
Characters that should be stripped from a string before
|
||||||
|
assigning it to a cell.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
@ -495,12 +510,12 @@ def get_table_index(table, t, direction, split_text=False, flag_size=False):
|
||||||
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
||||||
|
|
||||||
if split_text:
|
if split_text:
|
||||||
return split_textline(table, t, direction, flag_size=flag_size), error
|
return split_textline(table, t, direction, flag_size=flag_size, strip_text=strip_text), error
|
||||||
else:
|
else:
|
||||||
if flag_size:
|
if flag_size:
|
||||||
return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error
|
return [(r_idx, c_idx, flag_font_size(t._objs, direction, strip_text=strip_text))], error
|
||||||
else:
|
else:
|
||||||
return [(r_idx, c_idx, t.get_text())], error
|
return [(r_idx, c_idx, t.get_text().strip(strip_text))], error
|
||||||
|
|
||||||
|
|
||||||
def compute_accuracy(error_weights):
|
def compute_accuracy(error_weights):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue