Add superscript and subscript flagging
* Add superscript flagging * Add flagging param * Add np.round to account for rotation error
pull/2/head
parent
e8b93a9624
commit
40d30c1ab9
|
|
@ -162,6 +162,12 @@ class Lattice:
|
||||||
different cells.
|
different cells.
|
||||||
(optional, default: False)
|
(optional, default: False)
|
||||||
|
|
||||||
|
flag_size : bool
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
(optional, default: True)
|
||||||
|
|
||||||
shift_text : list
|
shift_text : list
|
||||||
{'l', 'r', 't', 'b'}
|
{'l', 'r', 't', 'b'}
|
||||||
Select one or more from above and pass them as a list to
|
Select one or more from above and pass them as a list to
|
||||||
|
|
@ -176,17 +182,19 @@ class Lattice:
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
|
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
|
||||||
scale=15, invert=False, margins=(1.0, 0.5, 0.1),
|
scale=15, invert=False, margins=(1.0, 0.5, 0.1),
|
||||||
split_text=False, shift_text=['l', 't'], debug=None):
|
split_text=False, flag_size=True, shift_text=['l', 't'],
|
||||||
|
debug=None):
|
||||||
|
|
||||||
self.method = 'lattice'
|
self.method = 'lattice'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.fill = fill
|
self.fill = fill
|
||||||
self.headers = [h.split(',') for h in headers]
|
self.headers = headers
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.invert = invert
|
self.invert = invert
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
|
self.flag_size = flag_size
|
||||||
self.shift_text = shift_text
|
self.shift_text = shift_text
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
|
|
@ -248,6 +256,7 @@ class Lattice:
|
||||||
if self.headers is not None:
|
if self.headers is not None:
|
||||||
if len(self.table_area) != len(self.headers):
|
if len(self.table_area) != len(self.headers):
|
||||||
raise ValueError("Length of headers should be equal to table_area.")
|
raise ValueError("Length of headers should be equal to table_area.")
|
||||||
|
self.headers = [h.split(',') for h in headers]
|
||||||
|
|
||||||
areas = []
|
areas = []
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
|
|
@ -329,13 +338,20 @@ class Lattice:
|
||||||
self.debug_tables.append(table)
|
self.debug_tables.append(table)
|
||||||
|
|
||||||
assignment_errors = []
|
assignment_errors = []
|
||||||
|
table_data['split_text'] = []
|
||||||
|
table_data['superscript'] = []
|
||||||
for direction in t_bbox:
|
for direction in t_bbox:
|
||||||
for t in t_bbox[direction]:
|
for t in t_bbox[direction]:
|
||||||
indices, error = get_table_index(
|
indices, error = get_table_index(
|
||||||
table, t, direction, split_text=self.split_text)
|
table, t, direction, split_text=self.split_text,
|
||||||
|
flag_size=self.flag_size)
|
||||||
assignment_errors.append(error)
|
assignment_errors.append(error)
|
||||||
indices = _reduce_index(table, indices, shift_text=self.shift_text)
|
indices = _reduce_index(table, indices, shift_text=self.shift_text,)
|
||||||
|
if len(indices) > 1:
|
||||||
|
table_data['split_text'].append(indices)
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
|
if all(s in text for s in ['<s>', '</s>']):
|
||||||
|
table_data['superscript'].append((r_idx, c_idx, text))
|
||||||
table.cells[r_idx][c_idx].add_text(text)
|
table.cells[r_idx][c_idx].add_text(text)
|
||||||
score = get_score([[100, assignment_errors]])
|
score = get_score([[100, assignment_errors]])
|
||||||
table_data['score'] = score
|
table_data['score'] = score
|
||||||
|
|
|
||||||
|
|
@ -259,6 +259,12 @@ class Stream:
|
||||||
different cells.
|
different cells.
|
||||||
(optional, default: False)
|
(optional, default: False)
|
||||||
|
|
||||||
|
flag_size : bool
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
(optional, default: True)
|
||||||
|
|
||||||
debug : bool
|
debug : bool
|
||||||
Set to True to generate a matplotlib plot of
|
Set to True to generate a matplotlib plot of
|
||||||
LTTextLineHorizontals in order to select table_area, columns.
|
LTTextLineHorizontals in order to select table_area, columns.
|
||||||
|
|
@ -266,7 +272,7 @@ class Stream:
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, ncolumns=None,
|
def __init__(self, table_area=None, columns=None, ncolumns=None,
|
||||||
headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
|
headers=None, ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
|
||||||
split_text=False, debug=False):
|
split_text=False, flag_size=True, debug=False):
|
||||||
|
|
||||||
self.method = 'stream'
|
self.method = 'stream'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
|
|
@ -274,9 +280,10 @@ class Stream:
|
||||||
self.ncolumns = ncolumns
|
self.ncolumns = ncolumns
|
||||||
self.ytol = ytol
|
self.ytol = ytol
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
self.headers = [h.split(',') for h in headers]
|
self.headers = headers
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
self.split_text = split_text
|
self.split_text = split_text
|
||||||
|
self.flag_size = flag_size
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
def get_tables(self, pdfname):
|
||||||
|
|
@ -318,6 +325,7 @@ class Stream:
|
||||||
if self.headers is not None:
|
if self.headers is not None:
|
||||||
if len(self.table_area) != len(self.headers):
|
if len(self.table_area) != len(self.headers):
|
||||||
raise ValueError("Length of headers should be equal to table_area.")
|
raise ValueError("Length of headers should be equal to table_area.")
|
||||||
|
self.headers = [h.split(',') for h in headers]
|
||||||
|
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
for area in self.table_area:
|
for area in self.table_area:
|
||||||
|
|
@ -418,12 +426,19 @@ class Stream:
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
table = table.set_all_edges()
|
table = table.set_all_edges()
|
||||||
assignment_errors = []
|
assignment_errors = []
|
||||||
|
table_data['split_text'] = []
|
||||||
|
table_data['superscript'] = []
|
||||||
for direction in t_bbox:
|
for direction in t_bbox:
|
||||||
for t in t_bbox[direction]:
|
for t in t_bbox[direction]:
|
||||||
indices, error = get_table_index(
|
indices, error = get_table_index(
|
||||||
table, t, direction, split_text=self.split_text)
|
table, t, direction, split_text=self.split_text,
|
||||||
|
flag_size=self.flag_size)
|
||||||
assignment_errors.append(error)
|
assignment_errors.append(error)
|
||||||
|
if len(indices) > 1:
|
||||||
|
table_data['split_text'].append(indices)
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
|
if all(s in text for s in ['<s>', '</s>']):
|
||||||
|
table_data['superscript'].append((r_idx, c_idx, text))
|
||||||
table.cells[r_idx][c_idx].add_text(text)
|
table.cells[r_idx][c_idx].add_text(text)
|
||||||
if guess:
|
if guess:
|
||||||
score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]])
|
score = get_score([[66, assignment_errors], [34, [len_non_mode / len(elements)]]])
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
from itertools import groupby
|
||||||
|
from operator import itemgetter
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
@ -500,7 +502,49 @@ def merge_close_values(ar, mtol=2):
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def split_textline(table, textline, direction):
|
def flag_on_size(textline, direction):
    """Flags a super/subscript by enclosing it with <s></s>. May give
    false positives.

    Characters are compared on their height (horizontal text) or width
    (vertical text), rounded to 6 decimals. When more than one distinct
    size is present, every consecutive run of characters having the
    smallest size is wrapped in <s></s>. Runs that contain only
    whitespace are dropped.

    Parameters
    ----------
    textline : list
        List of PDFMiner LTChar objects. LTAnno objects may be present;
        they are ignored when sizes are compared.

    direction : string
        {'horizontal', 'vertical'}
        Direction of the PDFMiner LTTextLine object.

    Returns
    -------
    fstring : string
        Text of the textline with surrounding newlines stripped and,
        if sizes differ, the smallest-size runs enclosed in <s></s>.

    Raises
    ------
    ValueError
        If direction is neither 'horizontal' nor 'vertical'.
    """
    if direction == 'horizontal':
        size_attr = 'height'
    elif direction == 'vertical':
        size_attr = 'width'
    else:
        raise ValueError(
            "direction should be 'horizontal' or 'vertical', got "
            "{!r}".format(direction))

    # Round once so that tiny float differences (e.g. introduced by
    # rotation) do not make equally sized characters look different.
    sized_chars = [(t.get_text(), np.round(getattr(t, size_attr), decimals=6))
                   for t in textline if not isinstance(t, LTAnno)]
    sizes = set(size for _, size in sized_chars)

    if len(sizes) <= 1:
        # Uniform size (or nothing to measure): return the text as is,
        # including the text of any LTAnno objects.
        return ''.join([t.get_text() for t in textline]).strip('\n')

    min_size = min(sizes)
    pieces = []
    # groupby only merges *consecutive* characters of equal size, so the
    # original character order is preserved.
    # NOTE(review): LTAnno text is left out of this branch, so anything
    # carried only by LTAnno objects does not appear in flagged strings
    # -- confirm this is intended.
    for size, group in groupby(sized_chars, itemgetter(1)):
        text = ''.join(char for char, _ in group)
        if not text.strip():
            continue
        if size == min_size:
            text = '<s>' + text + '</s>'
        pieces.append(text)
    return ''.join(pieces).strip('\n')
|
||||||
|
|
||||||
|
|
||||||
|
def split_textline(table, textline, direction, flag_size=True):
|
||||||
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
"""Splits PDFMiner LTTextLine into substrings if it spans across
|
||||||
multiple rows/columns.
|
multiple rows/columns.
|
||||||
|
|
||||||
|
|
@ -516,9 +560,15 @@ def split_textline(table, textline, direction):
|
||||||
{'horizontal', 'vertical'}
|
{'horizontal', 'vertical'}
|
||||||
Direction of the PDFMiner LTTextLine object.
|
Direction of the PDFMiner LTTextLine object.
|
||||||
|
|
||||||
|
flag_size : bool
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
(optional, default: True)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
cut_text : list
|
grouped_chars : list
|
||||||
List of tuples of the form (idx, text) where idx is the index
|
List of tuples of the form (idx, text) where idx is the index
|
||||||
of row/column and text is the an lttextline substring.
|
of row/column and text is the an lttextline substring.
|
||||||
"""
|
"""
|
||||||
|
|
@ -538,10 +588,10 @@ def split_textline(table, textline, direction):
|
||||||
if isinstance(obj, LTChar):
|
if isinstance(obj, LTChar):
|
||||||
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
|
if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
|
||||||
(obj.x0 + obj.x1) / 2 <= cut[1]):
|
(obj.x0 + obj.x1) / 2 <= cut[1]):
|
||||||
cut_text.append((r, cut[0], obj.get_text().strip('\n')))
|
cut_text.append((r, cut[0], obj))
|
||||||
break
|
break
|
||||||
elif isinstance(obj, LTAnno):
|
elif isinstance(obj, LTAnno):
|
||||||
cut_text.append((r, cut[0], obj.get_text().strip('\n')))
|
cut_text.append((r, cut[0], obj))
|
||||||
elif direction == 'vertical' and not textline.is_empty():
|
elif direction == 'vertical' and not textline.is_empty():
|
||||||
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
|
y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
|
||||||
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
|
c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
|
||||||
|
|
@ -558,11 +608,18 @@ def split_textline(table, textline, direction):
|
||||||
cut_text.append((cut[0], c, obj.get_text()))
|
cut_text.append((cut[0], c, obj.get_text()))
|
||||||
break
|
break
|
||||||
elif isinstance(obj, LTAnno):
|
elif isinstance(obj, LTAnno):
|
||||||
cut_text.append((cut[0], c, obj.get_text().strip('\n')))
|
cut_text.append((cut[0], c, obj))
|
||||||
return cut_text
|
grouped_chars = []
|
||||||
|
for key, chars in groupby(cut_text, itemgetter(0, 1)):
|
||||||
|
if flag_size:
|
||||||
|
grouped_chars.append((key[0], key[1], flag_on_size([t[2] for t in chars], direction)))
|
||||||
|
else:
|
||||||
|
gchars = [t[2].get_text() for t in chars]
|
||||||
|
grouped_chars.append((key[0], key[1], ''.join(gchars).strip('\n')))
|
||||||
|
return grouped_chars
|
||||||
|
|
||||||
|
|
||||||
def get_table_index(table, t, direction, split_text=False):
|
def get_table_index(table, t, direction, split_text=False, flag_size=True):
|
||||||
"""Gets indices of the cell where given text object lies by
|
"""Gets indices of the cell where given text object lies by
|
||||||
comparing their y and x-coordinates.
|
comparing their y and x-coordinates.
|
||||||
|
|
||||||
|
|
@ -583,6 +640,12 @@ def get_table_index(table, t, direction, split_text=False):
|
||||||
multiple cells.
|
multiple cells.
|
||||||
(optional, default: False)
|
(optional, default: False)
|
||||||
|
|
||||||
|
flag_size : bool
|
||||||
|
Whether or not to highlight a substring using <s></s>
|
||||||
|
if its size is different from rest of the string, useful for
|
||||||
|
super and subscripts.
|
||||||
|
(optional, default: True)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
indices : list
|
indices : list
|
||||||
|
|
@ -632,7 +695,10 @@ def get_table_index(table, t, direction, split_text=False):
|
||||||
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
error = ((X * (y0_offset + y1_offset)) + (Y * (x0_offset + x1_offset))) / charea
|
||||||
|
|
||||||
if split_text:
|
if split_text:
|
||||||
return split_textline(table, t, direction), error
|
return split_textline(table, t, direction, flag_size=flag_size), error
|
||||||
|
else:
|
||||||
|
if flag_size:
|
||||||
|
return [(r_idx, c_idx, flag_on_size(t._objs, direction))], error
|
||||||
else:
|
else:
|
||||||
return [(r_idx, c_idx, t.get_text().strip('\n'))], error
|
return [(r_idx, c_idx, t.get_text().strip('\n'))], error
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -85,7 +85,7 @@ def test_stream_missing_value():
|
||||||
["4","","","",""]
|
["4","","","",""]
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, "missing_values.pdf")
|
pdfname = os.path.join(testdir, "missing_values.pdf")
|
||||||
manager = Pdf(Stream(), pdfname, clean=True)
|
manager = Pdf(Stream(flag_size=False), pdfname, clean=True)
|
||||||
tables = manager.extract()
|
tables = manager.extract()
|
||||||
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||||
|
|
||||||
|
|
@ -210,11 +210,11 @@ def test_stream_table_rotation():
|
||||||
["","","","","","","","","54","","","","","","","","",""]
|
["","","","","","","","","54","","","","","","","","",""]
|
||||||
]
|
]
|
||||||
pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
|
pdfname = os.path.join(testdir, "left_rotated_table_2.pdf")
|
||||||
manager = Pdf(Stream(), pdfname, clean=True)
|
manager = Pdf(Stream(flag_size=False), pdfname, clean=True)
|
||||||
tables = manager.extract()
|
tables = manager.extract()
|
||||||
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||||
|
|
||||||
pdfname = os.path.join(testdir, "right_rotated_table_2.pdf")
|
pdfname = os.path.join(testdir, "right_rotated_table_2.pdf")
|
||||||
manager = Pdf(Stream(), pdfname, clean=True)
|
manager = Pdf(Stream(flag_size=False), pdfname, clean=True)
|
||||||
tables = manager.extract()
|
tables = manager.extract()
|
||||||
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
assert_equal(tables["page-1"]["table-1"]["data"], data)
|
||||||
Loading…
Reference in New Issue