Fix vertical text detection in cells
* Fix vertical text detection in cells
* Add Cell instance method
* Change var names
pull/2/head
parent
8d56f15130
commit
b2dd5f68fe
|
|
@ -54,6 +54,7 @@ class Cell:
|
||||||
self.top = False
|
self.top = False
|
||||||
self.bottom = False
|
self.bottom = False
|
||||||
self.text = ''
|
self.text = ''
|
||||||
|
self.text_objects = []
|
||||||
self.spanning_h = False
|
self.spanning_h = False
|
||||||
self.spanning_v = False
|
self.spanning_v = False
|
||||||
|
|
||||||
|
|
@ -75,6 +76,12 @@ class Cell:
|
||||||
"""
|
"""
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
|
def add_object(self, t_object):
|
||||||
|
self.text_objects.append(t_object)
|
||||||
|
|
||||||
|
def get_objects(self):
|
||||||
|
return self.text_objects
|
||||||
|
|
||||||
def get_bounded_edges(self):
|
def get_bounded_edges(self):
|
||||||
"""Returns number of edges by which a cell is bounded.
|
"""Returns number of edges by which a cell is bounded.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import numpy as np
|
||||||
from wand.image import Image
|
from wand.image import Image
|
||||||
|
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .utils import (transform, elements_bbox, detect_vertical, merge_close_values,
|
from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values,
|
||||||
get_row_index, get_column_index, get_score, reduce_index,
|
get_row_index, get_column_index, get_score, reduce_index,
|
||||||
outline, fill_spanning, count_empty, encode_list, pdf_to_text)
|
outline, fill_spanning, count_empty, encode_list, pdf_to_text)
|
||||||
|
|
||||||
|
|
@ -247,10 +247,10 @@ class Lattice:
|
||||||
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
|
||||||
# select edges which lie within table_bbox
|
# select edges which lie within table_bbox
|
||||||
table_info = {}
|
table_info = {}
|
||||||
text_bbox, v_s, h_s = elements_bbox(k, text, v_segments,
|
v_s, h_s = segments_bbox(k, v_segments, h_segments)
|
||||||
h_segments)
|
t_bbox = text_bbox(k, text)
|
||||||
table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text)))
|
table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
|
||||||
rotated = detect_vertical(text_bbox)
|
table_rotation = detect_vertical(t_bbox)
|
||||||
cols, rows = zip(*table_bbox[k])
|
cols, rows = zip(*table_bbox[k])
|
||||||
cols, rows = list(cols), list(rows)
|
cols, rows = list(cols), list(rows)
|
||||||
cols.extend([k[0], k[2]])
|
cols.extend([k[0], k[2]])
|
||||||
|
|
@ -277,17 +277,9 @@ class Lattice:
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_tables.append(table)
|
self.debug_tables.append(table)
|
||||||
|
|
||||||
# fill text after sorting it
|
|
||||||
if rotated == '':
|
|
||||||
text_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
|
||||||
elif rotated == 'left':
|
|
||||||
text_bbox.sort(key=lambda x: (x.x0, x.y0))
|
|
||||||
elif rotated == 'right':
|
|
||||||
text_bbox.sort(key=lambda x: (-x.x0, -x.y0))
|
|
||||||
|
|
||||||
rerror = []
|
rerror = []
|
||||||
cerror = []
|
cerror = []
|
||||||
for t in text_bbox:
|
for t in text:
|
||||||
try:
|
try:
|
||||||
r_idx, rass_error = get_row_index(t, rows)
|
r_idx, rass_error = get_row_index(t, rows)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
|
|
@ -300,19 +292,36 @@ class Lattice:
|
||||||
continue
|
continue
|
||||||
rerror.append(rass_error)
|
rerror.append(rass_error)
|
||||||
cerror.append(cass_error)
|
cerror.append(cass_error)
|
||||||
r_idx, c_idx = reduce_index(
|
r_idx, c_idx = reduce_index(table, table_rotation, r_idx, c_idx)
|
||||||
table, rotated, r_idx, c_idx)
|
table.cells[r_idx][c_idx].add_object(t)
|
||||||
table.cells[r_idx][c_idx].add_text(
|
|
||||||
t.get_text().strip('\n'))
|
for i in range(len(table.cells)):
|
||||||
|
for j in range(len(table.cells[i])):
|
||||||
|
t_bbox = table.cells[i][j].get_objects()
|
||||||
|
try:
|
||||||
|
cell_rotation = detect_vertical(t_bbox)
|
||||||
|
except ZeroDivisionError:
|
||||||
|
cell_rotation = ''
|
||||||
|
pass
|
||||||
|
# fill text after sorting it
|
||||||
|
if cell_rotation == '':
|
||||||
|
t_bbox.sort(key=lambda x: (-x.y0, x.x0))
|
||||||
|
elif cell_rotation == 'left':
|
||||||
|
t_bbox.sort(key=lambda x: (x.x0, x.y0))
|
||||||
|
elif cell_rotation == 'right':
|
||||||
|
t_bbox.sort(key=lambda x: (-x.x0, -x.y0))
|
||||||
|
table.cells[i][j].add_text(''.join([t.get_text()
|
||||||
|
for t in t_bbox]))
|
||||||
|
|
||||||
score = get_score([[50, rerror], [50, cerror]])
|
score = get_score([[50, rerror], [50, cerror]])
|
||||||
table_info['score'] = score
|
table_info['score'] = score
|
||||||
|
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
table = fill_spanning(table, fill=self.fill)
|
table = fill_spanning(table, fill=self.fill)
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
if rotated == 'left':
|
if table_rotation == 'left':
|
||||||
ar = zip(*ar[::-1])
|
ar = zip(*ar[::-1])
|
||||||
elif rotated == 'right':
|
elif table_rotation == 'right':
|
||||||
ar = zip(*ar[::1])
|
ar = zip(*ar[::1])
|
||||||
ar.reverse()
|
ar.reverse()
|
||||||
ar = encode_list(ar)
|
ar = encode_list(ar)
|
||||||
|
|
|
||||||
|
|
@ -144,20 +144,20 @@ def detect_vertical(text):
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
rotated : string
|
rotation : string
|
||||||
"""
|
"""
|
||||||
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
|
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
|
||||||
num_h = [t for t in text if t.upright and t.get_text().strip()]
|
num_h = [t for t in text if t.upright and t.get_text().strip()]
|
||||||
vger = len(num_v) / float(len(num_v) + len(num_h))
|
vger = len(num_v) / float(len(num_v) + len(num_h))
|
||||||
rotated = ''
|
rotation = ''
|
||||||
if vger > 0.8:
|
if vger > 0.8:
|
||||||
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text)
|
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text)
|
||||||
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text)
|
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text)
|
||||||
rotated = 'left' if clockwise < anticlockwise else 'right'
|
rotation = 'left' if clockwise < anticlockwise else 'right'
|
||||||
return rotated
|
return rotation
|
||||||
|
|
||||||
|
|
||||||
def elements_bbox(bbox, text, v_segments, h_segments):
|
def segments_bbox(bbox, v_segments, h_segments):
|
||||||
"""Returns all text objects and line segments present inside a
|
"""Returns all text objects and line segments present inside a
|
||||||
table's bounding box.
|
table's bounding box.
|
||||||
|
|
||||||
|
|
@ -181,14 +181,20 @@ def elements_bbox(bbox, text, v_segments, h_segments):
|
||||||
"""
|
"""
|
||||||
lb = (bbox[0], bbox[1])
|
lb = (bbox[0], bbox[1])
|
||||||
rt = (bbox[2], bbox[3])
|
rt = (bbox[2], bbox[3])
|
||||||
text_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
|
||||||
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
|
|
||||||
<= rt[1] + 2]
|
|
||||||
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
|
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
|
||||||
v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
|
v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
|
||||||
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and
|
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and
|
||||||
h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
|
h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
|
||||||
return text_bbox, v_s, h_s
|
return v_s, h_s
|
||||||
|
|
||||||
|
|
||||||
|
def text_bbox(bbox, text):
|
||||||
|
lb = (bbox[0], bbox[1])
|
||||||
|
rt = (bbox[2], bbox[3])
|
||||||
|
t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
|
||||||
|
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
|
||||||
|
<= rt[1] + 2]
|
||||||
|
return t_bbox
|
||||||
|
|
||||||
|
|
||||||
def remove_close_values(ar, mtol=2):
|
def remove_close_values(ar, mtol=2):
|
||||||
|
|
@ -331,7 +337,7 @@ def get_score(error_weights):
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
def reduce_index(t, rotated, r_idx, c_idx):
|
def reduce_index(t, rotation, r_idx, c_idx):
|
||||||
"""Reduces index of a text object if it lies within a spanning
|
"""Reduces index of a text object if it lies within a spanning
|
||||||
cell taking in account table rotation.
|
cell taking in account table rotation.
|
||||||
|
|
||||||
|
|
@ -339,7 +345,7 @@ def reduce_index(t, rotated, r_idx, c_idx):
|
||||||
----------
|
----------
|
||||||
t : object
|
t : object
|
||||||
|
|
||||||
rotated : string
|
rotation : string
|
||||||
|
|
||||||
r_idx : int
|
r_idx : int
|
||||||
|
|
||||||
|
|
@ -351,21 +357,21 @@ def reduce_index(t, rotated, r_idx, c_idx):
|
||||||
|
|
||||||
c_idx : int
|
c_idx : int
|
||||||
"""
|
"""
|
||||||
if not rotated:
|
if not rotation:
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
if t.cells[r_idx][c_idx].spanning_h:
|
||||||
while not t.cells[r_idx][c_idx].left:
|
while not t.cells[r_idx][c_idx].left:
|
||||||
c_idx -= 1
|
c_idx -= 1
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
if t.cells[r_idx][c_idx].spanning_v:
|
||||||
while not t.cells[r_idx][c_idx].top:
|
while not t.cells[r_idx][c_idx].top:
|
||||||
r_idx -= 1
|
r_idx -= 1
|
||||||
elif rotated == 'left':
|
elif rotation == 'left':
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
if t.cells[r_idx][c_idx].spanning_h:
|
||||||
while not t.cells[r_idx][c_idx].left:
|
while not t.cells[r_idx][c_idx].left:
|
||||||
c_idx -= 1
|
c_idx -= 1
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
if t.cells[r_idx][c_idx].spanning_v:
|
||||||
while not t.cells[r_idx][c_idx].bottom:
|
while not t.cells[r_idx][c_idx].bottom:
|
||||||
r_idx += 1
|
r_idx += 1
|
||||||
elif rotated == 'right':
|
elif rotation == 'right':
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
if t.cells[r_idx][c_idx].spanning_h:
|
||||||
while not t.cells[r_idx][c_idx].right:
|
while not t.cells[r_idx][c_idx].right:
|
||||||
c_idx += 1
|
c_idx += 1
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue