Add get_relevant textedges method

pull/2/head
Vinayak Mehta 2018-11-22 18:24:31 +05:30
parent 378408a271
commit a587ea3782
2 changed files with 46 additions and 43 deletions

View File

@ -4,11 +4,20 @@ import os
import zipfile import zipfile
import tempfile import tempfile
from itertools import chain from itertools import chain
from operator import itemgetter
import numpy as np import numpy as np
import pandas as pd import pandas as pd
# minimum number of textlines to be considered a textedge;
# TextEdge.update_coords marks an edge valid once its intersection
# count exceeds this value
TEXTEDGE_REQUIRED_ELEMENTS = 4
# y coordinate tolerance for extending text edge; passed as the absolute
# tolerance (atol) when comparing an edge's y0 with a new textline's y0
TEXTEDGE_EXTEND_TOLERANCE = 50
# padding added to table area's lt and rb
# NOTE(review): lt/rb presumably mean the left-top and right-bottom
# corners of a table area -- confirm where this constant is consumed
TABLE_AREA_PADDING = 10
class TextEdge(object): class TextEdge(object):
def __init__(self, x, y0, y1, align='left'): def __init__(self, x, y0, y1, align='left'):
self.x = x self.x = x
@ -23,12 +32,13 @@ class TextEdge(object):
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid) round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
def update_coords(self, x, y0):
    """Extend this text edge with a textline located at (x, y0).

    The edge is only extended when ``y0`` lies within
    ``TEXTEDGE_EXTEND_TOLERANCE`` of the edge's current ``y0``;
    otherwise the call leaves the edge untouched.

    Parameters
    ----------
    x : float
        x coordinate of the textline for this edge's alignment.
    y0 : float
        y0 coordinate of the textline; becomes the edge's new ``y0``.
    """
    if not np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
        return

    # average the edge's x with the new x, weighting the current value
    # by the number of intersections recorded so far
    self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
    self.y0 = y0
    self.intersections += 1
    # a textedge is valid if it extends uninterrupted over required_elements
    if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
        self.is_valid = True
class TextEdges(object): class TextEdges(object):
@ -43,59 +53,56 @@ class TextEdges(object):
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right} x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
return x_coord[align] return x_coord[align]
def find(self, x_coord, align):
    """Return the index of the `align` text edge located at `x_coord`.

    Edges are compared with ``np.isclose`` using its default tolerances.

    Parameters
    ----------
    x_coord : float
        x coordinate to look for.
    align : str
        Key into ``self._textedges`` selecting which edges to search.

    Returns
    -------
    int or None
        Index of the first matching edge, or None if there is none.
    """
    for i, te in enumerate(self._textedges[align]):
        if np.isclose(te.x, x_coord):
            return i
    return None
def add(self, textline, align):
    """Create a TextEdge from `textline` and store it under `align`.

    The edge's x comes from ``self.get_x_coord(textline, align)`` and
    its vertical extent from the textline's ``y0`` and ``y1``.
    """
    x = self.get_x_coord(textline, align)
    te = TextEdge(x, textline.y0, textline.y1, align=align)
    self._textedges[align].append(te)
def update(self, textline):
    """Fold `textline` into the text edges of every alignment.

    For each of the left, right and middle alignments, the textline's
    x coordinate is looked up among the existing edges: a matching edge
    is extended via ``update_coords``, otherwise a new edge is added.
    """
    for align in ['left', 'right', 'middle']:
        x_coord = self.get_x_coord(textline, align)
        idx = self.find(x_coord, align)
        if idx is None:
            self.add(textline, align)
        else:
            self._textedges[align][idx].update_coords(x_coord, textline.y0)
def generate(self, textlines):
    """Build text edges from grouped textlines.

    Parameters
    ----------
    textlines : iterable of iterables
        Groups of textline objects; each object must provide
        ``get_text()``. The groups are flattened and every textline
        whose stripped text is longer than one character is passed to
        ``self.update``.
    """
    for tl in chain.from_iterable(textlines):
        if len(tl.get_text().strip()) > 1:  # TODO: hacky
            self.update(tl)
def get_relevant(self):
    """Return the text edges of the alignment with the most intersections.

    The intersection counts of all edges are summed per alignment and
    the list of edges for the alignment with the largest sum is
    returned. Ties are resolved in the order left, right, middle.
    """
    aligns = ('left', 'right', 'middle')
    intersections_sum = {
        align: sum(te.intersections for te in self._textedges[align])
        for align in aligns
    }
    # TODO: naive
    # max() keeps the first maximal item, so iterating the fixed tuple
    # (rather than the dict) makes tie-breaking independent of dict order
    relevant_align = max(aligns, key=lambda align: intersections_sum[align])
    return self._textedges[relevant_align]
def get_table_areas(self, relevant_textedges):
    """Guess table areas from the relevant text edges.

    Not implemented yet: `relevant_textedges` is currently unused and
    an empty dict is always returned.

    Returns
    -------
    dict
        Always ``{}`` for now.
    """
    # # debug
    # import matplotlib.pyplot as plt
    # fig = plt.figure()
    # ax = fig.add_subplot(111, aspect='equal')
    # for te in relevant_textedges:
    #     if te.is_valid:
    #         ax.plot([te.x, te.x], [te.y0, te.y1])
    # plt.show()

    return {}

View File

@ -251,11 +251,6 @@ class Stream(BaseParser):
# algorithm described by Anssi Nurminen's master's thesis: # algorithm described by Anssi Nurminen's master's thesis:
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
# minimum number of textlines to be considered a textedge
REQUIRED_ELEMENTS_FOR_TEXTEDGE = 4
# padding added to table area's lt and rb
TABLE_AREA_PADDING = 10
# TODO: add support for arabic text #141 # TODO: add support for arabic text #141
# sort textlines in reading order # sort textlines in reading order
textlines.sort(key=lambda x: (-x.y0, x.x0)) textlines.sort(key=lambda x: (-x.y0, x.x0))
@ -264,10 +259,11 @@ class Stream(BaseParser):
self.horizontal_text, row_close_tol=self.row_close_tol) self.horizontal_text, row_close_tol=self.row_close_tol)
textedges = TextEdges() textedges = TextEdges()
# generate left, middle and right textedges # generate left, middle and right textedges
textedges.generate_textedges(text_grouped) textedges.generate(text_grouped)
# select relevant edges # select relevant edges
# generate table areas using relevant edges and horizontal text relevant_textedges = textedges.get_relevant()
table_bbox = textedges.generate_tableareas() # guess table areas using relevant edges
table_bbox = textedges.get_table_areas(relevant_textedges)
# treat whole page as table if not table areas found # treat whole page as table if not table areas found
if not len(table_bbox): if not len(table_bbox):
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}