Add get_relevant textedges method
parent
378408a271
commit
a587ea3782
|
|
@ -4,11 +4,20 @@ import os
|
||||||
import zipfile
|
import zipfile
|
||||||
import tempfile
|
import tempfile
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
from operator import itemgetter
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
# minimum number of textlines to be considered a textedge
|
||||||
|
TEXTEDGE_REQUIRED_ELEMENTS = 4
|
||||||
|
# y coordinate tolerance for extending text edge
|
||||||
|
TEXTEDGE_EXTEND_TOLERANCE = 50
|
||||||
|
# padding added to table area's lt and rb
|
||||||
|
TABLE_AREA_PADDING = 10
|
||||||
|
|
||||||
|
|
||||||
class TextEdge(object):
|
class TextEdge(object):
|
||||||
def __init__(self, x, y0, y1, align='left'):
|
def __init__(self, x, y0, y1, align='left'):
|
||||||
self.x = x
|
self.x = x
|
||||||
|
|
@ -23,12 +32,13 @@ class TextEdge(object):
|
||||||
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
|
||||||
|
|
||||||
def update_coords(self, x, y0):
|
def update_coords(self, x, y0):
|
||||||
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
|
||||||
self.y0 = y0
|
self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
|
||||||
self.intersections += 1
|
self.y0 = y0
|
||||||
# a textedge is valid if it extends uninterrupted over required_elements
|
self.intersections += 1
|
||||||
if self.intersections > 4:
|
# a textedge is valid if it extends uninterrupted over required_elements
|
||||||
self.is_valid = True
|
if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
|
||||||
|
self.is_valid = True
|
||||||
|
|
||||||
|
|
||||||
class TextEdges(object):
|
class TextEdges(object):
|
||||||
|
|
@ -43,59 +53,56 @@ class TextEdges(object):
|
||||||
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
|
x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
|
||||||
return x_coord[align]
|
return x_coord[align]
|
||||||
|
|
||||||
def add_textedge(self, textline, align):
|
def find(self, x_coord, align):
|
||||||
|
for i, te in enumerate(self._textedges[align]):
|
||||||
|
if np.isclose(te.x, x_coord):
|
||||||
|
return i
|
||||||
|
return None
|
||||||
|
|
||||||
|
def add(self, textline, align):
|
||||||
x = self.get_x_coord(textline, align)
|
x = self.get_x_coord(textline, align)
|
||||||
y0 = textline.y0
|
y0 = textline.y0
|
||||||
y1 = textline.y1
|
y1 = textline.y1
|
||||||
te = TextEdge(x, y0, y1, align=align)
|
te = TextEdge(x, y0, y1, align=align)
|
||||||
self._textedges[align].append(te)
|
self._textedges[align].append(te)
|
||||||
|
|
||||||
def find_textedge(self, x_coord, align):
|
def update(self, textline):
|
||||||
for i, te in enumerate(self._textedges[align]):
|
for align in ['left', 'right', 'middle']:
|
||||||
if np.isclose(te.x, x_coord):
|
|
||||||
return i
|
|
||||||
return None
|
|
||||||
|
|
||||||
def update_textedges(self, textline):
|
|
||||||
for align in ['left', 'middle', 'right']:
|
|
||||||
x_coord = self.get_x_coord(textline, align)
|
x_coord = self.get_x_coord(textline, align)
|
||||||
idx = self.find_textedge(x_coord, align)
|
idx = self.find(x_coord, align)
|
||||||
if idx is None:
|
if idx is None:
|
||||||
self.add_textedge(textline, align)
|
self.add(textline, align)
|
||||||
else:
|
else:
|
||||||
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
self._textedges[align][idx].update_coords(x_coord, textline.y0)
|
||||||
|
|
||||||
def generate_textedges(self, textlines):
|
def generate(self, textlines):
|
||||||
textlines_flat = list(chain.from_iterable(textlines))
|
textlines_flat = list(chain.from_iterable(textlines))
|
||||||
for tl in textlines_flat:
|
for tl in textlines_flat:
|
||||||
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
if len(tl.get_text().strip()) > 1: # TODO: hacky
|
||||||
self.update_textedges(tl)
|
self.update(tl)
|
||||||
|
|
||||||
|
def get_relevant(self):
|
||||||
|
intersections_sum = {
|
||||||
|
'left': sum(te.intersections for te in self._textedges['left']),
|
||||||
|
'right': sum(te.intersections for te in self._textedges['right']),
|
||||||
|
'middle': sum(te.intersections for te in self._textedges['middle'])
|
||||||
|
}
|
||||||
|
|
||||||
|
# TODO: naive
|
||||||
|
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
|
||||||
|
return self._textedges[relevant_align]
|
||||||
|
|
||||||
|
def get_table_areas(self, relevant_textedges):
|
||||||
# # debug
|
# # debug
|
||||||
# import matplotlib.pyplot as plt
|
# import matplotlib.pyplot as plt
|
||||||
|
|
||||||
# fig = plt.figure()
|
# fig = plt.figure()
|
||||||
# ax = fig.add_subplot(111, aspect='equal')
|
# ax = fig.add_subplot(111, aspect='equal')
|
||||||
# for te in self._textedges['left']:
|
# for te in relevant_textedges:
|
||||||
# if te.is_valid:
|
# if te.is_valid:
|
||||||
# ax.plot([te.x, te.x], [te.y0, te.y1])
|
# ax.plot([te.x, te.x], [te.y0, te.y1])
|
||||||
# plt.show()
|
# plt.show()
|
||||||
|
|
||||||
# fig = plt.figure()
|
|
||||||
# ax = fig.add_subplot(111, aspect='equal')
|
|
||||||
# for te in self._textedges['middle']:
|
|
||||||
# if te.is_valid:
|
|
||||||
# ax.plot([te.x, te.x], [te.y0, te.y1])
|
|
||||||
# plt.show()
|
|
||||||
|
|
||||||
# fig = plt.figure()
|
|
||||||
# ax = fig.add_subplot(111, aspect='equal')
|
|
||||||
# for te in self._textedges['right']:
|
|
||||||
# if te.is_valid:
|
|
||||||
# ax.plot([te.x, te.x], [te.y0, te.y1])
|
|
||||||
# plt.show()
|
|
||||||
|
|
||||||
def generate_tableareas(self):
|
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -251,11 +251,6 @@ class Stream(BaseParser):
|
||||||
# algorithm described by Anssi Nurminen's master's thesis:
|
# algorithm described by Anssi Nurminen's master's thesis:
|
||||||
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
# https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
|
||||||
|
|
||||||
# minimum number of textlines to be considered a textedge
|
|
||||||
REQUIRED_ELEMENTS_FOR_TEXTEDGE = 4
|
|
||||||
# padding added to table area's lt and rb
|
|
||||||
TABLE_AREA_PADDING = 10
|
|
||||||
|
|
||||||
# TODO: add support for arabic text #141
|
# TODO: add support for arabic text #141
|
||||||
# sort textlines in reading order
|
# sort textlines in reading order
|
||||||
textlines.sort(key=lambda x: (-x.y0, x.x0))
|
textlines.sort(key=lambda x: (-x.y0, x.x0))
|
||||||
|
|
@ -264,10 +259,11 @@ class Stream(BaseParser):
|
||||||
self.horizontal_text, row_close_tol=self.row_close_tol)
|
self.horizontal_text, row_close_tol=self.row_close_tol)
|
||||||
textedges = TextEdges()
|
textedges = TextEdges()
|
||||||
# generate left, middle and right textedges
|
# generate left, middle and right textedges
|
||||||
textedges.generate_textedges(text_grouped)
|
textedges.generate(text_grouped)
|
||||||
# select relevant edges
|
# select relevant edges
|
||||||
# generate table areas using relevant edges and horizontal text
|
relevant_textedges = textedges.get_relevant()
|
||||||
table_bbox = textedges.generate_tableareas()
|
# guess table areas using relevant edges
|
||||||
|
table_bbox = textedges.get_table_areas(relevant_textedges)
|
||||||
# treat whole page as table if not table areas found
|
# treat whole page as table if not table areas found
|
||||||
if not len(table_bbox):
|
if not len(table_bbox):
|
||||||
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue