Add TextEdge and TextEdges helper classes

pull/2/head
Vinayak Mehta 2018-11-22 05:31:02 +05:30
parent cd3aa38f7e
commit 123227aa8c
3 changed files with 134 additions and 5 deletions

View File

@ -1,10 +1,17 @@
# -*- coding: utf-8 -*-
# Release version as a (major, minor, patch) tuple.
VERSION = (0, 4, 0)
PHASE = 'alpha'  # alpha, beta or rc
PHASE_VERSION = '1'

__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/'

# Build the version string:
#   - no PHASE                 -> "X.Y.Z"
#   - PHASE only               -> "X.Y.Z-<PHASE>"
#   - PHASE and PHASE_VERSION  -> "X.Y.Z-<PHASE>.<PHASE_VERSION>"
if PHASE:
    __version__ = '{}-{}'.format('.'.join(map(str, VERSION)), PHASE)
    if PHASE_VERSION:
        __version__ = '{}.{}'.format(__version__, PHASE_VERSION)
else:
    __version__ = '.'.join(map(str, VERSION))

__author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com'

View File

@ -3,11 +3,104 @@
import os
import zipfile
import tempfile
from itertools import chain
import numpy as np
import pandas as pd
class TextEdge(object):
    """Defines a text edge: an x coordinate together with a y0/y1
    extent, an alignment label and a running count of updates.

    Parameters
    ----------
    x : float
        x-coordinate of the text edge.
    y0 : float
        Initial y0 of the text edge; replaced on every call to
        `update_coords`.
    y1 : float
        y1 of the text edge; never changed after construction.
    align : str, optional (default: 'left')
        Alignment label stored on the edge and shown in its repr.

    Attributes
    ----------
    intersections : int
        Number of times `update_coords` has been called.
    is_valid : bool
        False until `update_coords` has been called more than the
        required number of times; never reset to False afterwards.

    """

    def __init__(self, x, y0, y1, align='left'):
        self.x = x
        self.y0 = y0
        self.y1 = y1
        self.align = align
        self.intersections = 0
        self.is_valid = False

    def __repr__(self):
        return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
            round(self.x, 2), round(self.y0, 2), round(self.y1, 2),
            self.align, self.is_valid)

    def update_coords(self, x, y0, required_elements=4):
        """Folds one more aligned element into the text edge.

        Parameters
        ----------
        x : float
            x-coordinate of the new element; averaged into `self.x`.
        y0 : float
            New value for `self.y0`.
        required_elements : int, optional (default: 4)
            The edge is marked valid once it has been updated more
            than this many times.

        """
        # running mean of x over the updates seen so far
        # NOTE: intersections starts at 0, so the x given to the
        # constructor has zero weight and the first update replaces it
        self.x = (self.intersections * self.x + x) / float(
            self.intersections + 1)
        self.y0 = y0
        self.intersections += 1
        # a textedge is valid if it extends uninterrupted over
        # required_elements
        if self.intersections > required_elements:
            self.is_valid = True
class TextEdges(object):
    """Defines a collection of text edges, kept in three separate
    lists keyed by alignment ('left', 'middle' and 'right').
    """

    def __init__(self):
        self._textedges = {'left': [], 'middle': [], 'right': []}

    @staticmethod
    def get_x_coord(textline, align):
        """Returns the x coordinate of a textline for an alignment.

        Parameters
        ----------
        textline : object
            Object exposing `x0` and `x1` attributes.
        align : str
            'left' (x0), 'right' (x1) or 'middle' (their midpoint).

        Returns
        -------
        x_coord : float

        Raises
        ------
        KeyError
            If align is not one of 'left', 'middle' or 'right'.

        """
        x_left = textline.x0
        x_right = textline.x1
        x_middle = x_left + (x_right - x_left) / 2.0
        x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
        return x_coord[align]

    def add_textedge(self, textline, align):
        """Appends a new TextEdge built from the textline's x coordinate
        for the given alignment and its y0/y1.
        """
        x = self.get_x_coord(textline, align)
        y0 = textline.y0
        y1 = textline.y1
        te = TextEdge(x, y0, y1, align=align)
        self._textedges[align].append(te)

    def find_textedge(self, x_coord, align):
        """Returns the index of the first text edge with the given
        alignment whose x is close to x_coord (np.isclose with its
        default tolerances), or None if there is no such edge.
        """
        for i, te in enumerate(self._textedges[align]):
            if np.isclose(te.x, x_coord):
                return i
        return None

    def update_textedges(self, textline):
        """Updates the matching text edge for each alignment of the
        textline, adding a new text edge when none matches.
        """
        for align in ['left', 'middle', 'right']:
            x_coord = self.get_x_coord(textline, align)
            idx = self.find_textedge(x_coord, align)
            if idx is None:
                self.add_textedge(textline, align)
            else:
                self._textedges[align][idx].update_coords(
                    x_coord, textline.y0)

    def generate_textedges(self, textlines):
        """Generates text edges from textlines.

        Parameters
        ----------
        textlines : iterable
            Iterable of iterables of textlines; it is flattened
            and processed in iteration order.

        """
        for tl in chain.from_iterable(textlines):
            # skip textlines whose stripped text is a single character
            # or empty
            if len(tl.get_text().strip()) > 1:  # TODO: hacky
                self.update_textedges(tl)

    def generate_tableareas(self):
        """Returns table areas derived from the text edges.

        Not implemented yet: always returns an empty dict.
        """
        return {}
class Cell(object):
"""Defines a cell in a table with coordinates relative to a
left-bottom origin. (PDF coordinate space)

View File

@ -9,7 +9,7 @@ import numpy as np
import pandas as pd
from .base import BaseParser
from ..core import Table
from ..core import TextEdges, Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace)
@ -116,7 +116,7 @@ class Stream(BaseParser):
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # hacky
__ = rows.pop(0) # TODO: hacky
return rows
@staticmethod
@ -246,6 +246,34 @@ class Stream(BaseParser):
raise ValueError("Length of table_areas and columns"
" should be equal")
def _nurminen_table_detection(self, textlines):
    """Detects table areas from textlines.

    A general heuristic implementation of the table detection
    algorithm described by Anssi Nurminen's master's thesis:
    https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    Parameters
    ----------
    textlines : list
        Textlines exposing `x0` and `y0`. The list is sorted in place.

    Returns
    -------
    table_bbox : dict
        Dict with table area tuples as keys. Falls back to a single
        (0, 0, pdf_width, pdf_height) key mapped to None when no
        table areas are found.

    """
    # TODO: make the minimum number of textlines for a textedge
    # (planned: 4) and the padding added to a table area's lt and rb
    # (planned: 10) configurable once edge selection is implemented
    # TODO: add support for arabic text #141
    # sort textlines in reading order (descending y0, then ascending x0)
    # NOTE: this sorts the caller's list in place
    textlines.sort(key=lambda x: (-x.y0, x.x0))
    # group the sorted textlines into rows; use the argument rather
    # than self.horizontal_text so the method honours its parameter
    text_grouped = self._group_rows(
        textlines, row_close_tol=self.row_close_tol)
    textedges = TextEdges()
    # generate left, middle and right textedges
    textedges.generate_textedges(text_grouped)
    # TODO: select relevant edges
    # generate table areas using relevant edges and horizontal text
    table_bbox = textedges.generate_tableareas()
    # treat whole page as table if no table areas found
    if not table_bbox:
        table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
    return table_bbox
def _generate_table_bbox(self):
if self.table_areas is not None:
table_bbox = {}
@ -257,7 +285,8 @@ class Stream(BaseParser):
y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None
else:
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(self.horizontal_text)
self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk):