Add TextEdge and TextEdges helper classes

pull/2/head
Vinayak Mehta 2018-11-22 05:31:02 +05:30
parent cd3aa38f7e
commit 123227aa8c
3 changed files with 134 additions and 5 deletions

View File

@ -1,11 +1,18 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
VERSION = (0, 3, 2) VERSION = (0, 4, 0)
PHASE = 'alpha' # alpha, beta or rc
PHASE_VERSION = '1'
__title__ = 'camelot-py' __title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.' __description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/' __url__ = 'http://camelot-py.readthedocs.io/'
__version__ = '.'.join(map(str, VERSION)) if PHASE:
__version__ = '{}-{}'.format('.'.join(map(str, VERSION)), PHASE)
if PHASE_VERSION:
__version__ = '{}.{}'.format(__version__, PHASE_VERSION)
else:
__version__ = '.'.join(map(str, VERSION))
__author__ = 'Vinayak Mehta' __author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com' __author_email__ = 'vmehta94@gmail.com'
__license__ = 'MIT License' __license__ = 'MIT License'

View File

@ -3,11 +3,104 @@
import os import os
import zipfile import zipfile
import tempfile import tempfile
from itertools import chain
import numpy as np import numpy as np
import pandas as pd import pandas as pd
class TextEdge(object):
    """A vertical edge at a given x coordinate spanning y0 to y1.

    Attributes
    ----------
    x : float
        x coordinate of the edge.
    y0 : float
        Lower y coordinate; replaced on every call to `update_coords`.
    y1 : float
        Upper y coordinate; set at construction and never changed here.
    align : str
        Alignment label the edge was created with (default 'left').
    intersections : int
        Number of times `update_coords` has been called.
    is_valid : bool
        Becomes True once more than four updates have been recorded.

    """

    def __init__(self, x, y0, y1, align='left'):
        self.align = align
        self.x = x
        self.y0 = y0
        self.y1 = y1
        self.is_valid = False
        self.intersections = 0

    def __repr__(self):
        template = '<TextEdge x={} y0={} y1={} align={} valid={}>'
        x, y0, y1 = [round(value, 2) for value in (self.x, self.y0, self.y1)]
        return template.format(x, y0, y1, self.align, self.is_valid)

    def update_coords(self, x, y0):
        """Fold a new x into the edge's x, move y0, and count the update.

        The stored x is weighted by the number of previous updates, so
        on the very first update the stored x is replaced by `x`.
        """
        seen = self.intersections
        self.x = (seen * self.x + x) / float(seen + 1)
        self.y0 = y0
        self.intersections = seen + 1
        # a textedge is valid if it extends uninterrupted over
        # the required number of elements
        if self.intersections > 4:
            self.is_valid = True
class TextEdges(object):
    """Collects left, middle and right text edges found across textlines.

    Edges are kept in `_textedges`, a dict keyed by alignment
    ('left', 'middle', 'right') whose values are lists of TextEdge
    objects.
    """

    ALIGNMENTS = ('left', 'middle', 'right')

    def __init__(self):
        self._textedges = {'left': [], 'middle': [], 'right': []}

    @staticmethod
    def get_x_coord(textline, align):
        """Return the x coordinate of `textline` for the given alignment.

        Parameters
        ----------
        textline : object
            Any object exposing `x0` and `x1` attributes.
        align : str
            One of 'left', 'middle' or 'right'.

        Raises
        ------
        KeyError
            If `align` is not one of the three supported alignments.

        """
        x_left = textline.x0
        x_right = textline.x1
        x_middle = x_left + (x_right - x_left) / 2.0
        x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
        return x_coord[align]

    def add_textedge(self, textline, align):
        """Append a new TextEdge built from `textline` for `align`."""
        x = self.get_x_coord(textline, align)
        te = TextEdge(x, textline.y0, textline.y1, align=align)
        self._textedges[align].append(te)

    def find_textedge(self, x_coord, align):
        """Return the index of the first edge whose x is close to `x_coord`.

        Closeness is decided by `np.isclose` with its default
        tolerances. Returns None when no stored edge matches.
        """
        for i, te in enumerate(self._textedges[align]):
            if np.isclose(te.x, x_coord):
                return i
        return None

    def update_textedges(self, textline):
        """Update the matching edge for each alignment, or add a new one."""
        for align in self.ALIGNMENTS:
            x_coord = self.get_x_coord(textline, align)
            idx = self.find_textedge(x_coord, align)
            if idx is None:
                self.add_textedge(textline, align)
            else:
                self._textedges[align][idx].update_coords(
                    x_coord, textline.y0)

    def generate_textedges(self, textlines):
        """Build edges from `textlines`, an iterable of textline rows.

        Rows are flattened in order; textlines whose stripped text has
        at most one character are skipped.
        """
        for tl in chain.from_iterable(textlines):
            if len(tl.get_text().strip()) > 1:  # TODO: hacky
                self.update_textedges(tl)

    def generate_tableareas(self):
        """Return table areas derived from the edges.

        Not implemented yet: always returns an empty dict.
        """
        return {}
class Cell(object): class Cell(object):
"""Defines a cell in a table with coordinates relative to a """Defines a cell in a table with coordinates relative to a
left-bottom origin. (PDF coordinate space) left-bottom origin. (PDF coordinate space)

View File

@ -9,7 +9,7 @@ import numpy as np
import pandas as pd import pandas as pd
from .base import BaseParser from .base import BaseParser
from ..core import Table from ..core import TextEdges, Table
from ..utils import (text_in_bbox, get_table_index, compute_accuracy, from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
compute_whitespace) compute_whitespace)
@ -116,7 +116,7 @@ class Stream(BaseParser):
row_y = t.y0 row_y = t.y0
temp.append(t) temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0)) rows.append(sorted(temp, key=lambda t: t.x0))
__ = rows.pop(0) # hacky __ = rows.pop(0) # TODO: hacky
return rows return rows
@staticmethod @staticmethod
@ -246,6 +246,34 @@ class Stream(BaseParser):
raise ValueError("Length of table_areas and columns" raise ValueError("Length of table_areas and columns"
" should be equal") " should be equal")
def _nurminen_table_detection(self, textlines):
    """Detect table areas from textlines using text-edge heuristics.

    A general heuristic implementation of the table detection
    algorithm described in Anssi Nurminen's master's thesis:
    https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    Parameters
    ----------
    textlines : list
        Textline objects exposing `x0` and `y0`. The list is sorted
        in place.

    Returns
    -------
    table_bbox : dict
        Whatever `TextEdges.generate_tableareas` returns, or a single
        entry keyed by (0, 0, pdf_width, pdf_height) with value None
        when that result is empty.

    """
    # TODO: add support for arabic text #141
    # sort textlines in reading order (descending y0, then ascending x0)
    textlines.sort(key=lambda tl: (-tl.y0, tl.x0))
    # group the textlines that were passed in into rows, rather than
    # reaching for self.horizontal_text and ignoring the argument
    text_grouped = self._group_rows(
        textlines, row_close_tol=self.row_close_tol)
    textedges = TextEdges()
    # generate left, middle and right textedges
    textedges.generate_textedges(text_grouped)
    # TODO: select relevant edges
    # generate table areas using relevant edges and horizontal text
    table_bbox = textedges.generate_tableareas()
    # treat whole page as table if no table areas were found
    if not table_bbox:
        table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
    return table_bbox
def _generate_table_bbox(self): def _generate_table_bbox(self):
if self.table_areas is not None: if self.table_areas is not None:
table_bbox = {} table_bbox = {}
@ -257,7 +285,8 @@ class Stream(BaseParser):
y2 = float(y2) y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None table_bbox[(x1, y2, x2, y1)] = None
else: else:
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} # find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(self.horizontal_text)
self.table_bbox = table_bbox self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk): def _generate_columns_and_rows(self, table_idx, tk):