Add TextEdge and TextEdges helper classes
parent
cd3aa38f7e
commit
123227aa8c
|
|
@ -1,10 +1,17 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Package metadata for camelot-py.
#
# VERSION is the release tuple; PHASE optionally names a pre-release stage
# and PHASE_VERSION its revision. Set PHASE to an empty value for a final
# release.
VERSION = (0, 4, 0)
PHASE = 'alpha'  # alpha, beta or rc
PHASE_VERSION = '1'

__title__ = 'camelot-py'
__description__ = 'PDF Table Extraction for Humans.'
__url__ = 'http://camelot-py.readthedocs.io/'

# Build the dotted release string once, then append the optional
# pre-release suffixes: "X.Y.Z", "X.Y.Z-phase" or "X.Y.Z-phase.N".
__version__ = '.'.join(map(str, VERSION))
if PHASE:
    __version__ = '{}-{}'.format(__version__, PHASE)
    if PHASE_VERSION:
        __version__ = '{}.{}'.format(__version__, PHASE_VERSION)

__author__ = 'Vinayak Mehta'
__author_email__ = 'vmehta94@gmail.com'
|
||||
|
|
|
|||
|
|
@ -3,11 +3,104 @@
|
|||
import os
|
||||
import zipfile
|
||||
import tempfile
|
||||
from itertools import chain
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class TextEdge(object):
    """A candidate edge located at an x-coordinate shared by textlines.

    Parameters
    ----------
    x : float
        x-coordinate of the edge.
    y0 : float
        First y-coordinate of the edge; overwritten by every call to
        update_coords.
    y1 : float
        Second y-coordinate of the edge; never changed after construction.
    align : str, optional (default: 'left')
        Label describing which part of the textline the edge tracks.

    Attributes
    ----------
    intersections : int
        Number of times update_coords has been called on this edge.
    is_valid : bool
        True once intersections exceeds REQUIRED_ELEMENTS.

    """

    # An edge becomes valid once it has been updated more than this many
    # times. Kept as a class attribute so it can be overridden without
    # touching update_coords.
    REQUIRED_ELEMENTS = 4

    def __init__(self, x, y0, y1, align='left'):
        self.x = x
        self.y0 = y0
        self.y1 = y1
        self.align = align
        self.intersections = 0
        self.is_valid = False

    def __repr__(self):
        return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
            round(self.x, 2), round(self.y0, 2), round(self.y1, 2),
            self.align, self.is_valid)

    def update_coords(self, x, y0):
        """Fold another textline into the edge.

        Parameters
        ----------
        x : float
            x-coordinate contributed by the new textline.
        y0 : float
            New value for the edge's y0.

        """
        # Running mean weighted by the number of previous updates.
        # NOTE(review): intersections starts at 0, so on the first update
        # the x given to the constructor carries no weight.
        self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
        self.y0 = y0
        self.intersections += 1
        # a textedge is valid once it has been extended by more than
        # REQUIRED_ELEMENTS textlines
        if self.intersections > self.REQUIRED_ELEMENTS:
            self.is_valid = True
|
||||
|
||||
|
||||
class TextEdges(object):
    """Collection of TextEdge objects, kept in one list per alignment.

    Edges are stored in a dict keyed by 'left', 'middle' and 'right'.

    """

    def __init__(self):
        self._textedges = {'left': [], 'middle': [], 'right': []}

    @staticmethod
    def get_x_coord(textline, align):
        """Return the x-coordinate of a textline for the given alignment.

        Parameters
        ----------
        textline : object
            Object exposing x0 and x1 attributes.
        align : str
            One of 'left', 'middle' or 'right'.

        Returns
        -------
        float
            x0 for 'left', x1 for 'right', their midpoint for 'middle'.

        Raises
        ------
        KeyError
            If align is not one of the three supported values.

        """
        x_left = textline.x0
        x_right = textline.x1
        x_middle = x_left + (x_right - x_left) / 2.0
        x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
        return x_coord[align]

    def add_textedge(self, textline, align):
        """Append a new TextEdge built from a textline's coordinates."""
        x = self.get_x_coord(textline, align)
        y0 = textline.y0
        y1 = textline.y1
        te = TextEdge(x, y0, y1, align=align)
        self._textedges[align].append(te)

    def find_textedge(self, x_coord, align):
        """Return the index of the first stored edge matching x_coord.

        The match uses np.isclose with its default tolerances.

        Returns
        -------
        int or None
            Index into the list for align, or None when no edge matches.

        """
        for i, te in enumerate(self._textedges[align]):
            if np.isclose(te.x, x_coord):
                return i
        return None

    def update_textedges(self, textline):
        """Update or create the left, middle and right edges for a textline.

        For each alignment, an existing edge at the textline's x-coordinate
        is extended with the textline's y0; otherwise a new edge is added.

        """
        for align in ['left', 'middle', 'right']:
            x_coord = self.get_x_coord(textline, align)
            idx = self.find_textedge(x_coord, align)
            if idx is None:
                self.add_textedge(textline, align)
            else:
                self._textedges[align][idx].update_coords(x_coord, textline.y0)

    def generate_textedges(self, textlines):
        """Build text edges from textlines grouped into rows.

        Parameters
        ----------
        textlines : iterable of iterables
            Rows of textline objects; each textline must expose get_text()
            plus the attributes read by update_textedges.

        """
        for tl in chain.from_iterable(textlines):
            # skip textlines with at most one non-whitespace-trimmed character
            if len(tl.get_text().strip()) > 1:  # TODO: hacky
                self.update_textedges(tl)

    def generate_tableareas(self):
        """Return table areas derived from the text edges.

        Currently a placeholder that always returns an empty dict.

        """
        return {}
|
||||
|
||||
|
||||
class Cell(object):
|
||||
"""Defines a cell in a table with coordinates relative to a
|
||||
left-bottom origin. (PDF coordinate space)
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import numpy as np
|
|||
import pandas as pd
|
||||
|
||||
from .base import BaseParser
|
||||
from ..core import Table
|
||||
from ..core import TextEdges, Table
|
||||
from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
|
||||
compute_whitespace)
|
||||
|
||||
|
|
@ -116,7 +116,7 @@ class Stream(BaseParser):
|
|||
row_y = t.y0
|
||||
temp.append(t)
|
||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||
__ = rows.pop(0) # hacky
|
||||
__ = rows.pop(0) # TODO: hacky
|
||||
return rows
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -246,6 +246,34 @@ class Stream(BaseParser):
|
|||
raise ValueError("Length of table_areas and columns"
|
||||
" should be equal")
|
||||
|
||||
def _nurminen_table_detection(self, textlines):
    """Detect table areas from textlines.

    A general heuristic implementation of the table detection
    algorithm described by Anssi Nurminen's master's thesis:
    https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    Parameters
    ----------
    textlines : list
        Textline objects exposing x0 and y0. The list is sorted in place.

    Returns
    -------
    table_bbox : dict
        Keys are 4-tuples describing table areas and values are None.
        Falls back to a single (0, 0, pdf_width, pdf_height) entry when
        no table areas are found.

    """
    # TODO: add support for arabic text #141
    # sort textlines in reading order (descending y0, then ascending x0)
    textlines.sort(key=lambda x: (-x.y0, x.x0))
    # group the textlines that were passed in (not self.horizontal_text)
    # into rows, so the method honours its argument
    text_grouped = self._group_rows(
        textlines, row_close_tol=self.row_close_tol)
    textedges = TextEdges()
    # generate left, middle and right textedges
    textedges.generate_textedges(text_grouped)
    # TODO: select relevant edges
    # generate table areas using relevant edges and horizontal text
    table_bbox = textedges.generate_tableareas()
    # treat whole page as table if no table areas found
    if not table_bbox:
        table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}

    return table_bbox
|
||||
|
||||
def _generate_table_bbox(self):
|
||||
if self.table_areas is not None:
|
||||
table_bbox = {}
|
||||
|
|
@ -257,7 +285,8 @@ class Stream(BaseParser):
|
|||
y2 = float(y2)
|
||||
table_bbox[(x1, y2, x2, y1)] = None
|
||||
else:
|
||||
table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
|
||||
# find tables based on nurminen's detection algorithm
|
||||
table_bbox = self._nurminen_table_detection(self.horizontal_text)
|
||||
self.table_bbox = table_bbox
|
||||
|
||||
def _generate_columns_and_rows(self, table_idx, tk):
|
||||
|
|
|
|||
Loading…
Reference in New Issue