class TextEdge(object):
    """Defines a vertical text edge located at a single x coordinate.

    An edge starts out invalid and is flagged valid once it has been
    updated more than ``REQUIRED_ELEMENTS`` times.

    Parameters
    ----------
    x : float
        x coordinate of the edge.
    y0 : float
        One end of the edge along y; replaced on every update.
    y1 : float
        Other end of the edge along y; never modified after creation.
    align : str, optional (default: 'left')
        Alignment label stored on the edge.

    Attributes
    ----------
    intersections : int
        Number of times ``update_coords`` has been called.
    is_valid : bool
        True once ``intersections`` exceeds ``REQUIRED_ELEMENTS``.

    """

    # number of updates an edge has to exceed before it counts as valid
    REQUIRED_ELEMENTS = 4

    def __init__(self, x, y0, y1, align='left'):
        self.x = x
        self.y0 = y0
        self.y1 = y1
        self.align = align
        self.intersections = 0
        self.is_valid = False

    def __repr__(self):
        return '<TextEdge x={} y0={} y1={} align={} valid={}>'.format(
            round(self.x, 2), round(self.y0, 2), round(self.y1, 2),
            self.align, self.is_valid)

    def update_coords(self, x, y0):
        """Folds another observation into the edge.

        Parameters
        ----------
        x : float
            x coordinate of the new observation, averaged into ``self.x``.
        y0 : float
            New value for ``self.y0``.

        """
        # running mean weighted by the number of previous updates; the
        # count starts at 0, so the very first update replaces x outright
        self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
        self.y0 = y0
        self.intersections += 1
        # a textedge is valid if it extends uninterrupted over
        # more than REQUIRED_ELEMENTS updates
        if self.intersections > self.REQUIRED_ELEMENTS:
            self.is_valid = True
class TextEdges(object):
    """Defines a collection of left, middle and right text edges.

    Edges are kept in one list per alignment and are created or updated
    from text line objects that expose ``x0``, ``x1``, ``y0``, ``y1``
    and ``get_text()``.

    """

    ALIGNMENTS = ('left', 'middle', 'right')

    def __init__(self):
        self._textedges = {align: [] for align in self.ALIGNMENTS}

    @staticmethod
    def get_x_coord(textline, align):
        """Returns the x coordinate of a text line for an alignment.

        Parameters
        ----------
        textline : object
            Object with ``x0`` and ``x1`` attributes.
        align : str
            'left', 'middle' or 'right'.

        Returns
        -------
        x_coord : float

        Raises
        ------
        KeyError
            If ``align`` is not one of the three alignments.

        """
        x_left = textline.x0
        x_right = textline.x1
        x_middle = x_left + (x_right - x_left) / 2.0
        x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
        return x_coord[align]

    def add_textedge(self, textline, align):
        """Appends a new edge built from a text line to the list for
        the given alignment.
        """
        x = self.get_x_coord(textline, align)
        te = TextEdge(x, textline.y0, textline.y1, align=align)
        self._textedges[align].append(te)

    def find_textedge(self, x_coord, align):
        """Returns the index of the first edge of the given alignment
        whose x is close to ``x_coord`` (``np.isclose`` with default
        tolerances), or None if there is no such edge.
        """
        for i, te in enumerate(self._textedges[align]):
            if np.isclose(te.x, x_coord):
                return i
        return None

    def update_textedges(self, textline):
        """Updates the matching edge for every alignment of a text
        line, adding a new edge where none matches.
        """
        for align in self.ALIGNMENTS:
            x_coord = self.get_x_coord(textline, align)
            idx = self.find_textedge(x_coord, align)
            if idx is None:
                self.add_textedge(textline, align)
            else:
                self._textedges[align][idx].update_coords(x_coord, textline.y0)

    def generate_textedges(self, textlines):
        """Builds text edges from rows of text lines.

        Parameters
        ----------
        textlines : iterable of iterables
            Rows of text line objects; the rows are flattened in order.

        """
        for tl in chain.from_iterable(textlines):
            # text lines with at most one non-whitespace-trimmed
            # character are ignored
            if len(tl.get_text().strip()) > 1:  # TODO: hacky
                self.update_textedges(tl)

    def generate_tableareas(self):
        """Placeholder that always returns an empty dict."""
        return {}
def _nurminen_table_detection(self, textlines):
    """Detects table areas from text lines.

    A general heuristic implementation of the table detection
    algorithm described in Anssi Nurminen's master's thesis:
    https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    Parameters
    ----------
    textlines : list
        Text line objects with ``x0`` and ``y0`` attributes.  The list
        is sorted in place.

    Returns
    -------
    table_bbox : dict
        Whatever ``TextEdges.generate_tableareas`` returns; if that is
        empty, a single entry keyed by
        ``(0, 0, self.pdf_width, self.pdf_height)`` with value None.

    """
    # TODO: add support for arabic text #141
    # sort textlines in reading order
    textlines.sort(key=lambda x: (-x.y0, x.x0))
    # group the sorted textlines into rows
    text_grouped = self._group_rows(
        textlines, row_close_tol=self.row_close_tol)
    textedges = TextEdges()
    # generate left, middle and right textedges
    textedges.generate_textedges(text_grouped)
    # TODO: select relevant edges
    # generate table areas using relevant edges and horizontal text
    table_bbox = textedges.generate_tableareas()
    # treat whole page as table if no table areas found
    if not table_bbox:
        table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}

    return table_bbox
Stream(BaseParser): y2 = float(y2) table_bbox[(x1, y2, x2, y1)] = None else: - table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None} + # find tables based on nurminen's detection algorithm + table_bbox = self._nurminen_table_detection(self.horizontal_text) self.table_bbox = table_bbox def _generate_columns_and_rows(self, table_idx, tk):