Add TextEdge and TextEdges helper classes

2018-11-22 05:31:02 +05:30 · 2018-11-22 05:31:02 +05:30 · 123227aa8c
parent cd3aa38f7e
commit 123227aa8c
3 changed files with 134 additions and 5 deletions
--- a/camelot/version.py
+++ b/camelot/version.py
@ -1,11 +1,18 @@
 # -*- coding: utf-8 -*-
-VERSION = (0, 3, 2)
+VERSION = (0, 4, 0)
 PHASE = 'alpha' # alpha, beta or rc
 PHASE_VERSION = '1'
 __title__ = 'camelot-py'
 __description__ = 'PDF Table Extraction for Humans.'
 __url__ = 'http://camelot-py.readthedocs.io/'
-__version__ = '.'.join(map(str, VERSION))
+if PHASE:
    __version__ = '{}-{}'.format('.'.join(map(str, VERSION)), PHASE)
    if PHASE_VERSION:
        __version__ = '{}.{}'.format(__version__, PHASE_VERSION)
 else:
    __version__ = '.'.join(map(str, VERSION))
 __author__ = 'Vinayak Mehta'
 __author_email__ = 'vmehta94@gmail.com'
 __license__ = 'MIT License'
--- a/camelot/core.py
+++ b/camelot/core.py
@ -3,11 +3,104 @@
 import os
 import zipfile
 import tempfile
 from itertools import chain
 import numpy as np
 import pandas as pd
 class TextEdge(object):
    """A vertical line along which textlines share an x coordinate.

    Attributes
    ----------
    x : float
        x coordinate of the edge; kept as a running mean of the x values
        passed to `update_coords`.
    y0 : float
        y coordinate last passed to `update_coords` (initially the value
        given to the constructor). Presumably the bottom of the edge --
        TODO confirm against the page coordinate system.
    y1 : float
        y coordinate given to the constructor; never changed afterwards.
    align : str
        Alignment label of the edge, 'left' by default.
    intersections : int
        Number of times `update_coords` has been called.
    is_valid : bool
        True once `update_coords` has been called more than 4 times.

    """

    def __init__(self, x, y0, y1, align='left'):
        self.x, self.y0, self.y1 = x, y0, y1
        self.align = align
        # nothing has been merged into the edge yet, so it starts invalid
        self.intersections = 0
        self.is_valid = False

    def __repr__(self):
        template = '<TextEdge x={} y0={} y1={} align={} valid={}>'
        x, y0, y1 = (round(coord, 2) for coord in (self.x, self.y0, self.y1))
        return template.format(x, y0, y1, self.align, self.is_valid)

    def update_coords(self, x, y0):
        """Merge another x observation into the edge and move y0.

        Parameters
        ----------
        x : float
            x coordinate to fold into the running mean.
        y0 : float
            New value for the edge's y0.

        """
        seen = self.intersections
        # running mean weighted by the number of earlier updates; on the
        # first call the stored x carries zero weight and is replaced by x
        self.x = (seen * self.x + x) / float(seen + 1)
        self.y0 = y0
        self.intersections = seen + 1
        # a textedge is valid if it extends uninterrupted over required_elements
        if self.intersections <= 4:
            return
        self.is_valid = True
 class TextEdges(object):
    """Collection of text edges, kept in one list per alignment.

    Edges are stored in ``_textedges``, a dict with the keys 'left',
    'middle' and 'right', each mapping to a list of TextEdge objects.

    """

    def __init__(self):
        self._textedges = {'left': [], 'middle': [], 'right': []}

    @staticmethod
    def get_x_coord(textline, align):
        """Return the x coordinate of a textline for the given alignment.

        Parameters
        ----------
        textline : object
            Object exposing ``x0`` and ``x1`` attributes.
        align : str
            One of 'left', 'middle' or 'right'.

        Returns
        -------
        float
            ``x0`` for 'left', ``x1`` for 'right' and the midpoint of the
            two for 'middle'.

        Raises
        ------
        KeyError
            If `align` is not one of the three supported values.

        """
        x_left = textline.x0
        x_right = textline.x1
        x_middle = x_left + (x_right - x_left) / 2.0
        x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
        return x_coord[align]

    def add_textedge(self, textline, align):
        """Start a new text edge from a textline and store it under `align`."""
        x = self.get_x_coord(textline, align)
        y0 = textline.y0
        y1 = textline.y1
        te = TextEdge(x, y0, y1, align=align)
        self._textedges[align].append(te)

    def find_textedge(self, x_coord, align):
        """Return the index of the first stored edge whose x is close to
        `x_coord` (numpy.isclose with default tolerances), or None if no
        edge with that alignment matches.
        """
        for i, te in enumerate(self._textedges[align]):
            if np.isclose(te.x, x_coord):
                return i
        return None

    def update_textedges(self, textline):
        """Fold a textline into the left, middle and right edge lists.

        For each alignment, the matching edge is updated with the
        textline's x coordinate and y0; if there is no matching edge a new
        one is created from the textline.
        """
        for align in ('left', 'middle', 'right'):
            x_coord = self.get_x_coord(textline, align)
            idx = self.find_textedge(x_coord, align)
            if idx is None:
                self.add_textedge(textline, align)
            else:
                self._textedges[align][idx].update_coords(
                    x_coord, textline.y0)

    def generate_textedges(self, textlines):
        """Build text edges from textlines grouped into rows.

        Parameters
        ----------
        textlines : iterable of iterables
            Rows of textline objects; each textline must expose
            ``get_text()``, ``x0``, ``x1``, ``y0`` and ``y1``.

        """
        for tl in chain.from_iterable(textlines):
            # textlines with at most one non-whitespace-padded character
            # are skipped
            if len(tl.get_text().strip()) > 1:  # TODO: hacky
                self.update_textedges(tl)

    def generate_tableareas(self):
        """Return the detected table areas.

        Placeholder: currently always returns an empty dict.
        """
        return {}
 class Cell(object):
    """Defines a cell in a table with coordinates relative to a
    left-bottom origin. (PDF coordinate space)
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -9,7 +9,7 @@ import numpy as np
 import pandas as pd
 from .base import BaseParser
-from ..core import Table
+from ..core import TextEdges, Table
 from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
                     compute_whitespace)
@ -116,7 +116,7 @@ class Stream(BaseParser):
                    row_y = t.y0
                temp.append(t)
        rows.append(sorted(temp, key=lambda t: t.x0))
-        __ = rows.pop(0)  # hacky
+        __ = rows.pop(0)  # TODO: hacky
        return rows
    @staticmethod
@ -246,6 +246,34 @@ class Stream(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")
    def _nurminen_table_detection(self, textlines):
        """Detect table areas on the page from the alignment of textlines.

        A general heuristic implementation of the table detection
        algorithm described in Anssi Nurminen's master's thesis:
        https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

        Parameters
        ----------
        textlines : list
            Textline objects exposing ``x0`` and ``y0``. The list is
            sorted in place (descending ``y0``, then ascending ``x0``).

        Returns
        -------
        table_bbox : dict
            Table areas as returned by ``TextEdges.generate_tableareas``;
            when that is empty, a single entry
            ``(0, 0, pdf_width, pdf_height): None`` covering the whole
            page.

        """
        # minimum number of textlines to be considered a textedge
        # NOTE: not used by the code below yet
        REQUIRED_ELEMENTS_FOR_TEXTEDGE = 4
        # padding added to table area's lt and rb
        # NOTE: not used by the code below yet
        TABLE_AREA_PADDING = 10

        # TODO: add support for arabic text #141
        # sort textlines in reading order
        textlines.sort(key=lambda x: (-x.y0, x.x0))

        # group the textlines that were passed in (and just sorted) into
        # rows, instead of reaching back to self.horizontal_text
        text_grouped = self._group_rows(
            textlines, row_close_tol=self.row_close_tol)

        textedges = TextEdges()
        # generate left, middle and right textedges
        textedges.generate_textedges(text_grouped)

        # select relevant edges
        # generate table areas using relevant edges and horizontal text
        table_bbox = textedges.generate_tableareas()

        # treat whole page as table if no table areas were found
        if not table_bbox:
            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
        return table_bbox
    def _generate_table_bbox(self):
        if self.table_areas is not None:
            table_bbox = {}
@ -257,7 +285,8 @@ class Stream(BaseParser):
                y2 = float(y2)
                table_bbox[(x1, y2, x2, y1)] = None
        else:
-            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
+            # find tables based on nurminen's detection algorithm
            table_bbox = self._nurminen_table_detection(self.horizontal_text)
        self.table_bbox = table_bbox
    def _generate_columns_and_rows(self, table_idx, tk):