From a587ea3782a84b348fc455d3f8f0a3371f1b77e0 Mon Sep 17 00:00:00 2001
From: Vinayak Mehta
Date: Thu, 22 Nov 2018 18:24:31 +0530
Subject: [PATCH] Add get_relevant textedges method

---
 camelot/core.py           | 77 +++++++++++++++++++++------------------
 camelot/parsers/stream.py | 12 ++----
 2 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/camelot/core.py b/camelot/core.py
index f50f77b..9a9882d 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -4,11 +4,20 @@ import os
 import zipfile
 import tempfile
 from itertools import chain
+from operator import itemgetter
 
 import numpy as np
 import pandas as pd
 
 
+# minimum number of textlines to be considered a textedge
+TEXTEDGE_REQUIRED_ELEMENTS = 4
+# y coordinate tolerance for extending text edge
+TEXTEDGE_EXTEND_TOLERANCE = 50
+# padding added to table area's lt and rb
+TABLE_AREA_PADDING = 10
+
+
 class TextEdge(object):
     def __init__(self, x, y0, y1, align='left'):
         self.x = x
@@ -23,12 +32,13 @@ class TextEdge(object):
             round(self.x, 2), round(self.y0, 2), round(self.y1, 2), self.align, self.is_valid)
 
     def update_coords(self, x, y0):
-        self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
-        self.y0 = y0
-        self.intersections += 1
-        # a textedge is valid if it extends uninterrupted over required_elements
-        if self.intersections > 4:
-            self.is_valid = True
+        if np.isclose(self.y0, y0, atol=TEXTEDGE_EXTEND_TOLERANCE):
+            self.x = (self.intersections * self.x + x) / float(self.intersections + 1)
+            self.y0 = y0
+            self.intersections += 1
+            # a textedge is valid if it extends uninterrupted over required_elements
+            if self.intersections > TEXTEDGE_REQUIRED_ELEMENTS:
+                self.is_valid = True
 
 
 class TextEdges(object):
@@ -43,59 +53,56 @@ class TextEdges(object):
         x_coord = {'left': x_left, 'middle': x_middle, 'right': x_right}
         return x_coord[align]
 
-    def add_textedge(self, textline, align):
+    def find(self, x_coord, align):
+        for i, te in enumerate(self._textedges[align]):
+            if np.isclose(te.x, x_coord):
+                return i
+        return None
+
+    def add(self, textline, align):
         x = self.get_x_coord(textline, align)
         y0 = textline.y0
         y1 = textline.y1
         te = TextEdge(x, y0, y1, align=align)
         self._textedges[align].append(te)
 
-    def find_textedge(self, x_coord, align):
-        for i, te in enumerate(self._textedges[align]):
-            if np.isclose(te.x, x_coord):
-                return i
-        return None
-
-    def update_textedges(self, textline):
-        for align in ['left', 'middle', 'right']:
+    def update(self, textline):
+        for align in ['left', 'right', 'middle']:
             x_coord = self.get_x_coord(textline, align)
-            idx = self.find_textedge(x_coord, align)
+            idx = self.find(x_coord, align)
             if idx is None:
-                self.add_textedge(textline, align)
+                self.add(textline, align)
             else:
                 self._textedges[align][idx].update_coords(x_coord, textline.y0)
 
-    def generate_textedges(self, textlines):
+    def generate(self, textlines):
         textlines_flat = list(chain.from_iterable(textlines))
         for tl in textlines_flat:
             if len(tl.get_text().strip()) > 1: # TODO: hacky
-                self.update_textedges(tl)
+                self.update(tl)
 
+    def get_relevant(self):
+        intersections_sum = {
+            'left': sum(te.intersections for te in self._textedges['left']),
+            'right': sum(te.intersections for te in self._textedges['right']),
+            'middle': sum(te.intersections for te in self._textedges['middle'])
+        }
+
+        # TODO: naive
+        relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
+        return self._textedges[relevant_align]
+
+    def get_table_areas(self, relevant_textedges):
         # # debug
         # import matplotlib.pyplot as plt
 
         # fig = plt.figure()
         # ax = fig.add_subplot(111, aspect='equal')
-        # for te in self._textedges['left']:
+        # for te in relevant_textedges:
         #     if te.is_valid:
         #         ax.plot([te.x, te.x], [te.y0, te.y1])
         # plt.show()
 
-        # fig = plt.figure()
-        # ax = fig.add_subplot(111, aspect='equal')
-        # for te in self._textedges['middle']:
-        #     if te.is_valid:
-        #         ax.plot([te.x, te.x], [te.y0, te.y1])
-        # plt.show()
-
-        # fig = plt.figure()
-        # ax = fig.add_subplot(111, aspect='equal')
-        # for te in self._textedges['right']:
-        #     if te.is_valid:
-        #         ax.plot([te.x, te.x], [te.y0, te.y1])
-        # plt.show()
-
-    def generate_tableareas(self):
         return {}
 
 
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 55ef7ca..982b5f6 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -251,11 +251,6 @@ class Stream(BaseParser):
         # algorithm described by Anssi Nurminen's master's thesis:
         # https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
 
-        # minimum number of textlines to be considered a textedge
-        REQUIRED_ELEMENTS_FOR_TEXTEDGE = 4
-        # padding added to table area's lt and rb
-        TABLE_AREA_PADDING = 10
-
         # TODO: add support for arabic text #141
         # sort textlines in reading order
         textlines.sort(key=lambda x: (-x.y0, x.x0))
@@ -264,10 +259,11 @@ class Stream(BaseParser):
             self.horizontal_text, row_close_tol=self.row_close_tol)
         textedges = TextEdges()
         # generate left, middle and right textedges
-        textedges.generate_textedges(text_grouped)
+        textedges.generate(text_grouped)
         # select relevant edges
-        # generate table areas using relevant edges and horizontal text
-        table_bbox = textedges.generate_tableareas()
+        relevant_textedges = textedges.get_relevant()
+        # guess table areas using relevant edges
+        table_bbox = textedges.get_table_areas(relevant_textedges)
         # treat whole page as table if not table areas found
         if not len(table_bbox):
             table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}