From b2dd5f68fe0ceb5b09963f4d7d64663fb71f97af Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 1 Sep 2016 01:42:27 +0530 Subject: [PATCH] Fix vertical text detection in cells * Fix vertical text detection in cells * Add Cell instance method * Change var names --- camelot/cell.py | 7 +++++++ camelot/lattice.py | 49 +++++++++++++++++++++++++++------------------- camelot/utils.py | 34 +++++++++++++++++++------------- 3 files changed, 56 insertions(+), 34 deletions(-) diff --git a/camelot/cell.py b/camelot/cell.py index 515878d..bf01cf0 100644 --- a/camelot/cell.py +++ b/camelot/cell.py @@ -54,6 +54,7 @@ class Cell: self.top = False self.bottom = False self.text = '' + self.text_objects = [] self.spanning_h = False self.spanning_v = False @@ -75,6 +76,12 @@ class Cell: """ return self.text + def add_object(self, t_object): + self.text_objects.append(t_object) + + def get_objects(self): + return self.text_objects + def get_bounded_edges(self): """Returns number of edges by which a cell is bounded. diff --git a/camelot/lattice.py b/camelot/lattice.py index c68520e..b31a1a1 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -10,7 +10,7 @@ import numpy as np from wand.image import Image from .table import Table -from .utils import (transform, elements_bbox, detect_vertical, merge_close_values, +from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values, get_row_index, get_column_index, get_score, reduce_index, outline, fill_spanning, count_empty, encode_list, pdf_to_text) @@ -247,10 +247,10 @@ class Lattice: for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): # select edges which lie within table_bbox table_info = {} - text_bbox, v_s, h_s = elements_bbox(k, text, v_segments, - h_segments) - table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text))) - rotated = detect_vertical(text_bbox) + v_s, h_s = segments_bbox(k, v_segments, h_segments) + t_bbox = text_bbox(k, text) + table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text))) + table_rotation = detect_vertical(t_bbox) cols, rows = zip(*table_bbox[k]) cols, rows = list(cols), list(rows) cols.extend([k[0], k[2]]) @@ -277,17 +277,9 @@ class Lattice: if self.debug: self.debug_tables.append(table) - # fill text after sorting it - if rotated == '': - text_bbox.sort(key=lambda x: (-x.y0, x.x0)) - elif rotated == 'left': - text_bbox.sort(key=lambda x: (x.x0, x.y0)) - elif rotated == 'right': - text_bbox.sort(key=lambda x: (-x.x0, -x.y0)) - rerror = [] cerror = [] - for t in text_bbox: + for t in text: try: r_idx, rass_error = get_row_index(t, rows) except TypeError: @@ -300,19 +292,36 @@ class Lattice: continue rerror.append(rass_error) cerror.append(cass_error) - r_idx, c_idx = reduce_index( - table, rotated, r_idx, c_idx) - table.cells[r_idx][c_idx].add_text( - t.get_text().strip('\n')) + r_idx, c_idx = reduce_index(table, table_rotation, r_idx, c_idx) + table.cells[r_idx][c_idx].add_object(t) + + for i in range(len(table.cells)): + for j in range(len(table.cells[i])): + t_bbox = table.cells[i][j].get_objects() + try: + cell_rotation = detect_vertical(t_bbox) + except ZeroDivisionError: + cell_rotation = '' + pass + # fill text after sorting it + if cell_rotation == '': + t_bbox.sort(key=lambda x: (-x.y0, x.x0)) + elif cell_rotation == 'left': + t_bbox.sort(key=lambda x: (x.x0, x.y0)) + elif cell_rotation == 'right': + t_bbox.sort(key=lambda x: (-x.x0, -x.y0)) + table.cells[i][j].add_text(''.join([t.get_text() + for t in t_bbox])) + score = get_score([[50, rerror], [50, cerror]]) table_info['score'] = score if self.fill is not None: table = fill_spanning(table, fill=self.fill) ar = table.get_list() - if rotated == 'left': + if table_rotation == 'left': ar = zip(*ar[::-1]) - elif rotated == 'right': + elif table_rotation == 'right': ar = zip(*ar[::1]) ar.reverse() ar = encode_list(ar) diff --git a/camelot/utils.py b/camelot/utils.py index 29d82b4..99b7524 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -144,20 +144,20 @@ def detect_vertical(text): Returns ------- - rotated : string + rotation : string """ num_v = [t for t in text if (not t.upright) and t.get_text().strip()] num_h = [t for t in text if t.upright and t.get_text().strip()] vger = len(num_v) / float(len(num_v) + len(num_h)) - rotated = '' + rotation = '' if vger > 0.8: clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text) anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text) - rotated = 'left' if clockwise < anticlockwise else 'right' - return rotated + rotation = 'left' if clockwise < anticlockwise else 'right' + return rotation -def elements_bbox(bbox, text, v_segments, h_segments): +def segments_bbox(bbox, v_segments, h_segments): """Returns all text objects and line segments present inside a table's bounding box. @@ -181,14 +181,20 @@ def elements_bbox(bbox, text, v_segments, h_segments): """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) - text_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 - <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 - <= rt[1] + 2] v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] - return text_bbox, v_s, h_s + return v_s, h_s + + +def text_bbox(bbox, text): + lb = (bbox[0], bbox[1]) + rt = (bbox[2], bbox[3]) + t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 + <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 + <= rt[1] + 2] + return t_bbox def remove_close_values(ar, mtol=2): @@ -331,7 +337,7 @@ def get_score(error_weights): return score -def reduce_index(t, rotated, r_idx, c_idx): +def reduce_index(t, rotation, r_idx, c_idx): """Reduces index of a text object if it lies within a spanning cell taking in account table rotation. @@ -339,7 +345,7 @@ def reduce_index(t, rotated, r_idx, c_idx): ---------- t : object - rotated : string + rotation : string r_idx : int @@ -351,21 +357,21 @@ def reduce_index(t, rotated, r_idx, c_idx): c_idx : int """ - if not rotated: + if not rotation: if t.cells[r_idx][c_idx].spanning_h: while not t.cells[r_idx][c_idx].left: c_idx -= 1 if t.cells[r_idx][c_idx].spanning_v: while not t.cells[r_idx][c_idx].top: r_idx -= 1 - elif rotated == 'left': + elif rotation == 'left': if t.cells[r_idx][c_idx].spanning_h: while not t.cells[r_idx][c_idx].left: c_idx -= 1 if t.cells[r_idx][c_idx].spanning_v: while not t.cells[r_idx][c_idx].bottom: r_idx += 1 - elif rotated == 'right': + elif rotation == 'right': if t.cells[r_idx][c_idx].spanning_h: while not t.cells[r_idx][c_idx].right: c_idx += 1