From 557189da2434794dbac8e7a2ae8d703aaa45f43e Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Thu, 6 Sep 2018 07:42:41 +0530 Subject: [PATCH] Refactor core --- camelot/core.py | 203 +++++++++++++++---------------------- camelot/parsers/lattice.py | 29 +++--- camelot/parsers/stream.py | 7 +- 3 files changed, 97 insertions(+), 142 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 87cb6ae..9f98f16 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -16,64 +16,47 @@ class Cell(object): self.lt = (x1, y2) self.rb = (x2, y1) self.rt = (x2, y2) - self.bbox = (x1, y1, x2, y2) self.left = False self.right = False self.top = False self.bottom = False - self.text_objects = [] - self.text = '' - self.spanning_h = False - self.spanning_v = False + self.hspan = False + self.vspan = False + self._text = '' def __repr__(self): - pass + return ''.format(self.__class__.__name__, self._shape) - def set_all_edges(self): - """ - - Returns - ------- - - """ - for r in range(len(self.rows)): - for c in range(len(self.cols)): - self.cells[r][c].left = True - self.cells[r][c].right = True - self.cells[r][c].top = True - self.cells[r][c].bottom = True - return self - - def set_border_edges(self): + def set_border(self): """ Returns @@ -125,6 +93,18 @@ class Table(object): self.cells[len(self.rows) - 1][c].bottom = True return self + def set_all_edges(self): + """ + + Returns + ------- + + """ + for row in self.cells: + for cell in row: + cell.left = cell.right = cell.top = cell.bottom = True + return self + def set_edges(self, vertical, horizontal, jtol=2): """ @@ -140,7 +120,7 @@ class Table(object): """ for v in vertical: # find closest x coord - # iterate over y coords and find closest points + # iterate over y coords and find closest start and end points i = [i for i, t in enumerate(self.cols) if np.isclose(v[0], t[0], atol=jtol)] j = [j for j, t in enumerate(self.rows) @@ -148,51 +128,50 @@ class Table(object): k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=jtol)] if not j: - self.nocont_ += 1 continue J = j[0] if i == [0]: # only left edge - I = i[0] + L = i[0] if k: K = k[0] while J < K: - self.cells[J][I].left = True + self.cells[J][L].left = True J += 1 else: K = len(self.rows) while J < K: - self.cells[J][I].left = True + self.cells[J][L].left = True J += 1 elif i == []: # only right edge - I = len(self.cols) - 1 + L = len(self.cols) - 1 if k: K = k[0] while J < K: - self.cells[J][I].right = True + self.cells[J][L].right = True J += 1 else: K = len(self.rows) while J < K: - self.cells[J][I].right = True + self.cells[J][L].right = True J += 1 else: # both left and right edges - I = i[0] + L = i[0] if k: K = k[0] while J < K: - self.cells[J][I].left = True - self.cells[J][I - 1].right = True + self.cells[J][L].left = True + self.cells[J][L - 1].right = True J += 1 else: K = len(self.rows) while J < K: - self.cells[J][I].left = True - self.cells[J][I - 1].right = True + self.cells[J][L].left = True + self.cells[J][L - 1].right = True J += 1 for h in horizontal: # find closest y coord - # iterate over x coords and find closest points + # iterate over x coords and find closest start and end points i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=jtol)] j = [j for j, t in enumerate(self.cols) @@ -200,93 +179,78 @@ class Table(object): k = [k for k, t in enumerate(self.cols) if np.isclose(h[2], t[0], atol=jtol)] if not j: - self.nocont_ += 1 continue J = j[0] if i == [0]: # only top edge - I = i[0] + L = i[0] if k: K = k[0] while J < K: - self.cells[I][J].top = True + self.cells[L][J].top = True J += 1 else: K = len(self.cols) while J < K: - self.cells[I][J].top = True + self.cells[L][J].top = True J += 1 elif i == []: # only bottom edge I = len(self.rows) - 1 if k: K = k[0] while J < K: - self.cells[I][J].bottom = True + self.cells[L][J].bottom = True J += 1 else: K = len(self.cols) while J < K: - self.cells[I][J].bottom = True + self.cells[L][J].bottom = True J += 1 else: # both top and bottom edges - I = i[0] + L = i[0] if k: K = k[0] while J < K: - self.cells[I][J].top = True - self.cells[I - 1][J].bottom = True + self.cells[L][J].top = True + self.cells[L - 1][J].bottom = True J += 1 else: K = len(self.cols) while J < K: - self.cells[I][J].top = True - self.cells[I - 1][J].bottom = True + self.cells[L][J].top = True + self.cells[L - 1][J].bottom = True J += 1 return self - def set_spanning(self): + def set_span(self): """ Returns ------- """ - for r in range(len(self.rows)): - for c in range(len(self.cols)): - bound = self.cells[r][c].get_bounded_edges() - if bound == 4: + for row in self.cells: + for cell in row: + left = cell.left + right = cell.right + top = cell.top + bottom = cell.bottom + if cell.bound == 4: continue - elif bound == 3: - if not self.cells[r][c].left: - if (self.cells[r][c].right and - self.cells[r][c].top and - self.cells[r][c].bottom): - self.cells[r][c].spanning_h = True - elif not self.cells[r][c].right: - if (self.cells[r][c].left and - self.cells[r][c].top and - self.cells[r][c].bottom): - self.cells[r][c].spanning_h = True - elif not self.cells[r][c].top: - if (self.cells[r][c].left and - self.cells[r][c].right and - self.cells[r][c].bottom): - self.cells[r][c].spanning_v = True - elif not self.cells[r][c].bottom: - if (self.cells[r][c].left and - self.cells[r][c].right and - self.cells[r][c].top): - self.cells[r][c].spanning_v = True - elif bound == 2: - if self.cells[r][c].left and self.cells[r][c].right: - if (not self.cells[r][c].top and - not self.cells[r][c].bottom): - self.cells[r][c].spanning_v = True - elif self.cells[r][c].top and self.cells[r][c].bottom: - if (not self.cells[r][c].left and - not self.cells[r][c].right): - self.cells[r][c].spanning_h = True - + elif cell.bound == 3: + if not left and (right and top and bottom): + cell.hspan = True + elif not right and (left and top and bottom): + cell.hspan = True + elif not top and (left and right and bottom): + cell.vspan = True + elif not bottom and (left and right and top): + cell.vspan = True + elif cell.bound == 2: + if left and right and (not top and not bottom): + cell.vspan = True + elif top and bottom and (not left and not right): + cell.hspan = True return self @property @@ -298,9 +262,8 @@ class Table(object): """ d = [] - for r in range(len(self.rows)): - d.append([self.cells[r][c].get_text().strip() - for c in range(len(self.cols))]) + for row in self.cells: + d.append([cell.text.strip() for cell in row]) return d @property @@ -405,7 +368,7 @@ class Table(object): return report -class TableList(list): +class TableList(object): """ """ diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index a8067ab..282a96a 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -1,10 +1,7 @@ from __future__ import division import os -import sys import copy -import types import logging -import copy_reg import warnings import subprocess @@ -52,19 +49,19 @@ class Lattice(BaseParser): for r_idx, c_idx, text in idx: for d in shift_text: if d == 'l': - if t.cells[r_idx][c_idx].spanning_h: + if t.cells[r_idx][c_idx].hspan: while not t.cells[r_idx][c_idx].left: c_idx -= 1 if d == 'r': - if t.cells[r_idx][c_idx].spanning_h: + if t.cells[r_idx][c_idx].hspan: while not t.cells[r_idx][c_idx].right: c_idx += 1 if d == 't': - if t.cells[r_idx][c_idx].spanning_v: + if t.cells[r_idx][c_idx].vspan: while not t.cells[r_idx][c_idx].top: r_idx -= 1 if d == 'b': - if t.cells[r_idx][c_idx].spanning_v: + if t.cells[r_idx][c_idx].vspan: while not t.cells[r_idx][c_idx].bottom: r_idx += 1 indices.append((r_idx, c_idx, text)) @@ -76,15 +73,15 @@ class Lattice(BaseParser): if f == "h": for i in range(len(t.cells)): for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_h and not t.cells[i][j].left: - t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) + if t.cells[i][j].text.strip() == '': + if t.cells[i][j].hspan and not t.cells[i][j].left: + t.cells[i][j].text = t.cells[i][j - 1].text elif f == "v": for i in range(len(t.cells)): for j in range(len(t.cells[i])): - if t.cells[i][j].get_text().strip() == '': - if t.cells[i][j].spanning_v and not t.cells[i][j].top: - t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) + if t.cells[i][j].text.strip() == '': + if t.cells[i][j].vspan and not t.cells[i][j].top: + t.cells[i][j].text = t.cells[i - 1][j].text return t def _generate_image(self): @@ -173,9 +170,9 @@ class Lattice(BaseParser): # set table edges to True using ver+hor lines table = table.set_edges(v_s, h_s, jtol=self.jtol) # set spanning cells to True - table = table.set_spanning() + table = table.set_span() # set table border edges to True - table = table.set_border_edges() + table = table.set_border() pos_errors = [] for direction in self.t_bbox: @@ -187,7 +184,7 @@ class Lattice(BaseParser): pos_errors.append(error) indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) for r_idx, c_idx, text in indices: - table.cells[r_idx][c_idx].add_text(text) + table.cells[r_idx][c_idx].text = text accuracy = compute_accuracy([[100, pos_errors]]) if self.fill is not None: diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 3daa9e6..1849a0c 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -1,12 +1,7 @@ from __future__ import division import os -import sys -import copy -import types import logging -import copy_reg import warnings -import subprocess import numpy as np import pandas as pd @@ -206,7 +201,7 @@ class Stream(BaseParser): if indices[:2] != (-1, -1): pos_errors.append(error) for r_idx, c_idx, text in indices: - table.cells[r_idx][c_idx].add_text(text) + table.cells[r_idx][c_idx].text = text accuracy = compute_accuracy([[100, pos_errors]]) data = table.data