Refactor core

pull/2/head
Vinayak Mehta 2018-09-06 07:42:41 +05:30
parent ffeb853c55
commit 557189da24
3 changed files with 97 additions and 142 deletions

View File

@ -16,64 +16,47 @@ class Cell(object):
self.lt = (x1, y2) self.lt = (x1, y2)
self.rb = (x2, y1) self.rb = (x2, y1)
self.rt = (x2, y2) self.rt = (x2, y2)
self.bbox = (x1, y1, x2, y2)
self.left = False self.left = False
self.right = False self.right = False
self.top = False self.top = False
self.bottom = False self.bottom = False
self.text_objects = [] self.hspan = False
self.text = '' self.vspan = False
self.spanning_h = False self._text = ''
self.spanning_v = False
def __repr__(self): def __repr__(self):
pass return '<Cell x1={} y1={} x2={} y2={}'.format(
self.x1, self.y1, self.x2, self.y2)
def add_text(self, text): @property
def text(self):
"""
Returns
-------
"""
return self._text
@text.setter
def text(self, t):
""" """
Parameters Parameters
---------- ----------
text t
""" """
self.text = ''.join([self.text, text]) self._text = ''.join([self._text, t])
def get_text(self): @property
def bound(self):
""" """
Returns Returns
------- -------
""" """
return self.text return self.top + self.bottom + self.left + self.right
def add_object(self, t_object):
"""
Parameters
----------
t_object
"""
self.text_objects.append(t_object)
def get_objects(self):
"""
Returns
-------
"""
return self.text_objects
def get_bounded_edges(self):
"""
Returns
-------
"""
self.bounded_edges = self.top + self.bottom + self.left + self.right
return self.bounded_edges
class Table(object): class Table(object):
@ -95,22 +78,7 @@ class Table(object):
def __repr__(self): def __repr__(self):
return '<{} shape={}>'.format(self.__class__.__name__, self._shape) return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
def set_all_edges(self): def set_border(self):
"""
Returns
-------
"""
for r in range(len(self.rows)):
for c in range(len(self.cols)):
self.cells[r][c].left = True
self.cells[r][c].right = True
self.cells[r][c].top = True
self.cells[r][c].bottom = True
return self
def set_border_edges(self):
""" """
Returns Returns
@ -125,6 +93,18 @@ class Table(object):
self.cells[len(self.rows) - 1][c].bottom = True self.cells[len(self.rows) - 1][c].bottom = True
return self return self
def set_all_edges(self):
"""
Returns
-------
"""
for row in self.cells:
for cell in row:
cell.left = cell.right = cell.top = cell.bottom = True
return self
def set_edges(self, vertical, horizontal, jtol=2): def set_edges(self, vertical, horizontal, jtol=2):
""" """
@ -140,7 +120,7 @@ class Table(object):
""" """
for v in vertical: for v in vertical:
# find closest x coord # find closest x coord
# iterate over y coords and find closest points # iterate over y coords and find closest start and end points
i = [i for i, t in enumerate(self.cols) i = [i for i, t in enumerate(self.cols)
if np.isclose(v[0], t[0], atol=jtol)] if np.isclose(v[0], t[0], atol=jtol)]
j = [j for j, t in enumerate(self.rows) j = [j for j, t in enumerate(self.rows)
@ -148,51 +128,50 @@ class Table(object):
k = [k for k, t in enumerate(self.rows) k = [k for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=jtol)] if np.isclose(v[1], t[0], atol=jtol)]
if not j: if not j:
self.nocont_ += 1
continue continue
J = j[0] J = j[0]
if i == [0]: # only left edge if i == [0]: # only left edge
I = i[0] L = i[0]
if k: if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][L].left = True
J += 1 J += 1
else: else:
K = len(self.rows) K = len(self.rows)
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][L].left = True
J += 1 J += 1
elif i == []: # only right edge elif i == []: # only right edge
I = len(self.cols) - 1 L = len(self.cols) - 1
if k: if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[J][I].right = True self.cells[J][L].right = True
J += 1 J += 1
else: else:
K = len(self.rows) K = len(self.rows)
while J < K: while J < K:
self.cells[J][I].right = True self.cells[J][L].right = True
J += 1 J += 1
else: # both left and right edges else: # both left and right edges
I = i[0] L = i[0]
if k: if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][L].left = True
self.cells[J][I - 1].right = True self.cells[J][L - 1].right = True
J += 1 J += 1
else: else:
K = len(self.rows) K = len(self.rows)
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][L].left = True
self.cells[J][I - 1].right = True self.cells[J][L - 1].right = True
J += 1 J += 1
for h in horizontal: for h in horizontal:
# find closest y coord # find closest y coord
# iterate over x coords and find closest points # iterate over x coords and find closest start and end points
i = [i for i, t in enumerate(self.rows) i = [i for i, t in enumerate(self.rows)
if np.isclose(h[1], t[0], atol=jtol)] if np.isclose(h[1], t[0], atol=jtol)]
j = [j for j, t in enumerate(self.cols) j = [j for j, t in enumerate(self.cols)
@ -200,93 +179,78 @@ class Table(object):
k = [k for k, t in enumerate(self.cols) k = [k for k, t in enumerate(self.cols)
if np.isclose(h[2], t[0], atol=jtol)] if np.isclose(h[2], t[0], atol=jtol)]
if not j: if not j:
self.nocont_ += 1
continue continue
J = j[0] J = j[0]
if i == [0]: # only top edge if i == [0]: # only top edge
I = i[0] L = i[0]
if k: if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[I][J].top = True self.cells[L][J].top = True
J += 1 J += 1
else: else:
K = len(self.cols) K = len(self.cols)
while J < K: while J < K:
self.cells[I][J].top = True self.cells[L][J].top = True
J += 1 J += 1
elif i == []: # only bottom edge elif i == []: # only bottom edge
I = len(self.rows) - 1 I = len(self.rows) - 1
if k: if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[I][J].bottom = True self.cells[L][J].bottom = True
J += 1 J += 1
else: else:
K = len(self.cols) K = len(self.cols)
while J < K: while J < K:
self.cells[I][J].bottom = True self.cells[L][J].bottom = True
J += 1 J += 1
else: # both top and bottom edges else: # both top and bottom edges
I = i[0] L = i[0]
if k: if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[I][J].top = True self.cells[L][J].top = True
self.cells[I - 1][J].bottom = True self.cells[L - 1][J].bottom = True
J += 1 J += 1
else: else:
K = len(self.cols) K = len(self.cols)
while J < K: while J < K:
self.cells[I][J].top = True self.cells[L][J].top = True
self.cells[I - 1][J].bottom = True self.cells[L - 1][J].bottom = True
J += 1 J += 1
return self return self
def set_spanning(self): def set_span(self):
""" """
Returns Returns
------- -------
""" """
for r in range(len(self.rows)): for row in self.cells:
for c in range(len(self.cols)): for cell in row:
bound = self.cells[r][c].get_bounded_edges() left = cell.left
if bound == 4: right = cell.right
top = cell.top
bottom = cell.bottom
if cell.bound == 4:
continue continue
elif bound == 3: elif cell.bound == 3:
if not self.cells[r][c].left: if not left and (right and top and bottom):
if (self.cells[r][c].right and cell.hspan = True
self.cells[r][c].top and elif not right and (left and top and bottom):
self.cells[r][c].bottom): cell.hspan = True
self.cells[r][c].spanning_h = True elif not top and (left and right and bottom):
elif not self.cells[r][c].right: cell.vspan = True
if (self.cells[r][c].left and elif not bottom and (left and right and top):
self.cells[r][c].top and cell.vspan = True
self.cells[r][c].bottom): elif cell.bound == 2:
self.cells[r][c].spanning_h = True if left and right and (not top and not bottom):
elif not self.cells[r][c].top: cell.vspan = True
if (self.cells[r][c].left and elif top and bottom and (not left and not right):
self.cells[r][c].right and cell.hspan = True
self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
elif not self.cells[r][c].bottom:
if (self.cells[r][c].left and
self.cells[r][c].right and
self.cells[r][c].top):
self.cells[r][c].spanning_v = True
elif bound == 2:
if self.cells[r][c].left and self.cells[r][c].right:
if (not self.cells[r][c].top and
not self.cells[r][c].bottom):
self.cells[r][c].spanning_v = True
elif self.cells[r][c].top and self.cells[r][c].bottom:
if (not self.cells[r][c].left and
not self.cells[r][c].right):
self.cells[r][c].spanning_h = True
return self return self
@property @property
@ -298,9 +262,8 @@ class Table(object):
""" """
d = [] d = []
for r in range(len(self.rows)): for row in self.cells:
d.append([self.cells[r][c].get_text().strip() d.append([cell.text.strip() for cell in row])
for c in range(len(self.cols))])
return d return d
@property @property
@ -405,7 +368,7 @@ class Table(object):
return report return report
class TableList(list): class TableList(object):
""" """
""" """

View File

@ -1,10 +1,7 @@
from __future__ import division from __future__ import division
import os import os
import sys
import copy import copy
import types
import logging import logging
import copy_reg
import warnings import warnings
import subprocess import subprocess
@ -52,19 +49,19 @@ class Lattice(BaseParser):
for r_idx, c_idx, text in idx: for r_idx, c_idx, text in idx:
for d in shift_text: for d in shift_text:
if d == 'l': if d == 'l':
if t.cells[r_idx][c_idx].spanning_h: if t.cells[r_idx][c_idx].hspan:
while not t.cells[r_idx][c_idx].left: while not t.cells[r_idx][c_idx].left:
c_idx -= 1 c_idx -= 1
if d == 'r': if d == 'r':
if t.cells[r_idx][c_idx].spanning_h: if t.cells[r_idx][c_idx].hspan:
while not t.cells[r_idx][c_idx].right: while not t.cells[r_idx][c_idx].right:
c_idx += 1 c_idx += 1
if d == 't': if d == 't':
if t.cells[r_idx][c_idx].spanning_v: if t.cells[r_idx][c_idx].vspan:
while not t.cells[r_idx][c_idx].top: while not t.cells[r_idx][c_idx].top:
r_idx -= 1 r_idx -= 1
if d == 'b': if d == 'b':
if t.cells[r_idx][c_idx].spanning_v: if t.cells[r_idx][c_idx].vspan:
while not t.cells[r_idx][c_idx].bottom: while not t.cells[r_idx][c_idx].bottom:
r_idx += 1 r_idx += 1
indices.append((r_idx, c_idx, text)) indices.append((r_idx, c_idx, text))
@ -76,15 +73,15 @@ class Lattice(BaseParser):
if f == "h": if f == "h":
for i in range(len(t.cells)): for i in range(len(t.cells)):
for j in range(len(t.cells[i])): for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '': if t.cells[i][j].text.strip() == '':
if t.cells[i][j].spanning_h and not t.cells[i][j].left: if t.cells[i][j].hspan and not t.cells[i][j].left:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) t.cells[i][j].text = t.cells[i][j - 1].text
elif f == "v": elif f == "v":
for i in range(len(t.cells)): for i in range(len(t.cells)):
for j in range(len(t.cells[i])): for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '': if t.cells[i][j].text.strip() == '':
if t.cells[i][j].spanning_v and not t.cells[i][j].top: if t.cells[i][j].vspan and not t.cells[i][j].top:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) t.cells[i][j].text = t.cells[i - 1][j].text
return t return t
def _generate_image(self): def _generate_image(self):
@ -173,9 +170,9 @@ class Lattice(BaseParser):
# set table edges to True using ver+hor lines # set table edges to True using ver+hor lines
table = table.set_edges(v_s, h_s, jtol=self.jtol) table = table.set_edges(v_s, h_s, jtol=self.jtol)
# set spanning cells to True # set spanning cells to True
table = table.set_spanning() table = table.set_span()
# set table border edges to True # set table border edges to True
table = table.set_border_edges() table = table.set_border()
pos_errors = [] pos_errors = []
for direction in self.t_bbox: for direction in self.t_bbox:
@ -187,7 +184,7 @@ class Lattice(BaseParser):
pos_errors.append(error) pos_errors.append(error)
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text) indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].add_text(text) table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]]) accuracy = compute_accuracy([[100, pos_errors]])
if self.fill is not None: if self.fill is not None:

View File

@ -1,12 +1,7 @@
from __future__ import division from __future__ import division
import os import os
import sys
import copy
import types
import logging import logging
import copy_reg
import warnings import warnings
import subprocess
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -206,7 +201,7 @@ class Stream(BaseParser):
if indices[:2] != (-1, -1): if indices[:2] != (-1, -1):
pos_errors.append(error) pos_errors.append(error)
for r_idx, c_idx, text in indices: for r_idx, c_idx, text in indices:
table.cells[r_idx][c_idx].add_text(text) table.cells[r_idx][c_idx].text = text
accuracy = compute_accuracy([[100, pos_errors]]) accuracy = compute_accuracy([[100, pos_errors]])
data = table.data data = table.data