Refactor core
parent
ffeb853c55
commit
557189da24
203
camelot/core.py
203
camelot/core.py
|
|
@ -16,64 +16,47 @@ class Cell(object):
|
||||||
self.lt = (x1, y2)
|
self.lt = (x1, y2)
|
||||||
self.rb = (x2, y1)
|
self.rb = (x2, y1)
|
||||||
self.rt = (x2, y2)
|
self.rt = (x2, y2)
|
||||||
self.bbox = (x1, y1, x2, y2)
|
|
||||||
self.left = False
|
self.left = False
|
||||||
self.right = False
|
self.right = False
|
||||||
self.top = False
|
self.top = False
|
||||||
self.bottom = False
|
self.bottom = False
|
||||||
self.text_objects = []
|
self.hspan = False
|
||||||
self.text = ''
|
self.vspan = False
|
||||||
self.spanning_h = False
|
self._text = ''
|
||||||
self.spanning_v = False
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
pass
|
return '<Cell x1={} y1={} x2={} y2={}'.format(
|
||||||
|
self.x1, self.y1, self.x2, self.y2)
|
||||||
|
|
||||||
def add_text(self, text):
|
@property
|
||||||
|
def text(self):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
|
||||||
|
"""
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
@text.setter
|
||||||
|
def text(self, t):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
text
|
t
|
||||||
"""
|
"""
|
||||||
self.text = ''.join([self.text, text])
|
self._text = ''.join([self._text, t])
|
||||||
|
|
||||||
def get_text(self):
|
@property
|
||||||
|
def bound(self):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return self.text
|
return self.top + self.bottom + self.left + self.right
|
||||||
|
|
||||||
def add_object(self, t_object):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
t_object
|
|
||||||
"""
|
|
||||||
self.text_objects.append(t_object)
|
|
||||||
|
|
||||||
def get_objects(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
return self.text_objects
|
|
||||||
|
|
||||||
def get_bounded_edges(self):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
self.bounded_edges = self.top + self.bottom + self.left + self.right
|
|
||||||
return self.bounded_edges
|
|
||||||
|
|
||||||
|
|
||||||
class Table(object):
|
class Table(object):
|
||||||
|
|
@ -95,22 +78,7 @@ class Table(object):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
|
return '<{} shape={}>'.format(self.__class__.__name__, self._shape)
|
||||||
|
|
||||||
def set_all_edges(self):
|
def set_border(self):
|
||||||
"""
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
|
|
||||||
"""
|
|
||||||
for r in range(len(self.rows)):
|
|
||||||
for c in range(len(self.cols)):
|
|
||||||
self.cells[r][c].left = True
|
|
||||||
self.cells[r][c].right = True
|
|
||||||
self.cells[r][c].top = True
|
|
||||||
self.cells[r][c].bottom = True
|
|
||||||
return self
|
|
||||||
|
|
||||||
def set_border_edges(self):
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
|
|
@ -125,6 +93,18 @@ class Table(object):
|
||||||
self.cells[len(self.rows) - 1][c].bottom = True
|
self.cells[len(self.rows) - 1][c].bottom = True
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def set_all_edges(self):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
|
||||||
|
"""
|
||||||
|
for row in self.cells:
|
||||||
|
for cell in row:
|
||||||
|
cell.left = cell.right = cell.top = cell.bottom = True
|
||||||
|
return self
|
||||||
|
|
||||||
def set_edges(self, vertical, horizontal, jtol=2):
|
def set_edges(self, vertical, horizontal, jtol=2):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -140,7 +120,7 @@ class Table(object):
|
||||||
"""
|
"""
|
||||||
for v in vertical:
|
for v in vertical:
|
||||||
# find closest x coord
|
# find closest x coord
|
||||||
# iterate over y coords and find closest points
|
# iterate over y coords and find closest start and end points
|
||||||
i = [i for i, t in enumerate(self.cols)
|
i = [i for i, t in enumerate(self.cols)
|
||||||
if np.isclose(v[0], t[0], atol=jtol)]
|
if np.isclose(v[0], t[0], atol=jtol)]
|
||||||
j = [j for j, t in enumerate(self.rows)
|
j = [j for j, t in enumerate(self.rows)
|
||||||
|
|
@ -148,51 +128,50 @@ class Table(object):
|
||||||
k = [k for k, t in enumerate(self.rows)
|
k = [k for k, t in enumerate(self.rows)
|
||||||
if np.isclose(v[1], t[0], atol=jtol)]
|
if np.isclose(v[1], t[0], atol=jtol)]
|
||||||
if not j:
|
if not j:
|
||||||
self.nocont_ += 1
|
|
||||||
continue
|
continue
|
||||||
J = j[0]
|
J = j[0]
|
||||||
if i == [0]: # only left edge
|
if i == [0]: # only left edge
|
||||||
I = i[0]
|
L = i[0]
|
||||||
if k:
|
if k:
|
||||||
K = k[0]
|
K = k[0]
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[J][I].left = True
|
self.cells[J][L].left = True
|
||||||
J += 1
|
J += 1
|
||||||
else:
|
else:
|
||||||
K = len(self.rows)
|
K = len(self.rows)
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[J][I].left = True
|
self.cells[J][L].left = True
|
||||||
J += 1
|
J += 1
|
||||||
elif i == []: # only right edge
|
elif i == []: # only right edge
|
||||||
I = len(self.cols) - 1
|
L = len(self.cols) - 1
|
||||||
if k:
|
if k:
|
||||||
K = k[0]
|
K = k[0]
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[J][I].right = True
|
self.cells[J][L].right = True
|
||||||
J += 1
|
J += 1
|
||||||
else:
|
else:
|
||||||
K = len(self.rows)
|
K = len(self.rows)
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[J][I].right = True
|
self.cells[J][L].right = True
|
||||||
J += 1
|
J += 1
|
||||||
else: # both left and right edges
|
else: # both left and right edges
|
||||||
I = i[0]
|
L = i[0]
|
||||||
if k:
|
if k:
|
||||||
K = k[0]
|
K = k[0]
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[J][I].left = True
|
self.cells[J][L].left = True
|
||||||
self.cells[J][I - 1].right = True
|
self.cells[J][L - 1].right = True
|
||||||
J += 1
|
J += 1
|
||||||
else:
|
else:
|
||||||
K = len(self.rows)
|
K = len(self.rows)
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[J][I].left = True
|
self.cells[J][L].left = True
|
||||||
self.cells[J][I - 1].right = True
|
self.cells[J][L - 1].right = True
|
||||||
J += 1
|
J += 1
|
||||||
|
|
||||||
for h in horizontal:
|
for h in horizontal:
|
||||||
# find closest y coord
|
# find closest y coord
|
||||||
# iterate over x coords and find closest points
|
# iterate over x coords and find closest start and end points
|
||||||
i = [i for i, t in enumerate(self.rows)
|
i = [i for i, t in enumerate(self.rows)
|
||||||
if np.isclose(h[1], t[0], atol=jtol)]
|
if np.isclose(h[1], t[0], atol=jtol)]
|
||||||
j = [j for j, t in enumerate(self.cols)
|
j = [j for j, t in enumerate(self.cols)
|
||||||
|
|
@ -200,93 +179,78 @@ class Table(object):
|
||||||
k = [k for k, t in enumerate(self.cols)
|
k = [k for k, t in enumerate(self.cols)
|
||||||
if np.isclose(h[2], t[0], atol=jtol)]
|
if np.isclose(h[2], t[0], atol=jtol)]
|
||||||
if not j:
|
if not j:
|
||||||
self.nocont_ += 1
|
|
||||||
continue
|
continue
|
||||||
J = j[0]
|
J = j[0]
|
||||||
if i == [0]: # only top edge
|
if i == [0]: # only top edge
|
||||||
I = i[0]
|
L = i[0]
|
||||||
if k:
|
if k:
|
||||||
K = k[0]
|
K = k[0]
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[I][J].top = True
|
self.cells[L][J].top = True
|
||||||
J += 1
|
J += 1
|
||||||
else:
|
else:
|
||||||
K = len(self.cols)
|
K = len(self.cols)
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[I][J].top = True
|
self.cells[L][J].top = True
|
||||||
J += 1
|
J += 1
|
||||||
elif i == []: # only bottom edge
|
elif i == []: # only bottom edge
|
||||||
I = len(self.rows) - 1
|
I = len(self.rows) - 1
|
||||||
if k:
|
if k:
|
||||||
K = k[0]
|
K = k[0]
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[I][J].bottom = True
|
self.cells[L][J].bottom = True
|
||||||
J += 1
|
J += 1
|
||||||
else:
|
else:
|
||||||
K = len(self.cols)
|
K = len(self.cols)
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[I][J].bottom = True
|
self.cells[L][J].bottom = True
|
||||||
J += 1
|
J += 1
|
||||||
else: # both top and bottom edges
|
else: # both top and bottom edges
|
||||||
I = i[0]
|
L = i[0]
|
||||||
if k:
|
if k:
|
||||||
K = k[0]
|
K = k[0]
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[I][J].top = True
|
self.cells[L][J].top = True
|
||||||
self.cells[I - 1][J].bottom = True
|
self.cells[L - 1][J].bottom = True
|
||||||
J += 1
|
J += 1
|
||||||
else:
|
else:
|
||||||
K = len(self.cols)
|
K = len(self.cols)
|
||||||
while J < K:
|
while J < K:
|
||||||
self.cells[I][J].top = True
|
self.cells[L][J].top = True
|
||||||
self.cells[I - 1][J].bottom = True
|
self.cells[L - 1][J].bottom = True
|
||||||
J += 1
|
J += 1
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_spanning(self):
|
def set_span(self):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for r in range(len(self.rows)):
|
for row in self.cells:
|
||||||
for c in range(len(self.cols)):
|
for cell in row:
|
||||||
bound = self.cells[r][c].get_bounded_edges()
|
left = cell.left
|
||||||
if bound == 4:
|
right = cell.right
|
||||||
|
top = cell.top
|
||||||
|
bottom = cell.bottom
|
||||||
|
if cell.bound == 4:
|
||||||
continue
|
continue
|
||||||
elif bound == 3:
|
elif cell.bound == 3:
|
||||||
if not self.cells[r][c].left:
|
if not left and (right and top and bottom):
|
||||||
if (self.cells[r][c].right and
|
cell.hspan = True
|
||||||
self.cells[r][c].top and
|
elif not right and (left and top and bottom):
|
||||||
self.cells[r][c].bottom):
|
cell.hspan = True
|
||||||
self.cells[r][c].spanning_h = True
|
elif not top and (left and right and bottom):
|
||||||
elif not self.cells[r][c].right:
|
cell.vspan = True
|
||||||
if (self.cells[r][c].left and
|
elif not bottom and (left and right and top):
|
||||||
self.cells[r][c].top and
|
cell.vspan = True
|
||||||
self.cells[r][c].bottom):
|
elif cell.bound == 2:
|
||||||
self.cells[r][c].spanning_h = True
|
if left and right and (not top and not bottom):
|
||||||
elif not self.cells[r][c].top:
|
cell.vspan = True
|
||||||
if (self.cells[r][c].left and
|
elif top and bottom and (not left and not right):
|
||||||
self.cells[r][c].right and
|
cell.hspan = True
|
||||||
self.cells[r][c].bottom):
|
|
||||||
self.cells[r][c].spanning_v = True
|
|
||||||
elif not self.cells[r][c].bottom:
|
|
||||||
if (self.cells[r][c].left and
|
|
||||||
self.cells[r][c].right and
|
|
||||||
self.cells[r][c].top):
|
|
||||||
self.cells[r][c].spanning_v = True
|
|
||||||
elif bound == 2:
|
|
||||||
if self.cells[r][c].left and self.cells[r][c].right:
|
|
||||||
if (not self.cells[r][c].top and
|
|
||||||
not self.cells[r][c].bottom):
|
|
||||||
self.cells[r][c].spanning_v = True
|
|
||||||
elif self.cells[r][c].top and self.cells[r][c].bottom:
|
|
||||||
if (not self.cells[r][c].left and
|
|
||||||
not self.cells[r][c].right):
|
|
||||||
self.cells[r][c].spanning_h = True
|
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -298,9 +262,8 @@ class Table(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
d = []
|
d = []
|
||||||
for r in range(len(self.rows)):
|
for row in self.cells:
|
||||||
d.append([self.cells[r][c].get_text().strip()
|
d.append([cell.text.strip() for cell in row])
|
||||||
for c in range(len(self.cols))])
|
|
||||||
return d
|
return d
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -405,7 +368,7 @@ class Table(object):
|
||||||
return report
|
return report
|
||||||
|
|
||||||
|
|
||||||
class TableList(list):
|
class TableList(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,7 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import copy
|
import copy
|
||||||
import types
|
|
||||||
import logging
|
import logging
|
||||||
import copy_reg
|
|
||||||
import warnings
|
import warnings
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
|
@ -52,19 +49,19 @@ class Lattice(BaseParser):
|
||||||
for r_idx, c_idx, text in idx:
|
for r_idx, c_idx, text in idx:
|
||||||
for d in shift_text:
|
for d in shift_text:
|
||||||
if d == 'l':
|
if d == 'l':
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
if t.cells[r_idx][c_idx].hspan:
|
||||||
while not t.cells[r_idx][c_idx].left:
|
while not t.cells[r_idx][c_idx].left:
|
||||||
c_idx -= 1
|
c_idx -= 1
|
||||||
if d == 'r':
|
if d == 'r':
|
||||||
if t.cells[r_idx][c_idx].spanning_h:
|
if t.cells[r_idx][c_idx].hspan:
|
||||||
while not t.cells[r_idx][c_idx].right:
|
while not t.cells[r_idx][c_idx].right:
|
||||||
c_idx += 1
|
c_idx += 1
|
||||||
if d == 't':
|
if d == 't':
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
if t.cells[r_idx][c_idx].vspan:
|
||||||
while not t.cells[r_idx][c_idx].top:
|
while not t.cells[r_idx][c_idx].top:
|
||||||
r_idx -= 1
|
r_idx -= 1
|
||||||
if d == 'b':
|
if d == 'b':
|
||||||
if t.cells[r_idx][c_idx].spanning_v:
|
if t.cells[r_idx][c_idx].vspan:
|
||||||
while not t.cells[r_idx][c_idx].bottom:
|
while not t.cells[r_idx][c_idx].bottom:
|
||||||
r_idx += 1
|
r_idx += 1
|
||||||
indices.append((r_idx, c_idx, text))
|
indices.append((r_idx, c_idx, text))
|
||||||
|
|
@ -76,15 +73,15 @@ class Lattice(BaseParser):
|
||||||
if f == "h":
|
if f == "h":
|
||||||
for i in range(len(t.cells)):
|
for i in range(len(t.cells)):
|
||||||
for j in range(len(t.cells[i])):
|
for j in range(len(t.cells[i])):
|
||||||
if t.cells[i][j].get_text().strip() == '':
|
if t.cells[i][j].text.strip() == '':
|
||||||
if t.cells[i][j].spanning_h and not t.cells[i][j].left:
|
if t.cells[i][j].hspan and not t.cells[i][j].left:
|
||||||
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
|
t.cells[i][j].text = t.cells[i][j - 1].text
|
||||||
elif f == "v":
|
elif f == "v":
|
||||||
for i in range(len(t.cells)):
|
for i in range(len(t.cells)):
|
||||||
for j in range(len(t.cells[i])):
|
for j in range(len(t.cells[i])):
|
||||||
if t.cells[i][j].get_text().strip() == '':
|
if t.cells[i][j].text.strip() == '':
|
||||||
if t.cells[i][j].spanning_v and not t.cells[i][j].top:
|
if t.cells[i][j].vspan and not t.cells[i][j].top:
|
||||||
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
|
t.cells[i][j].text = t.cells[i - 1][j].text
|
||||||
return t
|
return t
|
||||||
|
|
||||||
def _generate_image(self):
|
def _generate_image(self):
|
||||||
|
|
@ -173,9 +170,9 @@ class Lattice(BaseParser):
|
||||||
# set table edges to True using ver+hor lines
|
# set table edges to True using ver+hor lines
|
||||||
table = table.set_edges(v_s, h_s, jtol=self.jtol)
|
table = table.set_edges(v_s, h_s, jtol=self.jtol)
|
||||||
# set spanning cells to True
|
# set spanning cells to True
|
||||||
table = table.set_spanning()
|
table = table.set_span()
|
||||||
# set table border edges to True
|
# set table border edges to True
|
||||||
table = table.set_border_edges()
|
table = table.set_border()
|
||||||
|
|
||||||
pos_errors = []
|
pos_errors = []
|
||||||
for direction in self.t_bbox:
|
for direction in self.t_bbox:
|
||||||
|
|
@ -187,7 +184,7 @@ class Lattice(BaseParser):
|
||||||
pos_errors.append(error)
|
pos_errors.append(error)
|
||||||
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
|
indices = Lattice._reduce_index(table, indices, shift_text=self.shift_text)
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
table.cells[r_idx][c_idx].add_text(text)
|
table.cells[r_idx][c_idx].text = text
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
if self.fill is not None:
|
if self.fill is not None:
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,7 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import copy
|
|
||||||
import types
|
|
||||||
import logging
|
import logging
|
||||||
import copy_reg
|
|
||||||
import warnings
|
import warnings
|
||||||
import subprocess
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
@ -206,7 +201,7 @@ class Stream(BaseParser):
|
||||||
if indices[:2] != (-1, -1):
|
if indices[:2] != (-1, -1):
|
||||||
pos_errors.append(error)
|
pos_errors.append(error)
|
||||||
for r_idx, c_idx, text in indices:
|
for r_idx, c_idx, text in indices:
|
||||||
table.cells[r_idx][c_idx].add_text(text)
|
table.cells[r_idx][c_idx].text = text
|
||||||
accuracy = compute_accuracy([[100, pos_errors]])
|
accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
data = table.data
|
data = table.data
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue