camelot-py/camelot/utils.py

400 lines
9.0 KiB
Python

import numpy as np
def translate(x1, x2):
"""Translates x2 by x1.
Parameters
----------
x1 : float
x2 : float
Returns
-------
x2 : float
"""
x2 += x1
return x2
def scale(x, s):
"""Scales x by scaling factor s.
Parameters
----------
x : float
s : float
Returns
-------
x : float
"""
x *= s
return x
def rotate(x1, y1, x2, y2, angle):
"""Rotates point x2, y2 about point x1, y1 by angle.
Parameters
----------
x1 : float
y1 : float
x2 : float
y2 : float
angle : float
Angle in radians.
Returns
-------
xnew : float
ynew : float
"""
s = np.sin(angle)
c = np.cos(angle)
x2 = translate(-x1, x2)
y2 = translate(-y1, y2)
xnew = c * x2 - s * y2
ynew = s * x2 + c * y2
xnew = translate(x1, xnew)
ynew = translate(y1, ynew)
return xnew, ynew
def transform(tables, v_segments, h_segments, factors):
"""Translates and scales OpenCV coordinates to PDFMiner coordinate
space.
Parameters
----------
tables : dict
v_segments : list
h_segments : list
factors : tuple
Returns
-------
tables_new : dict
v_segments_new : dict
h_segments_new : dict
"""
scaling_factor_x, scaling_factor_y, img_y = factors
tables_new = {}
for k in tables.keys():
x1, y1, x2, y2 = k
x1 = scale(x1, scaling_factor_x)
y1 = scale(abs(translate(-img_y, y1)), scaling_factor_y)
x2 = scale(x2, scaling_factor_x)
y2 = scale(abs(translate(-img_y, y2)), scaling_factor_y)
j_x, j_y = zip(*tables[k])
j_x = [scale(j, scaling_factor_x) for j in j_x]
j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
joints = zip(j_x, j_y)
tables_new[(x1, y1, x2, y2)] = joints
v_segments_new = []
for v in v_segments:
x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale(
abs(translate(-img_y, v[3])), scaling_factor_y)
v_segments_new.append((x1, y1, x2, y2))
h_segments_new = []
for h in h_segments:
x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale(
abs(translate(-img_y, h[3])), scaling_factor_y)
h_segments_new.append((x1, y1, x2, y2))
return tables_new, v_segments_new, h_segments_new
def detect_vertical(text):
"""Detects if text in table is vertical or not and returns
its orientation.
Parameters
----------
text : list
Returns
-------
rotated : string
"""
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
num_h = [t for t in text if t.upright and t.get_text().strip()]
vger = len(num_v) / float(len(num_v) + len(num_h))
rotated = ''
if vger > 0.8:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text)
rotated = 'left' if clockwise < anticlockwise else 'right'
return rotated
def elements_bbox(bbox, text, v_segments, h_segments):
"""Returns all text objects and line segments present inside a
table's bounding box.
Parameters
----------
bbox : tuple
text : list
v_segments : list
h_segments : list
Returns
-------
text_bbox : list
v_s : list
h_s : list
"""
lb = (bbox[0], bbox[1])
rt = (bbox[2], bbox[3])
text_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
<= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
<= rt[1] + 2]
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and
h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
return text_bbox, v_s, h_s
def remove_close_values(ar, mtol=2):
"""Removes values which are within a tolerance of mtol of another value
present in list.
Parameters
----------
ar : list
mtol : int
(optional, default: 2)
Returns
-------
ret : list
"""
ret = []
for a in ar:
if not ret:
ret.append(a)
else:
temp = ret[-1]
if np.isclose(temp, a, atol=mtol):
pass
else:
ret.append(a)
return ret
def merge_close_values(ar, mtol=2):
"""Merges values which are within a tolerance of mtol by calculating
a moving mean.
Parameters
----------
ar : list
mtol : int
(optional, default: 2)
Returns
-------
ret : list
"""
ret = []
for a in ar:
if not ret:
ret.append(a)
else:
temp = ret[-1]
if np.isclose(temp, a, atol=mtol):
temp = (temp + a) / 2.0
ret[-1] = temp
else:
ret.append(a)
return ret
def get_row_index(t, rows):
"""Gets index of the row in which the given object falls by
comparing their co-ordinates.
Parameters
----------
t : object
rows : list
Returns
-------
r : int
"""
for r in range(len(rows)):
if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
return r
def get_column_index(t, columns):
"""Gets index of the column in which the given object falls by
comparing their co-ordinates.
Parameters
----------
t : object
columns : list
Returns
-------
c : int
"""
for c in range(len(columns)):
if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
return c
def reduce_index(t, rotated, r_idx, c_idx):
"""Reduces index of a text object if it lies within a spanning
cell taking in account table rotation.
Parameters
----------
t : object
rotated : string
r_idx : int
c_idx : int
Returns
-------
r_idx : int
c_idx : int
"""
if not rotated:
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].left:
c_idx -= 1
if t.cells[r_idx][c_idx].spanning_v:
while not t.cells[r_idx][c_idx].top:
r_idx -= 1
elif rotated == 'left':
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].left:
c_idx -= 1
if t.cells[r_idx][c_idx].spanning_v:
while not t.cells[r_idx][c_idx].bottom:
r_idx += 1
elif rotated == 'right':
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].right:
c_idx += 1
if t.cells[r_idx][c_idx].spanning_v:
while not t.cells[r_idx][c_idx].top:
r_idx -= 1
return r_idx, c_idx
def outline(t):
"""Sets table border edges to True.
Parameters
----------
t : object
Returns
-------
t : object
"""
for i in range(len(t.cells)):
t.cells[i][0].left = True
t.cells[i][len(t.cells[i]) - 1].right = True
for i in range(len(t.cells[0])):
t.cells[0][i].top = True
t.cells[len(t.cells) - 1][i].bottom = True
return t
def fill_spanning(t, fill=None):
"""Fills spanning cells.
Parameters
----------
t : object
f : string
(optional, default: None)
Returns
-------
t : object
"""
if fill == "h":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_h:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif fill == "v":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
elif fill == "hv":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_h:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
return t
def remove_empty(d):
"""Removes empty rows and columns from list of lists.
Parameters
----------
d : list
Returns
-------
d : list
"""
for i, row in enumerate(d):
if row == [''] * len(row):
d.pop(i)
d = zip(*d)
d = [list(row) for row in d if any(row)]
d = zip(*d)
return d
def encode_list(ar):
ar = [[r.encode('utf-8') for r in row] for row in ar]
return ar