camelot-py/camelot/utils.py

582 lines
15 KiB
Python

from __future__ import division
import os
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal, LTTextLineVertical
def translate(x1, x2):
    """Translates x2 by x1.

    Parameters
    ----------
    x1 : float
        Offset to add.
    x2 : float
        Value to be shifted.

    Returns
    -------
    x2 : float
        The shifted value, x2 + x1.

    """
    # Build a new value rather than using ``+=`` so that a mutable operand
    # (e.g. a numpy array) passed as x2 is never modified in place.
    return x2 + x1
def scale(x, s):
    """Scales x by scaling factor s.

    Parameters
    ----------
    x : float
        Value to be scaled.
    s : float
        Scaling factor.

    Returns
    -------
    x : float
        The scaled value, x * s.

    """
    # Build a new value rather than using ``*=`` so that a mutable operand
    # (e.g. a numpy array) passed as x is never modified in place.
    return x * s
def rotate(x1, y1, x2, y2, angle):
    """Rotates point x2, y2 about point x1, y1 by angle.

    Parameters
    ----------
    x1 : float
        x-coordinate of the pivot.
    y1 : float
        y-coordinate of the pivot.
    x2 : float
        x-coordinate of the point to rotate.
    y2 : float
        y-coordinate of the point to rotate.
    angle : float
        Angle in radians.

    Returns
    -------
    xnew : float
    ynew : float

    """
    sin_a, cos_a = np.sin(angle), np.cos(angle)
    # Express the point relative to the pivot.
    dx = translate(-x1, x2)
    dy = translate(-y1, y2)
    # Apply the 2-D rotation to the relative coordinates.
    rotated_x = cos_a * dx - sin_a * dy
    rotated_y = sin_a * dx + cos_a * dy
    # Shift the result back by the pivot.
    return translate(x1, rotated_x), translate(y1, rotated_y)
def scale_to_image(k, factors):
    """Translates and scales PDFMiner coordinates to OpenCV's coordinate
    space.

    Parameters
    ----------
    k : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate
        space.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
        first two elements are scaling factors and pdf_y is height of
        pdf.

    Returns
    -------
    knew : tuple
        Tuple (x1, y1, x2, y2) of ints representing table bounding box
        where (x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate
        space.

    """
    left, top, right, bottom = k
    scaling_factor_x, scaling_factor_y, pdf_y = factors

    def to_image_x(x):
        return scale(x, scaling_factor_x)

    def to_image_y(y):
        # Distance of y from pdf_y, then scaled.
        return scale(abs(translate(-pdf_y, y)), scaling_factor_y)

    converted = [to_image_x(left), to_image_y(top),
                 to_image_x(right), to_image_y(bottom)]
    # Truncate to integer coordinates.
    return tuple(int(value) for value in converted)
def scale_to_pdf(tables, v_segments, h_segments, factors):
    """Translates and scales OpenCV coordinates to PDFMiner's coordinate
    space.

    Parameters
    ----------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as their value.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
        first two elements are scaling factors and img_y is height of
        image.

    Returns
    -------
    tables_new : dict
        Dict with converted (x1, y1, x2, y2) boundaries as keys and a
        list of converted (x, y) intersections as values. A table
        without intersections maps to an empty list.
    v_segments_new : list
        List of converted vertical line segments (x1, y1, x2, y2).
    h_segments_new : list
        List of converted horizontal line segments (x1, y1, x2, y2).

    """
    scaling_factor_x, scaling_factor_y, img_y = factors

    def _to_pdf_x(x):
        return x * scaling_factor_x

    def _to_pdf_y(y):
        # Distance of y from img_y, then scaled.
        return abs(y - img_y) * scaling_factor_y

    def _to_pdf_box(box):
        # Converts an (x1, y1, x2, y2) tuple coordinate by coordinate.
        return (_to_pdf_x(box[0]), _to_pdf_y(box[1]),
                _to_pdf_x(box[2]), _to_pdf_y(box[3]))

    tables_new = {}
    for k, joints in tables.items():
        # Always store a real list: a bare zip() is a one-shot iterator on
        # Python 3, and unpacking zip(*joints) fails when joints is empty.
        tables_new[_to_pdf_box(k)] = [
            (_to_pdf_x(j_x), _to_pdf_y(j_y)) for j_x, j_y in joints]

    v_segments_new = [_to_pdf_box(v) for v in v_segments]
    h_segments_new = [_to_pdf_box(h) for h in h_segments]
    return tables_new, v_segments_new, h_segments_new
def get_rotation(ltchar, lttextlh=None, lttextlv=None):
    """Detects if text in table is vertical or not using the current
    transformation matrix (CTM) and returns its orientation.

    Parameters
    ----------
    ltchar : list
        List of PDFMiner LTChar objects.
    lttextlh : list
        List of PDFMiner LTTextLineHorizontal objects.
        (optional, default: None)
    lttextlv : list
        List of PDFMiner LTTextLineVertical objects.
        (optional, default: None)

    Returns
    -------
    rotation : string
        {'', 'left', 'right'}
        '' if text in table is upright, 'left' if rotated 90 degree
        anti-clockwise and 'right' if rotated 90 degree clockwise.
        Input without any non-blank text is reported as upright ('').

    """
    rotation = ''
    if lttextlh is not None and lttextlv is not None:
        # Text lines are available: compare horizontal and vertical counts.
        hlen = len([t for t in lttextlh if t.get_text().strip()])
        vlen = len([t for t in lttextlv if t.get_text().strip()])
        vger = 0.0
    else:
        # Fall back to per-character uprightness.
        hlen = len([t for t in ltchar if t.upright and t.get_text().strip()])
        vlen = len([t for t in ltchar if (not t.upright) and t.get_text().strip()])
        total = hlen + vlen
        # Guard against division by zero when there are no non-blank
        # characters at all.
        vger = vlen / float(total) if total else 0.0
    if hlen < vlen or vger > 0.8:
        # The signs of the CTM's off-diagonal entries give the direction.
        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
        rotation = 'left' if clockwise < anticlockwise else 'right'
    return rotation
def segments_bbox(bbox, v_segments, h_segments):
    """Returns all line segments present inside a
    table's bounding box.

    A tolerance of 2 units is allowed on every side of the box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.

    Returns
    -------
    v_s : list
        List of vertical line segments that lie inside table.
    h_s : list
        List of horizontal line segments that lie inside table.

    """
    left, bottom = bbox[0], bbox[1]
    right, top = bbox[2], bbox[3]

    v_s = []
    for seg in v_segments:
        # Both end points within the vertical extent, x within the width.
        if (seg[1] > bottom - 2 and seg[3] < top + 2 and
                left - 2 <= seg[0] <= right + 2):
            v_s.append(seg)

    h_s = []
    for seg in h_segments:
        # Both end points within the horizontal extent, y within the height.
        if (seg[0] > left - 2 and seg[2] < right + 2 and
                bottom - 2 <= seg[1] <= top + 2):
            h_s.append(seg)

    return v_s, h_s
def text_bbox(bbox, text):
    """Returns all text objects present inside a
    table's bounding box.

    A text object is kept when the centre of its own box lies inside the
    table's bounding box, with a tolerance of 2 units on every side.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
    text : list
        List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table.

    """
    left, bottom = bbox[0], bbox[1]
    right, top = bbox[2], bbox[3]

    t_bbox = []
    for obj in text:
        if not (left - 2 <= (obj.x0 + obj.x1) / 2.0 <= right + 2):
            continue
        if not (bottom - 2 <= (obj.y0 + obj.y1) / 2.0 <= top + 2):
            continue
        t_bbox.append(obj)
    return t_bbox
def remove_close_values(ar, mtol=2):
    """Removes values which are within a tolerance of mtol of another value
    present in list.

    Each value is compared with the most recently kept value only; the
    first value of a close run is the one that is kept.

    Parameters
    ----------
    ar : list
    mtol : int
        Absolute tolerance passed to numpy.isclose.
        (optional, default: 2)

    Returns
    -------
    ret : list

    """
    ret = []
    for value in ar:
        # Skip values that are close to the last one we kept.
        if ret and np.isclose(ret[-1], value, atol=mtol):
            continue
        ret.append(value)
    return ret
def merge_close_values(ar, mtol=2):
    """Merges values which are within a tolerance of mtol by calculating
    a moving mean.

    Each value is compared with the last merged value; when close, the two
    are replaced by their average.

    Parameters
    ----------
    ar : list
    mtol : int
        Absolute tolerance passed to numpy.isclose.
        (optional, default: 2)

    Returns
    -------
    ret : list

    """
    ret = []
    for value in ar:
        if ret and np.isclose(ret[-1], value, atol=mtol):
            # Fold the new value into the running mean of the current group.
            ret[-1] = (ret[-1] + value) / 2.0
        else:
            ret.append(value)
    return ret
def get_row_index(t, rows):
    """Gets index of the row in which the given text object lies by
    comparing their y-coordinates.

    The text object is assigned to the first row that contains the
    vertical midpoint of its box. The error is the fraction of the box's
    area that sticks out above and below that row.

    Parameters
    ----------
    t : object
        Object with x0, y0, x1 and y1 attributes.
    rows : list
        List of row coordinate tuples, sorted in decreasing order.
        Each tuple is (top, bottom) with top > bottom.

    Returns
    -------
    r : int
    error : float

    Notes
    -----
    Returns None when no row contains the midpoint of the text object.

    """
    mid_y = (t.y0 + t.y1) / 2.0
    # Use the box's real upper and lower edges. Comparing y0 with the row top
    # and y1 with the row bottom can never trigger for a box with y0 <= y1
    # whose midpoint is inside the row, which made the error always 0.
    t_top, t_bottom = max(t.y0, t.y1), min(t.y0, t.y1)
    for r, row in enumerate(rows):
        row_top, row_bottom = row[0], row[1]
        if mid_y < row_top and mid_y > row_bottom:
            offset1 = abs(t_top - row_top) if t_top > row_top else 0
            offset2 = abs(t_bottom - row_bottom) if t_bottom < row_bottom else 0
            # Degenerate (zero-width or zero-height) boxes count as size 1
            # to avoid dividing by zero.
            X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1)
            Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1)
            charea = X * Y
            error = (X * (offset1 + offset2)) / charea
            return r, error
    return None
def get_column_index(t, columns):
    """Gets index of the column in which the given text object lies by
    comparing their x-coordinates.

    The text object is assigned to the first column that contains the
    horizontal midpoint of its box. The error is the fraction of the box's
    area that sticks out to the left and right of that column.

    Parameters
    ----------
    t : object
        Object with x0, y0, x1 and y1 attributes.
    columns : list
        List of column coordinate tuples.

    Returns
    -------
    c : int
    error : float

    Notes
    -----
    Returns None when no column contains the midpoint of the text object.

    """
    for c, column in enumerate(columns):
        col_left, col_right = column[0], column[1]
        mid_x = (t.x0 + t.x1) / 2.0
        if not (mid_x > col_left and mid_x < col_right):
            continue
        # Overshoot of the box past the left and right column edges.
        left_over = abs(t.x0 - col_left) if t.x0 < col_left else 0
        right_over = abs(t.x1 - col_right) if t.x1 > col_right else 0
        # Degenerate (zero-width or zero-height) boxes count as size 1.
        width = abs(t.x0 - t.x1)
        height = abs(t.y0 - t.y1)
        X = 1.0 if width == 0.0 else width
        Y = 1.0 if height == 0.0 else height
        charea = X * Y
        error = (Y * (left_over + right_over)) / charea
        return c, error
    return None
def get_score(error_weights):
    """Calculates score based on weights assigned to various parameters,
    and their error percentages.

    Parameters
    ----------
    error_weights : list
        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
        where pn is the weight assigned to list of errors en.
        Sum of pn should be equal to 100.

    Returns
    -------
    score : float
        Weighted score. 0 is returned when any list of errors is empty,
        since its weight cannot be distributed over zero entries.

    Raises
    ------
    ValueError
        If the weights do not add up to 100.

    """
    SCORE_VAL = 100
    if sum(ew[0] for ew in error_weights) != SCORE_VAL:
        raise ValueError("Please assign a valid weightage to each parameter"
                         " such that their sum is equal to 100")
    score = 0
    for ew in error_weights:
        errors = ew[1]
        if len(errors) == 0:
            # Previously this raised ZeroDivisionError; nothing measurable
            # was found for this parameter, so report the lowest score.
            return 0
        # Spread the parameter's weight evenly over its error entries.
        weight = ew[0] / float(len(errors))
        for error_percentage in errors:
            score += weight * (1 - error_percentage)
    return score
def remove_empty(d):
    """Removes empty rows and columns from a two-dimensional list.

    A row is empty when every cell is the empty string; a column is empty
    when none of its cells is truthy. The input is left unmodified.

    Parameters
    ----------
    d : list

    Returns
    -------
    d : list
        List of rows, each row being a tuple of the remaining cells.

    """
    # Filter into a new list: popping from d while enumerating it skipped
    # the row following each removed one and mutated the caller's data.
    rows = [row for row in d if list(row) != [''] * len(row)]
    # Transpose, drop empty columns, and transpose back.
    columns = [list(col) for col in zip(*rows) if any(col)]
    # Materialise the result so a list is returned on Python 3 as well.
    return list(zip(*columns))
def count_empty(d):
    """Counts empty cells in a two-dimensional list.

    A cell is empty when it contains only whitespace.

    Parameters
    ----------
    d : list
        Two-dimensional list (or any iterable of rows) of strings.

    Returns
    -------
    empty_p : float
        Percentage of empty cells, relative to
        (number of rows) * (length of the first row). 0 when there are
        no cells.
    r_nempty_cells : list
        Number of non-empty cells in each row.
    c_nempty_cells : list
        Number of non-empty cells in each column.

    """
    # Materialise the rows so that one-shot iterables (e.g. a zip object)
    # can be traversed more than once.
    rows = [list(row) for row in d]

    n_empty = sum(1 for row in rows for cell in row if cell.strip() == '')
    n_cells = len(rows) * len(rows[0]) if rows else 0
    # Avoid IndexError / ZeroDivisionError on an empty table.
    empty_p = 100 * (n_empty / float(n_cells)) if n_cells else 0

    r_nempty_cells = [
        sum(1 for cell in row if cell.strip() != '') for row in rows]
    c_nempty_cells = [
        sum(1 for cell in col if cell.strip() != '') for col in zip(*rows)]
    return empty_p, r_nempty_cells, c_nempty_cells
def encode_list(ar):
    """Encodes list of text.

    Every cell of the two-dimensional list is encoded as UTF-8.

    Parameters
    ----------
    ar : list
        Two-dimensional list of text.

    Returns
    -------
    ar : list
        New two-dimensional list holding the encoded cells.

    """
    encoded_rows = []
    for row in ar:
        encoded_cells = []
        for cell in row:
            encoded_cells.append(cell.encode('utf-8'))
        encoded_rows.append(encoded_cells)
    return encoded_rows
def get_text_objects(layout, LTType="char", t=None):
    """Recursively parses pdf layout to get a list of
    text objects.

    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.
    LTType : string
        {'char', 'lh', 'lv'}
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
    t : list
        List to which the matching objects are appended. A new list is
        created when None.
        (optional, default: None)

    Returns
    -------
    t : list
        List of PDFMiner text objects.

    Raises
    ------
    ValueError
        If LTType is not one of 'char', 'lh' or 'lv'.

    """
    lt_types = {
        "char": LTChar,
        "lh": LTTextLineHorizontal,
        "lv": LTTextLineVertical,
    }
    try:
        LTObject = lt_types[LTType]
    except KeyError:
        # Fail early with a clear message instead of hitting an unbound
        # local variable later on.
        raise ValueError("LTType must be one of 'char', 'lh' or 'lv'")
    if t is None:
        t = []
    # Objects without an ``_objs`` attribute have no children to visit.
    for obj in getattr(layout, '_objs', ()):
        if isinstance(obj, LTObject):
            t.append(obj)
        else:
            # Descend into the child, accumulating into the same list.
            get_text_objects(obj, LTType=LTType, t=t)
    return t
def get_page_layout(pname, char_margin=2.0, line_margin=0.5, word_margin=0.1,
                    detect_vertical=True, all_texts=True):
    """Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. See https://euske.github.io/pdfminer/ to get definitions
    of kwargs.

    If the document holds more than one page, the layout of the last page
    processed is returned.

    Parameters
    ----------
    pname : string
        Path to pdf file.
    char_margin : float
    line_margin : float
    word_margin : float
    detect_vertical : bool
    all_texts : bool

    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        pdf page dimension of the form (width, height).

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document does not allow text extraction.
    ValueError
        If the document contains no pages.

    """
    # PDFs are binary files: text mode breaks parsing on Python 3 and can
    # corrupt the data through newline translation on some platforms.
    with open(pname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams(char_margin=char_margin,
                            line_margin=line_margin,
                            word_margin=word_margin,
                            detect_vertical=detect_vertical,
                            all_texts=all_texts)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        layout = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
        if layout is None:
            # Previously this surfaced as an unbound local variable error.
            raise ValueError("No pages found in {0}".format(pname))
        width = layout.bbox[2]
        height = layout.bbox[3]
        dim = (width, height)
    return layout, dim