Add deepcopy and debug scripts
parent
4dd0d2330e
commit
84d354ba10
|
|
@ -2,7 +2,7 @@ import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def adaptive_threshold(imagename, invert=False):
|
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
||||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -15,6 +15,15 @@ def adaptive_threshold(imagename, invert=False):
|
||||||
tables with lines in background.
|
tables with lines in background.
|
||||||
(optional, default: False)
|
(optional, default: False)
|
||||||
|
|
||||||
|
blocksize: int
|
||||||
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
|
||||||
|
c: float
|
||||||
|
Constant subtracted from the mean or weighted mean
|
||||||
|
(see the details below). Normally, it is positive but may be
|
||||||
|
zero or negative as well.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
img : object
|
img : object
|
||||||
|
|
@ -27,14 +36,11 @@ def adaptive_threshold(imagename, invert=False):
|
||||||
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
||||||
|
|
||||||
if invert:
|
if invert:
|
||||||
threshold = cv2.adaptiveThreshold(
|
threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||||
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
|
cv2.THRESH_BINARY, blocksize, c)
|
||||||
15, -0.2)
|
|
||||||
else:
|
else:
|
||||||
threshold = cv2.adaptiveThreshold(
|
threshold = cv2.adaptiveThreshold(np.invert(gray), 255,
|
||||||
np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c)
|
||||||
cv2.THRESH_BINARY,
|
|
||||||
15, -0.2)
|
|
||||||
return img, threshold
|
return img, threshold
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import copy
|
||||||
import types
|
import types
|
||||||
import logging
|
import logging
|
||||||
import copy_reg
|
import copy_reg
|
||||||
|
|
@ -269,7 +270,9 @@ class Lattice:
|
||||||
table_bbox = find_table_joints(contours, vmask, hmask)
|
table_bbox = find_table_joints(contours, vmask, hmask)
|
||||||
|
|
||||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
||||||
mtolerance = self.mtol * len(table_bbox)
|
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
||||||
|
else:
|
||||||
|
mtolerance = copy.deepcopy(self.mtol)
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_images = (img, table_bbox)
|
self.debug_images = (img, table_bbox)
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import os
|
import os
|
||||||
|
import copy
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import pyocr
|
import pyocr
|
||||||
|
|
@ -100,7 +101,9 @@ class OCR:
|
||||||
self.debug_tables = []
|
self.debug_tables = []
|
||||||
|
|
||||||
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
if len(self.mtol) == 1 and self.mtol[0] == 2:
|
||||||
self.mtol = self.mtol * len(table_bbox)
|
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
||||||
|
else:
|
||||||
|
mtolerance = copy.deepcopy(self.mtol)
|
||||||
|
|
||||||
page = {}
|
page = {}
|
||||||
tables = {}
|
tables = {}
|
||||||
|
|
@ -111,8 +114,8 @@ class OCR:
|
||||||
cols, rows = list(cols), list(rows)
|
cols, rows = list(cols), list(rows)
|
||||||
cols.extend([k[0], k[2]])
|
cols.extend([k[0], k[2]])
|
||||||
rows.extend([k[1], k[3]])
|
rows.extend([k[1], k[3]])
|
||||||
cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
|
cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
|
||||||
rows = merge_close_values(sorted(rows, reverse=True), mtol=self.mtol[table_no])
|
rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no])
|
||||||
cols = [(cols[i], cols[i + 1])
|
cols = [(cols[i], cols[i + 1])
|
||||||
for i in range(0, len(cols) - 1)]
|
for i in range(0, len(cols) - 1)]
|
||||||
rows = [(rows[i], rows[i + 1])
|
rows = [(rows[i], rows[i + 1])
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import os
|
import os
|
||||||
|
import copy
|
||||||
import types
|
import types
|
||||||
import logging
|
import logging
|
||||||
import copy_reg
|
import copy_reg
|
||||||
|
|
@ -332,9 +333,13 @@ class Stream:
|
||||||
table_bbox = {(0, 0, width, height): None}
|
table_bbox = {(0, 0, width, height): None}
|
||||||
|
|
||||||
if len(self.ytol) == 1 and self.ytol[0] == 2:
|
if len(self.ytol) == 1 and self.ytol[0] == 2:
|
||||||
ytolerance = self.ytol * len(table_bbox)
|
ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
|
||||||
|
else:
|
||||||
|
ytolerance = copy.deepcopy(self.ytol)
|
||||||
if len(self.mtol) == 1 and self.mtol[0] == 0:
|
if len(self.mtol) == 1 and self.mtol[0] == 0:
|
||||||
mtolerance = self.mtol * len(table_bbox)
|
mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
|
||||||
|
else:
|
||||||
|
mtolerance = copy.deepcopy(self.mtol)
|
||||||
|
|
||||||
page = {}
|
page = {}
|
||||||
tables = {}
|
tables = {}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,53 @@
|
||||||
|
"""
|
||||||
|
usage: python hough_opencv.py file.png
|
||||||
|
|
||||||
|
find lines present in an image using opencv's hough transform.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    timed : callable
        Wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed wall-clock time measured with
        ``time.time()`` and returns the result of ``func`` unchanged.

    """
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||||
|
|
||||||
|
|
||||||
|
@timeit
def main():
    """Draws the horizontal and vertical Hough lines found in an image.

    The image path is read from ``sys.argv[1]``. Edges are detected with
    Canny and lines with ``cv2.HoughLines``; every line whose angle is 0
    or (approximately) pi / 2 is drawn onto the image, which is then
    displayed with matplotlib.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot load the image.

    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
    if lines is None:
        # Nothing reached the accumulator threshold; treat it as "no lines"
        # so the count below prints 0 instead of failing on len(None).
        lines = []
    print("found {0} lines".format(len(lines)))
    for line in lines:
        r, theta = line[0]
        # only draw vertical (theta of 0) and horizontal (theta of pi / 2) lines
        if theta == 0 or np.isclose(theta, np.pi / 2):
            x0 = r * np.cos(theta)
            y0 = r * np.sin(theta)
            # extend the line far in both directions from (x0, y0)
            x1 = int(x0 + 10000 * (-np.sin(theta)))
            y1 = int(y0 + 10000 * (np.cos(theta)))
            x2 = int(x0 - 10000 * (-np.sin(theta)))
            y2 = int(y0 - 10000 * (np.cos(theta)))
            cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 5)
    plt.imshow(image)
    plt.show()


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print(__doc__)
    else:
        main()
|
||||||
|
|
@ -0,0 +1,75 @@
|
||||||
|
"""
|
||||||
|
usage: python hough_skimage.py file.png
|
||||||
|
|
||||||
|
find lines present in an image using scikit-image's hough transform.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
from scipy.misc import imread
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from skimage.transform import hough_line, hough_line_peaks
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    timed : callable
        Wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed wall-clock time measured with
        ``time.time()`` and returns the result of ``func`` unchanged.

    """
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||||
|
|
||||||
|
|
||||||
|
@timeit
def main():
    """Finds lines with scikit-image's Hough transform and erases them.

    The image path is read from ``sys.argv[1]``. The image is binarised,
    ``hough_line`` is evaluated for 10 angles between 0 and pi / 2, every
    peak is plotted in red, and the pixels lying on a detected line are
    zeroed in the binary image before it is displayed.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot load the image.

    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    _, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    binary = np.min(binary, axis=2)
    # invert: pixels that are 255 in every channel become 0, the rest 255
    binary = np.where(binary == 255, 0, 255)
    rows, cols = binary.shape
    pixel = np.zeros(binary.shape)

    _, ax = plt.subplots(1, 1, figsize=(8, 4))
    ax.imshow(image, cmap=plt.cm.gray)

    # pixel index grids do not depend on the peak, so build them once
    row_idx = np.arange(rows).reshape((rows, 1))
    col_idx = np.arange(cols).reshape((1, cols))

    theta_in = np.linspace(0, np.pi / 2, 10)
    h, theta, d = hough_line(binary, theta_in)
    for _, angle, dist in zip(*hough_line_peaks(h, theta, d)):
        a = np.cos(angle)
        b = np.sin(angle)
        x0 = dist * a
        y0 = dist * b
        x1 = int(x0 + 1000 * (-b))
        y1 = int(y0 + 1000 * (a))
        x2 = int(x0 - 1000 * (-b))
        y2 = int(y0 - 1000 * (a))
        ax.plot((x1, x2), (y1, y2), '-r')
        # count pixels whose rounded x * cos + y * sin equals the rounded
        # distance of this peak, i.e. pixels lying on the detected line
        R = np.round(b * row_idx + a * col_idx)
        pixel += np.isclose(R, np.round(dist))

    pixel = np.clip(pixel, 0, 1)
    # 0 where a line was marked, 1 everywhere else
    pixel = np.where(pixel == 1, 0, 1)
    binary = np.where(binary == 0, 255, 0)
    binary *= pixel.astype(np.int64)
    ax.imshow(binary, cmap=plt.cm.gray)
    ax.axis((0, cols, rows, 0))
    ax.set_title('Detected lines')
    ax.set_axis_off()
    ax.set_adjustable('box-forced')
    plt.show()


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print(__doc__)
    else:
        main()
|
||||||
|
|
@ -0,0 +1,49 @@
|
||||||
|
"""
|
||||||
|
usage: python hough_prob.py file.png
|
||||||
|
|
||||||
|
find lines present in an image using scikit-image's hough transform.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
from scipy.misc import imread
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from skimage.feature import canny
|
||||||
|
from skimage.transform import probabilistic_hough_line
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    timed : callable
        Wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed wall-clock time measured with
        ``time.time()`` and returns the result of ``func`` unchanged.

    """
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||||
|
|
||||||
|
|
||||||
|
@timeit
def main():
    """Plots the segments found by the probabilistic Hough transform.

    The image named by ``sys.argv[1]`` is loaded in mode 'L', passed
    through ``canny`` and ``probabilistic_hough_line``, and every
    resulting segment is drawn on top of a blank canvas that has the
    shape of the edge map.

    """
    grey = imread(sys.argv[1], mode='L')
    edge_map = canny(grey, 2, 1, 25)
    segments = probabilistic_hough_line(edge_map, threshold=1000)

    _, axes = plt.subplots(1, 1, figsize=(8, 4), sharex=True, sharey=True)
    # edges * 0 gives an all-zero canvas with the same shape as the edge map
    axes.imshow(edge_map * 0)

    for start, stop in segments:
        axes.plot((start[0], stop[0]), (start[1], stop[1]))

    axes.set_title('Probabilistic Hough')
    axes.set_axis_off()
    axes.set_adjustable('box-forced')
    plt.show()


if __name__ == '__main__':
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||||
|
|
@ -0,0 +1,103 @@
|
||||||
|
"""
|
||||||
|
usage: python morph_transform.py file.png
|
||||||
|
|
||||||
|
find lines present in an image using opencv's morph transform.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    timed : callable
        Wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed wall-clock time measured with
        ``time.time()`` and returns the result of ``func`` unchanged.

    """
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||||
|
|
||||||
|
|
||||||
|
def mt(imagename, scale=40):
    """Finds tables and their joints using morphological transformations.

    The image is adaptively thresholded, then eroded and dilated with a
    tall 1-pixel-wide kernel and a wide 1-pixel-high kernel to isolate
    vertical and horizontal lines. The detected lines and joints are
    plotted over the image with matplotlib.

    Parameters
    ----------
    imagename : str
        Path to the image file.

    scale : int
        The image height / width is divided by this value to get the
        length of the vertical / horizontal structuring element.
        (optional, default: 40)

    Returns
    -------
    tables : dict
        Maps the bounding box ``(x1, y2, x2, y1)`` of each of the (at most
        10 largest) line contours having more than 4 joints to the list
        of ``(x, y)`` centres of its joints.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot load the image.

    """
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(imagename))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    threshold = cv2.adaptiveThreshold(
        np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, -2)
    vertical = threshold
    horizontal = threshold

    # kernel sizes must be integers; // keeps the Python 2 result of the
    # original `/` and stays an integer on Python 3 as well
    verticalsize = vertical.shape[0] // scale
    horizontalsize = horizontal.shape[1] // scale

    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    vertical = cv2.erode(vertical, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))

    horizontal = cv2.erode(horizontal, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)
    contours, _ = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        # Look for joints inside the bounding box of this contour. The
        # slice is a view into `joints`, and older OpenCV releases modify
        # the image handed to findContours, so work on a copy.
        roi = joints[y:y + h, x:x + w].copy()
        jc, _ = cv2.findContours(
            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(jc) <= 4:  # skip contours with 4 or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint, shifted from roi to image coordinates
            c1, c2 = x + (2 * jx + jw) // 2, y + (2 * jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x1, y2, x2, y1)] = joint_coords

    vcontours, _ = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([(x1 + x2) // 2, (x1 + x2) // 2], [y2, y1])

    hcontours, _ = cv2.findContours(
        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([x1, x2], [(y1 + y2) // 2, (y1 + y2) // 2])

    x_coord = []
    y_coord = []
    for k in tables.keys():
        for coord in tables[k]:
            x_coord.append(coord[0])
            y_coord.append(coord[1])
    plt.plot(x_coord, y_coord, 'ro')

    plt.imshow(img)
    plt.show()
    return tables
|
||||||
|
|
||||||
|
|
||||||
|
@timeit
def main():
    """Runs ``mt`` on the image named by ``sys.argv[1]`` and prints the
    number of tables it returned.

    """
    found = len(mt(sys.argv[1]).keys())
    # the print statement put a separator space between the label and the
    # count, which the leading space of ' {0}' reproduces
    print('tables found: ' + ' {0}'.format(found))


if __name__ == '__main__':
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||||
|
|
@ -0,0 +1,167 @@
|
||||||
|
"""
|
||||||
|
usage: python plot_geo.py file.pdf
|
||||||
|
python plot_geo.py file.pdf file.png
|
||||||
|
|
||||||
|
print lines and rectangles present in a pdf file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.patches as patches
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
|
from pdfminer.pdfparser import PDFParser
|
||||||
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
|
from pdfminer.converter import PDFPageAggregator
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager
|
||||||
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
|
from pdfminer.layout import LAParams, LTLine, LTRect
|
||||||
|
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||||
|
|
||||||
|
|
||||||
|
MIN_LENGTH = 1
|
||||||
|
pdf_x, pdf_y, image_x, image_y = [0] * 4
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    timed : callable
        Wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed wall-clock time measured with
        ``time.time()`` and returns the result of ``func`` unchanged.

    """
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||||
|
|
||||||
|
|
||||||
|
def remove_coords(coords, atol=2):
    """Drops coordinates that are close to the previously kept one.

    Parameters
    ----------
    coords : iterable
        Coordinates, in the order in which they should be compared
        (callers pass them sorted).

    atol : float
        Absolute tolerance passed to ``np.isclose`` when comparing a
        coordinate with the last kept one.
        (optional, default: 2)

    Returns
    -------
    merged : list
        The first coordinate followed by every coordinate that is not
        close to the one kept just before it.

    """
    merged = []
    for coord in coords:
        # compare against the last *kept* value, not the previous input
        if merged and np.isclose(merged[-1], coord, atol=atol):
            continue
        merged.append(coord)
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def parse_layout(pdfname):
    """Collects the vertical and horizontal line segments of a pdf file.

    ``LTLine`` objects are classified by their end points; every
    ``LTRect`` contributes its two vertical and two horizontal edges.
    The module-level ``pdf_x`` and ``pdf_y`` are set to ``layout.bbox[2]``
    and ``layout.bbox[3]`` of each processed page, so they end up holding
    the values of the last page.

    Parameters
    ----------
    pdfname : str
        Path to the pdf file.

    Returns
    -------
    vertical : list
        Segments as (x0, y0, x1, y1) tuples with equal x coordinates.

    horizontal : list
        Segments as (x0, y0, x1, y1) tuples with equal y coordinates.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document is not extractable.

    """
    global pdf_x, pdf_y

    def is_horizontal(line):
        # a horizontal segment has the same y at both end points
        return line[1] == line[3]

    def is_vertical(line):
        # a vertical segment has the same x at both end points
        return line[0] == line[2]

    vertical, horizontal = [], []
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
            for obj in layout._objs:
                if isinstance(obj, LTLine):
                    line = (obj.x0, obj.y0, obj.x1, obj.y1)
                    if is_vertical(line):
                        vertical.append(line)
                    elif is_horizontal(line):
                        horizontal.append(line)
                elif isinstance(obj, LTRect):
                    # left and right edges
                    vertical.append((obj.x0, obj.y1, obj.x0, obj.y0))
                    vertical.append((obj.x1, obj.y1, obj.x1, obj.y0))
                    # top and bottom edges
                    horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1))
                    horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0))
    return vertical, horizontal
|
||||||
|
|
||||||
|
|
||||||
|
def hough_transform(imagename):
    """Finds line positions in an image, scaled to pdf coordinates.

    Runs Canny and ``cv2.HoughLines`` on the image, scales the x and y
    position of every detected line by ``pdf_x / image_x`` and
    ``pdf_y / image_y`` and removes near-duplicates with
    ``remove_coords``. The module-level ``image_x`` and ``image_y`` are
    set to the width and height of the image; ``pdf_x`` and ``pdf_y`` are
    only read here.

    Parameters
    ----------
    imagename : str
        Path to the image file.

    Returns
    -------
    x : list
        Positive x positions in ascending order.

    y : list
        y positions in descending order.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot load the image.

    """
    global pdf_x, pdf_y, image_x, image_y
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(imagename))
    image_x, image_y = img.shape[1], img.shape[0]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 1000)
    if lines is None:
        # nothing reached the accumulator threshold: no lines to report
        return [], []

    x_scale = pdf_x / float(image_x)
    y_scale = pdf_y / float(image_y)
    x, y = [], []
    for line in lines:
        r, theta = line[0]
        x.append(r * np.cos(theta) * x_scale)
        # distance from the bottom edge of the image -- presumably flips
        # the image's top-left origin to the pdf's bottom-left; confirm
        y.append(abs(r * np.sin(theta) - image_y) * y_scale)
    x = remove_coords(sorted(set([x0 for x0 in x if x0 > 0])))
    y = remove_coords(sorted(set(y), reverse=True))
    return x, y
|
||||||
|
|
||||||
|
|
||||||
|
def plot_lines1(vertical, horizontal):
    """Plots every segment that is longer than ``MIN_LENGTH``.

    Parameters
    ----------
    vertical : list
        Segments as (x0, y0, x1, y1); kept when their y extent exceeds
        ``MIN_LENGTH``.

    horizontal : list
        Segments as (x0, y0, x1, y1); kept when their x extent exceeds
        ``MIN_LENGTH``.

    """
    figure = plt.figure()
    axes = figure.add_subplot(111, aspect='equal')
    axes.set_xlim(0, 1000)
    axes.set_ylim(0, 1000)

    for seg in vertical:
        if abs(seg[1] - seg[3]) > MIN_LENGTH:
            axes.plot([seg[0], seg[2]], [seg[1], seg[3]])
    for seg in horizontal:
        if abs(seg[0] - seg[2]) > MIN_LENGTH:
            axes.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_lines2(imagename, vertical, horizontal):
    """Plots the pdf segments that match a line detected in the image.

    A vertical segment is plotted once for every x position returned by
    ``hough_transform`` that is within 2 of its first x coordinate; a
    horizontal segment likewise for its first y coordinate.

    Parameters
    ----------
    imagename : str
        Path to the image file passed to ``hough_transform``.

    vertical : list
        Vertical segments as (x0, y0, x1, y1).

    horizontal : list
        Horizontal segments as (x0, y0, x1, y1).

    """
    xs, ys = hough_transform(imagename)
    figure = plt.figure()
    axes = figure.add_subplot(111, aspect='equal')
    axes.set_xlim(0, 1000)
    axes.set_ylim(0, 1000)

    for detected_x in xs:
        for seg in vertical:
            if not np.isclose(detected_x, seg[0], atol=2):
                continue
            axes.plot([seg[0], seg[2]], [seg[1], seg[3]])
    for detected_y in ys:
        for seg in horizontal:
            if not np.isclose(detected_y, seg[1], atol=2):
                continue
            axes.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
@timeit
def main():
    """Plots the lines of the pdf named by ``sys.argv[1]``.

    With one argument only the lines parsed from the pdf are plotted;
    with a second argument (an image) the lines that match the Hough
    transform of that image are plotted afterwards as well.

    """
    vertical, horizontal = parse_layout(sys.argv[1])
    argc = len(sys.argv)
    if argc in (2, 3):
        plot_lines1(vertical, horizontal)
    if argc == 3:
        plot_lines2(sys.argv[2], vertical, horizontal)


if __name__ == '__main__':
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||||
|
|
@ -0,0 +1,83 @@
|
||||||
|
"""
|
||||||
|
usage: python print_text.py file.pdf
|
||||||
|
|
||||||
|
prints horizontal and vertical text lines present in a pdf file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
|
from pdfminer.pdfparser import PDFParser
|
||||||
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
|
from pdfminer.converter import PDFPageAggregator
|
||||||
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager
|
||||||
|
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||||
|
from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal,
|
||||||
|
LTTextLineHorizontal, LTTextLineVertical, LTLine)
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    timed : callable
        Wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed wall-clock time measured with
        ``time.time()`` and returns the result of ``func`` unchanged.

    """
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_objects(layout, LTObject, t=None):
    """Recursively collects objects of a given type from a layout tree.

    Parameters
    ----------
    layout : object
        Node whose ``_objs`` attribute lists its children. Nodes without
        that attribute contribute nothing.

    LTObject : type
        Type of the objects to collect. A matching object is collected
        as is; its own children are not visited.

    t : list
        List to which the matches are appended.
        (optional, default: None, meaning a new list)

    Returns
    -------
    t : list
        The list that was passed in (or the new one) with all matches
        appended in traversal order.

    """
    found = [] if t is None else t
    try:
        for child in layout._objs:
            if not isinstance(child, LTObject):
                # descend; the nested call builds and returns its own list
                found.extend(extract_text_objects(child, LTObject))
                continue
            found.append(child)
    except AttributeError:
        # nodes without `_objs` have no children to visit
        pass
    return found
|
||||||
|
|
||||||
|
|
||||||
|
@timeit
def main():
    """Prints the text lines of the pdf file named by ``sys.argv[1]``.

    For every page, the number and the text of the horizontal text lines
    are printed, followed by the same for the vertical text lines.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document is not extractable.

    """
    with open(sys.argv[1], 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # 2.0, 0.5, 0.1 (presumably the default char / line / word margins
        # -- confirm against pdfminer's LAParams)
        laparams = LAParams(
            char_margin=1.0,
            line_margin=0.5,
            word_margin=0.1,
            detect_vertical=True,
        )
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            horizontal_lines = extract_text_objects(layout, LTTextLineHorizontal)
            vertical_lines = extract_text_objects(layout, LTTextLineVertical)
            print("number of horizontal text lines -> {0}".format(
                len(horizontal_lines)))
            print("horizontal text lines ->")
            pprint([line.get_text() for line in horizontal_lines])
            print("number of vertical text lines -> {0}".format(
                len(vertical_lines)))
            print("vertical text lines ->")
            pprint([line.get_text() for line in vertical_lines])


if __name__ == '__main__':
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||||
Loading…
Reference in New Issue