Add deepcopy and debug scripts

pull/2/head
Vinayak Mehta 2017-04-10 18:59:48 +05:30
parent 4dd0d2330e
commit 84d354ba10
11 changed files with 568 additions and 21 deletions

View File

@ -2,7 +2,7 @@ import cv2
import numpy as np import numpy as np
def adaptive_threshold(imagename, invert=False): def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
"""Thresholds an image using OpenCV's adaptiveThreshold. """Thresholds an image using OpenCV's adaptiveThreshold.
Parameters Parameters
@ -15,6 +15,15 @@ def adaptive_threshold(imagename, invert=False):
tables with lines in background. tables with lines in background.
(optional, default: False) (optional, default: False)
blocksize: int
Size of the pixel neighborhood that is used to calculate a
threshold value for the pixel. Must be odd: 3, 5, 7, and so on.
(optional, default: 15)
c: float
Constant subtracted from the mean or weighted mean.
Normally, it is positive but may be zero or negative as well.
(optional, default: -2)
Returns Returns
------- -------
img : object img : object
@ -27,14 +36,11 @@ def adaptive_threshold(imagename, invert=False):
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
if invert: if invert:
threshold = cv2.adaptiveThreshold( threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, cv2.THRESH_BINARY, blocksize, c)
15, -0.2)
else: else:
threshold = cv2.adaptiveThreshold( threshold = cv2.adaptiveThreshold(np.invert(gray), 255,
np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c)
cv2.THRESH_BINARY,
15, -0.2)
return img, threshold return img, threshold

View File

@ -1,6 +1,7 @@
from __future__ import division from __future__ import division
import os import os
import sys import sys
import copy
import types import types
import logging import logging
import copy_reg import copy_reg
@ -269,7 +270,9 @@ class Lattice:
table_bbox = find_table_joints(contours, vmask, hmask) table_bbox = find_table_joints(contours, vmask, hmask)
if len(self.mtol) == 1 and self.mtol[0] == 2: if len(self.mtol) == 1 and self.mtol[0] == 2:
mtolerance = self.mtol * len(table_bbox) mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
else:
mtolerance = copy.deepcopy(self.mtol)
if self.debug: if self.debug:
self.debug_images = (img, table_bbox) self.debug_images = (img, table_bbox)

View File

@ -1,4 +1,5 @@
import os import os
import copy
import subprocess import subprocess
import pyocr import pyocr
@ -100,7 +101,9 @@ class OCR:
self.debug_tables = [] self.debug_tables = []
if len(self.mtol) == 1 and self.mtol[0] == 2: if len(self.mtol) == 1 and self.mtol[0] == 2:
self.mtol = self.mtol * len(table_bbox) mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
else:
mtolerance = copy.deepcopy(self.mtol)
page = {} page = {}
tables = {} tables = {}
@ -111,8 +114,8 @@ class OCR:
cols, rows = list(cols), list(rows) cols, rows = list(cols), list(rows)
cols.extend([k[0], k[2]]) cols.extend([k[0], k[2]])
rows.extend([k[1], k[3]]) rows.extend([k[1], k[3]])
cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no]) cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
rows = merge_close_values(sorted(rows, reverse=True), mtol=self.mtol[table_no]) rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no])
cols = [(cols[i], cols[i + 1]) cols = [(cols[i], cols[i + 1])
for i in range(0, len(cols) - 1)] for i in range(0, len(cols) - 1)]
rows = [(rows[i], rows[i + 1]) rows = [(rows[i], rows[i + 1])

View File

@ -1,5 +1,6 @@
from __future__ import division from __future__ import division
import os import os
import copy
import types import types
import logging import logging
import copy_reg import copy_reg
@ -332,9 +333,13 @@ class Stream:
table_bbox = {(0, 0, width, height): None} table_bbox = {(0, 0, width, height): None}
if len(self.ytol) == 1 and self.ytol[0] == 2: if len(self.ytol) == 1 and self.ytol[0] == 2:
ytolerance = self.ytol * len(table_bbox) ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
else:
ytolerance = copy.deepcopy(self.ytol)
if len(self.mtol) == 1 and self.mtol[0] == 0: if len(self.mtol) == 1 and self.mtol[0] == 0:
mtolerance = self.mtol * len(table_bbox) mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
else:
mtolerance = copy.deepcopy(self.mtol)
page = {} page = {}
tables = {} tables = {}

View File

@ -0,0 +1,53 @@
"""
usage: python hough_opencv.py file.png
find lines present in an image using opencv's hough transform.
"""
import sys
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    The elapsed time is measured with ``time.time()`` and printed after
    the call returns; nothing is printed if *func* raises.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
@timeit
def main():
    """Detect lines in the image named by ``sys.argv[1]`` and display them.

    Runs Canny edge detection followed by ``cv2.HoughLines``, draws every
    detected line whose angle is 0 or close to pi / 2 onto the image, and
    shows the result with matplotlib.

    Raises
    ------
    IOError
        If OpenCV cannot read the image file.
    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
    if lines is None:
        # HoughLines returns None (not an empty array) when no line
        # reaches the accumulator threshold.
        lines = []
    print("found {0} lines".format(len(lines)))
    for line in lines:
        r, theta = line[0]
        # filter horizontal and vertical lines
        if theta == 0 or np.isclose(theta, np.pi / 2):
            # (x0, y0) is the point on the line closest to the origin;
            # extend 10000 px either way along the line direction.
            x0 = r * np.cos(theta)
            y0 = r * np.sin(theta)
            x1 = int(x0 + 10000 * (-np.sin(theta)))
            y1 = int(y0 + 10000 * (np.cos(theta)))
            x2 = int(x0 - 10000 * (-np.sin(theta)))
            y2 = int(y0 - 10000 * (np.cos(theta)))
            cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 5)
    plt.imshow(image)
    plt.show()
if __name__ == '__main__':
    # Without a file argument, show the usage text instead of running.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)

View File

@ -0,0 +1,75 @@
"""
usage: python hough_skimage.py file.png
find lines present in an image using scikit-image's hough transform.
"""
import sys
import time
import cv2
import numpy as np
from scipy.misc import imread
import matplotlib.pyplot as plt
from skimage.transform import hough_line, hough_line_peaks
def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    The elapsed time is measured with ``time.time()`` and printed after
    the call returns; nothing is printed if *func* raises.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
@timeit
def main():
    """Find lines with scikit-image's Hough transform and erase them.

    Reads the image named by ``sys.argv[1]``, binarises it, runs
    ``hough_line`` over 10 angles between 0 and pi / 2, plots every peak
    returned by ``hough_line_peaks`` in red, and finally shows the
    binarised image with the pixels lying on those lines zeroed out.

    Raises
    ------
    IOError
        If OpenCV cannot read the image file.
    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    _, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    # A pixel counts as background only if every channel thresholded to
    # 255; everything else becomes foreground (255).
    binary = np.min(binary, axis=2)
    binary = np.where(binary == 255, 0, 255)
    rows, cols = binary.shape
    pixel = np.zeros(binary.shape)

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    ax.imshow(image, cmap=plt.cm.gray)

    theta_in = np.linspace(0, np.pi / 2, 10)
    h, theta, d = hough_line(binary, theta_in)

    # Row/column index grids do not depend on the peak, so build them once.
    col_idx = np.arange(cols).reshape((1, cols))
    row_idx = np.arange(rows).reshape((rows, 1))

    for _, angle, dist in zip(*hough_line_peaks(h, theta, d)):
        x0 = dist * np.cos(angle)
        y0 = dist * np.sin(angle)
        x1 = int(x0 + 1000 * (-np.sin(angle)))
        y1 = int(y0 + 1000 * (np.cos(angle)))
        x2 = int(x0 - 1000 * (-np.sin(angle)))
        y2 = int(y0 - 1000 * (np.cos(angle)))
        ax.plot((x1, x2), (y1, y2), '-r')

        # Rounded value of x*cos(angle) + y*sin(angle) for every pixel;
        # pixels whose value matches the rounded peak distance lie on the line.
        R = np.round(np.add(np.sin(angle) * row_idx, np.cos(angle) * col_idx))
        pixel += np.isclose(R, np.round(dist))

    # Turn the per-line hit counts into a keep-mask: 0 on a line, 1 elsewhere.
    pixel = np.clip(pixel, 0, 1)
    pixel = np.where(pixel == 1, 0, 1)
    binary = np.where(binary == 0, 255, 0)
    binary *= pixel.astype(np.int64)
    ax.imshow(binary, cmap=plt.cm.gray)
    ax.axis((0, cols, rows, 0))
    ax.set_title('Detected lines')
    ax.set_axis_off()
    # NOTE(review): 'box-forced' is not accepted by newer Matplotlib
    # releases -- confirm against the version this script targets.
    ax.set_adjustable('box-forced')
    plt.show()
if __name__ == '__main__':
    # Without a file argument, show the usage text instead of running.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)

View File

@ -0,0 +1,49 @@
"""
usage: python hough_prob.py file.png
find lines present in an image using scikit-image's hough transform.
"""
import sys
import time
from scipy.misc import imread
import matplotlib.pyplot as plt
from skimage.feature import canny
from skimage.transform import probabilistic_hough_line
def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    The elapsed time is measured with ``time.time()`` and printed after
    the call returns; nothing is printed if *func* raises.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
@timeit
def main():
    """Plot the segments found by scikit-image's probabilistic Hough transform.

    Loads the image named by ``sys.argv[1]`` with ``mode='L'``, runs
    ``canny`` on it, feeds the edge map to ``probabilistic_hough_line``
    and draws each returned segment over a blank canvas of the same size.
    """
    gray = imread(sys.argv[1], mode='L')
    edge_map = canny(gray, 2, 1, 25)
    segments = probabilistic_hough_line(edge_map, threshold=1000)

    fig, ax = plt.subplots(1, 1, figsize=(8, 4), sharex=True, sharey=True)
    # edge_map * 0 is an all-zero image: it only fixes the axes extent.
    ax.imshow(edge_map * 0)

    for start, stop in segments:
        xs = (start[0], stop[0])
        ys = (start[1], stop[1])
        ax.plot(xs, ys)

    ax.set_title('Probabilistic Hough')
    ax.set_axis_off()
    ax.set_adjustable('box-forced')
    plt.show()
if __name__ == '__main__':
    # Without a file argument, show the usage text instead of running.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)

View File

@ -0,0 +1,103 @@
"""
usage: python morph_transform.py file.png
find lines present in an image using opencv's morph transform.
"""
import sys
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    The elapsed time is measured with ``time.time()`` and printed after
    the call returns; nothing is printed if *func* raises.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
def mt(imagename, scale=40):
    """Locate table regions and line joints in an image via morphology.

    The image is adaptively thresholded, then eroded and dilated with
    tall and wide rectangular kernels to isolate vertical and horizontal
    lines. Contours of the combined line mask are table candidates and
    the overlap of both masks gives the joints. Detected lines and joints
    are plotted over the image with matplotlib.

    Parameters
    ----------
    imagename : str
        Path of the image to analyse.
    scale : int
        Divisor applied to the image height / width to get the length of
        the vertical / horizontal structuring element.
        (optional, default: 40)

    Returns
    -------
    tables : dict
        Maps each kept bounding box ``(x1, y2, x2, y1)`` to the list of
        ``(x, y)`` centres of the joints found inside it.

    Raises
    ------
    IOError
        If OpenCV cannot read the image file.
    """
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(imagename))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)
    vertical = threshold
    horizontal = threshold

    # Explicit floor division: kernel sizes must be integers regardless of
    # the interpreter's `/` semantics.
    verticalsize = int(vertical.shape[0] // scale)
    horizontalsize = int(horizontal.shape[1] // scale)

    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    vertical = cv2.erode(vertical, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))

    horizontal = cv2.erode(horizontal, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)
    # findContours returns 2 or 3 values depending on the OpenCV release;
    # the contour list is always the second-to-last item.
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # Only the 10 largest regions are considered as table candidates.
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        # find number of non-zero values in joints using what boundingRect returns
        roi = joints[y:y+h, x:x+w]
        jc = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # skip regions with 4 or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # Centre of the joint, shifted from ROI to image coordinates.
            c1, c2 = x + (2*jx + jw) // 2, y + (2*jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x1, y2, x2, y1)] = joint_coords

    vcontours = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([(x1 + x2) // 2, (x1 + x2) // 2], [y2, y1])

    hcontours = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([x1, x2], [(y1 + y2) // 2, (y1 + y2) // 2])

    x_coord = []
    y_coord = []
    for joint_coords in tables.values():
        for coord in joint_coords:
            x_coord.append(coord[0])
            y_coord.append(coord[1])
    plt.plot(x_coord, y_coord, 'ro')

    plt.imshow(img)
    plt.show()
    return tables
@timeit
def main():
    """Run ``mt`` on the image named by ``sys.argv[1]`` and print the table count."""
    tables = mt(sys.argv[1])
    # Joining on ' ' reproduces the separator that the original two-item
    # print statement inserted between the label and the count.
    print(' '.join(['tables found: ', str(len(tables))]))
if __name__ == '__main__':
    # Without a file argument, show the usage text instead of running.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)

View File

@ -0,0 +1,167 @@
"""
usage: python plot_geo.py file.pdf
python plot_geo.py file.pdf file.png
print lines and rectangles present in a pdf file.
"""
import sys
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams, LTLine, LTRect
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
# Segments whose extent does not exceed this value are dropped by
# plot_lines1() before plotting.
MIN_LENGTH = 1
# Module-level state shared between functions: parse_layout() stores the
# page bbox extent in pdf_x / pdf_y, hough_transform() stores the image
# width / height in image_x / image_y. All four start at 0.
pdf_x, pdf_y, image_x, image_y = [0] * 4
def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    The elapsed time is measured with ``time.time()`` and printed after
    the call returns; nothing is printed if *func* raises.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
def remove_coords(coords):
    """Drop every coordinate that is close to the previously kept one.

    Walks *coords* in order and keeps a value only when ``np.isclose``
    (with ``atol=2``) says it differs from the last value kept so far.

    Parameters
    ----------
    coords : iterable of numbers
        Coordinates to thin out, in the order they should be compared.

    Returns
    -------
    list
        The kept coordinates, in their original order.
    """
    kept = []
    for value in coords:
        if kept and np.isclose(kept[-1], value, atol=2):
            # Within tolerance of the last kept value: treat as duplicate.
            continue
        kept.append(value)
    return kept
def parse_layout(pdfname):
    """Collect the vertical and horizontal line segments drawn in a PDF.

    Every page of the document is laid out with pdfminer. ``LTLine``
    objects are sorted by orientation, and every ``LTRect`` contributes
    its two vertical and two horizontal edges.

    As a side effect the module globals ``pdf_x`` and ``pdf_y`` are set to
    ``layout.bbox[2]`` and ``layout.bbox[3]`` of the last page processed.

    Parameters
    ----------
    pdfname : str
        Path of the PDF file.

    Returns
    -------
    vertical, horizontal : list, list
        Segments as ``(x0, y0, x1, y1)`` tuples, accumulated over all pages.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document is not marked as extractable.
    """
    global pdf_x, pdf_y

    def is_horizontal(line):
        # A horizontal segment has both endpoints at the same y.
        return line[1] == line[3]

    def is_vertical(line):
        # A vertical segment has both endpoints at the same x.
        return line[0] == line[2]

    vertical, horizontal = [], []
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
            for obj in layout._objs:
                if isinstance(obj, LTLine):
                    line = (obj.x0, obj.y0, obj.x1, obj.y1)
                    if is_vertical(line):
                        vertical.append(line)
                    elif is_horizontal(line):
                        horizontal.append(line)
                elif isinstance(obj, LTRect):
                    # Left and right edges, then top and bottom edges.
                    vertical.append((obj.x0, obj.y1, obj.x0, obj.y0))
                    vertical.append((obj.x1, obj.y1, obj.x1, obj.y0))
                    horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1))
                    horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0))
    return vertical, horizontal
def hough_transform(imagename):
    """Return PDF-space x and y coordinates of Hough lines found in an image.

    Runs Canny + ``cv2.HoughLines`` on the image, converts each line's
    closest-to-origin point to PDF units using the module globals
    ``pdf_x`` / ``pdf_y``, flips y against the image height, and thins
    out near-duplicates with ``remove_coords``.

    As a side effect the module globals ``image_x`` and ``image_y`` are
    set to the image width and height.

    Parameters
    ----------
    imagename : str
        Path of the image to analyse.

    Returns
    -------
    x, y : list, list
        Positive x coordinates in ascending order and y coordinates in
        descending order. Both are empty when no line is detected.

    Raises
    ------
    IOError
        If OpenCV cannot read the image file.
    """
    global pdf_x, pdf_y, image_x, image_y
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: {0}".format(imagename))
    image_x, image_y = img.shape[1], img.shape[0]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi/180, 1000)
    if lines is None:
        # HoughLines returns None (not an empty array) when no line
        # reaches the accumulator threshold.
        return [], []

    # Image-pixel -> PDF-unit scale factors.
    x_scale = pdf_x / float(image_x)
    y_scale = pdf_y / float(image_y)

    x, y = [], []
    for line in lines:
        r, theta = line[0]
        x.append(r * np.cos(theta) * x_scale)
        # Measure y from the opposite image edge before scaling.
        y.append(abs(r * np.sin(theta) - image_y) * y_scale)

    x = remove_coords(sorted(set([x0 for x0 in x if x0 > 0])))
    y = remove_coords(sorted(set(y), reverse=True))
    return x, y
def plot_lines1(vertical, horizontal):
    """Plot all PDF segments whose extent exceeds ``MIN_LENGTH``.

    Parameters
    ----------
    vertical, horizontal : list
        Segments as ``(x0, y0, x1, y1)`` tuples. Vertical ones are
        filtered on their y extent, horizontal ones on their x extent.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    ax.set_xlim(0, 1000)
    ax.set_ylim(0, 1000)

    long_vertical = [seg for seg in vertical
                     if abs(seg[1] - seg[3]) > MIN_LENGTH]
    long_horizontal = [seg for seg in horizontal
                       if abs(seg[0] - seg[2]) > MIN_LENGTH]

    # Vertical segments are drawn first, then horizontal ones.
    for seg in long_vertical + long_horizontal:
        ax.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()
def plot_lines2(imagename, vertical, horizontal):
    """Plot only the PDF segments that line up with Hough lines in an image.

    Parameters
    ----------
    imagename : str
        Image passed to ``hough_transform`` to obtain reference x / y
        coordinates.
    vertical, horizontal : list
        Segments as ``(x0, y0, x1, y1)`` tuples. A vertical segment is
        drawn when its x0 is within ``atol=2`` of a Hough x coordinate,
        a horizontal one when its y0 is within ``atol=2`` of a Hough y.
    """
    hough_x, hough_y = hough_transform(imagename)
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    ax.set_xlim(0, 1000)
    ax.set_ylim(0, 1000)

    for ref in hough_x:
        matching = [v for v in vertical if np.isclose(ref, v[0], atol=2)]
        for v in matching:
            ax.plot([v[0], v[2]], [v[1], v[3]])

    for ref in hough_y:
        matching = [h for h in horizontal if np.isclose(ref, h[1], atol=2)]
        for h in matching:
            ax.plot([h[0], h[2]], [h[1], h[3]])
    plt.show()
@timeit
def main():
    """Parse the PDF named by ``sys.argv[1]`` and plot its line segments.

    With one argument only ``plot_lines1`` runs. With a second argument
    (an image path) ``plot_lines2`` runs afterwards as well. Any other
    argument count parses the PDF but plots nothing.
    """
    vertical, horizontal = parse_layout(sys.argv[1])
    argc = len(sys.argv)
    if argc in (2, 3):
        plot_lines1(vertical, horizontal)
    if argc == 3:
        plot_lines2(sys.argv[2], vertical, horizontal)
if __name__ == '__main__':
    # Without a file argument, show the usage text instead of running.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)

View File

@ -0,0 +1,83 @@
"""
usage: python print_text.py file.pdf
prints horizontal and vertical text lines present in a pdf file.
"""
import sys
import time
from pprint import pprint
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal,
LTTextLineHorizontal, LTTextLineVertical, LTLine)
def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    The elapsed time is measured with ``time.time()`` and printed after
    the call returns; nothing is printed if *func* raises.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
def extract_text_objects(layout, LTObject, t=None):
    """Recursively gather all instances of *LTObject* below *layout*.

    Parameters
    ----------
    layout : object
        Node whose ``_objs`` attribute lists its children. A node
        without ``_objs`` contributes nothing.
    LTObject : type
        Class to look for. Matching children are collected and not
        descended into; all other children are searched recursively.
    t : list
        Accumulator to append to; a new list is used when omitted.
        (optional, default: None)

    Returns
    -------
    list
        The accumulator, with the matches appended in traversal order.
    """
    found = [] if t is None else t
    try:
        children = layout._objs
    except AttributeError:
        # Leaf node: nothing to descend into.
        return found
    for child in children:
        if isinstance(child, LTObject):
            found.append(child)
        else:
            found.extend(extract_text_objects(child, LTObject))
    return found
@timeit
def main():
    """Print the horizontal and vertical text lines of every page of a PDF.

    The PDF named by ``sys.argv[1]`` is laid out page by page with
    pdfminer (vertical text detection enabled). For each page the number
    and the text of the ``LTTextLineHorizontal`` and
    ``LTTextLineVertical`` objects are printed.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document is not marked as extractable.
    """
    with open(sys.argv[1], 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # 2.0, 0.5, 0.1  (presumably pdfminer's own defaults for the three
        # margins below -- TODO confirm)
        kwargs = {
            'char_margin': 1.0,
            'line_margin': 0.5,
            'word_margin': 0.1,
            'detect_vertical': True
        }
        laparams = LAParams(**kwargs)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            lh = extract_text_objects(layout, LTTextLineHorizontal)
            lv = extract_text_objects(layout, LTTextLineVertical)
            # Report inside the loop so every page is shown and a
            # document without pages never touches unbound lh / lv.
            print("number of horizontal text lines -> {0}".format(len(lh)))
            print("horizontal text lines ->")
            pprint([t.get_text() for t in lh])
            print("number of vertical text lines -> {0}".format(len(lv)))
            print("vertical text lines ->")
            pprint([t.get_text() for t in lv])
if __name__ == '__main__':
    # Without a file argument, show the usage text instead of running.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)