Remove examples and debug, restructure tests dir
parent
9d2708171b
commit
e0b55f0693
|
|
@ -1,53 +0,0 @@
|
|||
"""
|
||||
usage: python hough_opencv.py file.png
|
||||
|
||||
finds lines present in an image using opencv's hough transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Detect lines in the image named by ``sys.argv[1]`` and display them.

    Runs Canny edge detection followed by ``cv2.HoughLines`` and draws, in
    BGR colour (0, 0, 255), every detected line whose angle is 0 or
    approximately pi / 2.

    Raises:
        IOError: if the image cannot be read.
    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
    if lines is None:
        # HoughLines returns None rather than an empty array when nothing
        # is found; normalise so the code below can treat it as a sequence.
        lines = []
    print("found {0} lines".format(len(lines)))
    for line in lines:
        r, theta = line[0]
        # keep only the horizontal and vertical lines
        if theta == 0 or np.isclose(theta, np.pi / 2):
            cos_t, sin_t = np.cos(theta), np.sin(theta)
            x0 = r * cos_t
            y0 = r * sin_t
            # Extend the line far in both directions from (x0, y0).
            x1 = int(x0 + 10000 * (-sin_t))
            y1 = int(y0 + 10000 * (cos_t))
            x2 = int(x0 - 10000 * (-sin_t))
            y2 = int(y0 - 10000 * (cos_t))
            cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 5)
    plt.imshow(image)
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
"""
|
||||
usage: python hough_skimage.py file.png
|
||||
|
||||
finds lines present in an image using scikit-image's hough transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from scipy.misc import imread
|
||||
import matplotlib.pyplot as plt
|
||||
from skimage.transform import hough_line, hough_line_peaks
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Find lines with scikit-image's Hough transform and blank them out.

    The image named by ``sys.argv[1]`` is binarised, Hough peaks are drawn
    as red lines, and every pixel lying on a detected line is zeroed in the
    binary image that is finally displayed.

    Raises:
        IOError: if the image cannot be read.
    """
    image = cv2.imread(sys.argv[1])
    if image is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError("could not read image: {0}".format(sys.argv[1]))
    print("image dimensions -> {0}".format(image.shape))
    _, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    binary = np.min(binary, axis=2)
    binary = np.where(binary == 255, 0, 255)
    rows, cols = binary.shape
    pixel = np.zeros(binary.shape)

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    ax.imshow(image, cmap=plt.cm.gray)

    # Pixel coordinate vectors do not depend on the peak, so build them once.
    col_idx = np.arange(cols).reshape((1, cols))
    row_idx = np.arange(rows).reshape((rows, 1))

    theta_in = np.linspace(0, np.pi / 2, 10)
    h, theta, d = hough_line(binary, theta_in)
    for _, angle, dist in zip(*hough_line_peaks(h, theta, d)):
        a = np.cos(angle)
        b = np.sin(angle)
        x0 = dist * a
        y0 = dist * b
        x1 = int(x0 + 1000 * (-b))
        y1 = int(y0 + 1000 * (a))
        x2 = int(x0 - 1000 * (-b))
        y2 = int(y0 - 1000 * (a))
        ax.plot((x1, x2), (y1, y2), '-r')
        # R[row, col] is the rounded value of x*cos + y*sin for that pixel;
        # pixels where it equals the peak's rounded distance lie on the line.
        R = np.round(np.add(b * row_idx, a * col_idx))
        pixel += np.isclose(R, np.round(dist))

    # Turn the hit counts into a keep-mask: 0 on detected lines, 1 elsewhere.
    pixel = np.clip(pixel, 0, 1)
    pixel = np.where(pixel == 1, 0, 1)
    binary = np.where(binary == 0, 255, 0)
    binary *= pixel.astype(np.int64)
    ax.imshow(binary, cmap=plt.cm.gray)
    ax.axis((0, cols, rows, 0))
    ax.set_title('Detected lines')
    ax.set_axis_off()
    # NOTE(review): 'box-forced' may be rejected by newer Matplotlib
    # releases - confirm against the version this script targets.
    ax.set_adjustable('box-forced')
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
"""
|
||||
usage: python hough_prob.py file.png
|
||||
|
||||
finds lines present in an image using scikit-image's hough transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
from scipy.misc import imread
|
||||
import matplotlib.pyplot as plt
|
||||
from skimage.feature import canny
|
||||
from skimage.transform import probabilistic_hough_line
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Plot the segments found by the probabilistic Hough transform.

    Reads ``sys.argv[1]`` as a grayscale image, runs Canny edge detection,
    and draws every segment returned by ``probabilistic_hough_line``.
    """
    gray_image = imread(sys.argv[1], mode='L')
    edge_map = canny(gray_image, 2, 1, 25)
    segments = probabilistic_hough_line(edge_map, threshold=1000)

    _, axes = plt.subplots(1, 1, figsize=(8, 4), sharex=True, sharey=True)
    # An all-zero array shaped like the edge map serves as the backdrop.
    axes.imshow(edge_map * 0)

    for start, end in segments:
        axes.plot((start[0], end[0]), (start[1], end[1]))

    axes.set_title('Probabilistic Hough')
    axes.set_axis_off()
    axes.set_adjustable('box-forced')
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
"""
|
||||
usage: python morph_transform.py file.png scale={int} invert={bool}
|
||||
|
||||
finds lines present in an image using opencv's morph transform.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def mt(imagename, scale=40, invert=False):
    """Locate table-like regions in an image via morphological line extraction.

    Vertical and horizontal strokes are isolated with erode/dilate using
    1-pixel-wide structuring elements, their intersections ("joints") are
    collected per outer contour, and everything is plotted over the image.

    Args:
        imagename: path of the image to analyse.
        scale: divisor applied to the image height / width to obtain the
            length of the vertical / horizontal structuring element.
        invert: when False (default) the grayscale image is inverted before
            thresholding; when True it is thresholded as-is.

    Returns:
        dict mapping ``(x1, y2, x2, y1)`` bounding boxes to the list of
        ``(x, y)`` joint centres found inside them. Regions with four or
        fewer joints are skipped.

    Raises:
        IOError: if the image cannot be read.
    """
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError("could not read image: {0}".format(imagename))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    source = gray if invert else np.invert(gray)
    threshold = cv2.adaptiveThreshold(
        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)

    # Floor division keeps the kernel sizes integral on Python 2 and 3.
    verticalsize = threshold.shape[0] // scale
    horizontalsize = threshold.shape[1] // scale

    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    # NOTE(review): the third positional argument of cv2.erode / cv2.dilate
    # is `dst` in the Python bindings, not `anchor` - confirm (-1, -1) is
    # what is intended here.
    vertical = cv2.erode(threshold, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))

    horizontal = cv2.erode(threshold, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)
    # findContours returns 2 or 3 values depending on the OpenCV version;
    # the contour list is always the second-to-last element.
    contours = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        # count the joints that fall inside this contour's bounding box
        roi = joints[y:y+h, x:x+w]
        jc = cv2.findContours(
            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(jc) <= 4:  # skip regions with four or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint, translated back to image coordinates
            c1, c2 = x + (2*jx + jw) // 2, y + (2*jy + jh) // 2
            joint_coords.append((c1, c2))
        tables[(x1, y2, x2, y1)] = joint_coords

    vcontours = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        x_mid = (x1 + x2) // 2
        plt.plot([x_mid, x_mid], [y2, y1])

    hcontours = cv2.findContours(
        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        y_mid = (y1 + y2) // 2
        plt.plot([x1, x2], [y_mid, y_mid])

    x_coord = []
    y_coord = []
    for joint_coords in tables.values():
        for coord in joint_coords:
            x_coord.append(coord[0])
            y_coord.append(coord[1])
    plt.plot(x_coord, y_coord, 'ro')

    plt.imshow(img)
    plt.show()
    return tables
|
||||
|
||||
|
||||
@timeit
def main():
    """Parse ``scale=<int>`` and ``invert=<bool>`` from argv and run ``mt``.

    Both options are optional; ``scale`` defaults to 40 and ``invert`` to
    False. Prints the number of tables found.
    """
    try:
        scale = int(sys.argv[2].split('=')[1])
    except IndexError:
        scale = 40
    try:
        # bool() of any non-empty string is True, so "invert=False" has to
        # be interpreted textually rather than passed through bool().
        invert_text = sys.argv[3].split('=')[1]
        invert = invert_text.strip().lower() in ('1', 'true', 'yes', 'y', 'on')
    except IndexError:
        invert = False
    t = mt(sys.argv[1], scale=scale, invert=invert)
    print('tables found:  {0}'.format(len(t)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,167 +0,0 @@
|
|||
"""
|
||||
usage: python plot_geo.py file.pdf
|
||||
python plot_geo.py file.pdf file.png
|
||||
|
||||
prints lines and rectangles present in a pdf file.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.pdfinterp import PDFResourceManager
|
||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||
from pdfminer.layout import LAParams, LTLine, LTRect
|
||||
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||
|
||||
|
||||
# Segments must be longer than this to be drawn by plot_lines1.
MIN_LENGTH = 1
# Extents shared between functions: pdf_x / pdf_y are filled in by
# parse_layout, image_x / image_y by hough_transform.
pdf_x = pdf_y = image_x = image_y = 0
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def remove_coords(coords):
    """Drop coordinates that are close to the previously kept one.

    Walks ``coords`` in order and keeps a value only when it is not within
    ``np.isclose(..., atol=2)`` of the last value kept so far.
    """
    kept = []
    for value in coords:
        if kept and np.isclose(kept[-1], value, atol=2):
            continue  # near-duplicate of the last kept coordinate
        kept.append(value)
    return kept
|
||||
|
||||
|
||||
def parse_layout(pdfname):
    """Collect vertical and horizontal line segments from a PDF's layout.

    ``LTLine`` objects are classified by orientation and each ``LTRect``
    contributes its four edges. Segments are ``(x0, y0, x1, y1)`` tuples.
    As a side effect the module globals ``pdf_x`` / ``pdf_y`` are set to
    elements 2 and 3 of the layout's ``bbox``.

    Args:
        pdfname: path of the PDF file to read.

    Returns:
        Tuple ``(vertical, horizontal)`` of segment lists.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    global pdf_x, pdf_y

    def is_horizontal(line):
        # A horizontal segment has the same y at both ends.
        return line[1] == line[3]

    def is_vertical(line):
        # A vertical segment has the same x at both ends.
        return line[0] == line[2]

    vertical, horizontal = [], []
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # NOTE(review): assumes each page's layout is analysed inside this
        # loop - confirm against the intended nesting.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
            for obj in layout._objs:
                if isinstance(obj, LTLine):
                    line = (obj.x0, obj.y0, obj.x1, obj.y1)
                    if is_vertical(line):
                        vertical.append(line)
                    elif is_horizontal(line):
                        horizontal.append(line)
                elif isinstance(obj, LTRect):
                    # left and right edges
                    vertical.append((obj.x0, obj.y1, obj.x0, obj.y0))
                    vertical.append((obj.x1, obj.y1, obj.x1, obj.y0))
                    # top and bottom edges
                    horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1))
                    horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0))
    return vertical, horizontal
|
||||
|
||||
|
||||
def hough_transform(imagename):
    """Return Hough-line positions of an image scaled to the PDF's extents.

    Each detected line contributes ``r * cos(theta)`` as an x position and
    ``r * sin(theta)`` (flipped against the image height) as a y position.
    Both are rescaled by ``pdf_x / image_x`` and ``pdf_y / image_y``, then
    de-duplicated with ``remove_coords``. Sets the module globals
    ``image_x`` / ``image_y``.

    Args:
        imagename: path of the image to analyse.

    Returns:
        Tuple ``(x, y)``: ascending positive x positions and descending
        y positions. Both lists are empty when no line is detected.

    Raises:
        IOError: if the image cannot be read.
    """
    global pdf_x, pdf_y, image_x, image_y
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        raise IOError("could not read image: {0}".format(imagename))
    image_x, image_y = img.shape[1], img.shape[0]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi/180, 1000)
    if lines is None:
        # HoughLines returns None rather than an empty array when nothing
        # is found.
        return [], []

    x_scale = pdf_x / float(image_x)
    y_scale = pdf_y / float(image_y)
    x, y = [], []
    for line in lines:
        r, theta = line[0]
        x.append(r * np.cos(theta) * x_scale)
        # Flip against the image height before scaling to PDF units.
        y.append(abs(r * np.sin(theta) - image_y) * y_scale)
    x = remove_coords(sorted(set(x0 for x0 in x if x0 > 0)))
    y = remove_coords(sorted(set(y), reverse=True))
    return x, y
|
||||
|
||||
|
||||
def plot_lines1(vertical, horizontal):
    """Plot every segment whose extent exceeds ``MIN_LENGTH``.

    Vertical segments are measured along y, horizontal ones along x. The
    axes are fixed to the 0..1000 range in both directions.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111, aspect='equal')
    axes.set_xlim(0, 1000)
    axes.set_ylim(0, 1000)

    long_vertical = [seg for seg in vertical
                     if abs(seg[1] - seg[3]) > MIN_LENGTH]
    long_horizontal = [seg for seg in horizontal
                       if abs(seg[0] - seg[2]) > MIN_LENGTH]
    # Vertical segments first, then horizontal, each as its own plot call.
    for seg in long_vertical + long_horizontal:
        axes.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()
|
||||
|
||||
|
||||
def plot_lines2(imagename, vertical, horizontal):
    """Plot only the segments that coincide with Hough lines of an image.

    A vertical segment is drawn when its first x coordinate is close
    (``atol=2``) to an x position from ``hough_transform``; a horizontal
    segment when its first y coordinate is close to a y position.
    """
    hough_x, hough_y = hough_transform(imagename)
    figure = plt.figure()
    axes = figure.add_subplot(111, aspect='equal')
    axes.set_xlim(0, 1000)
    axes.set_ylim(0, 1000)

    def draw(segment):
        axes.plot([segment[0], segment[2]], [segment[1], segment[3]])

    for position in hough_x:
        for segment in vertical:
            if not np.isclose(position, segment[0], atol=2):
                continue
            draw(segment)
    for position in hough_y:
        for segment in horizontal:
            if not np.isclose(position, segment[1], atol=2):
                continue
            draw(segment)
    plt.show()
|
||||
|
||||
|
||||
@timeit
def main():
    """Plot the PDF's segments and, if an image is given, the Hough matches.

    ``sys.argv[1]`` is the PDF; an optional ``sys.argv[2]`` is an image used
    by ``plot_lines2``.
    """
    vertical, horizontal = parse_layout(sys.argv[1])
    argc = len(sys.argv)
    if argc not in (2, 3):
        return
    plot_lines1(vertical, horizontal)
    if argc == 3:
        plot_lines2(sys.argv[2], vertical, horizontal)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,69 +0,0 @@
|
|||
"""
|
||||
usage: python plot_intensity.py file.png threshold
|
||||
|
||||
plots sum of pixel intensities on both axes for an image.
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
from itertools import groupby
|
||||
from operator import itemgetter
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from pylab import barh
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def plot_barchart(ar):
    """Show ``ar`` as a red bar chart with one bar per element."""
    positions = np.arange(len(ar))
    bar_width = 0.35
    plt.bar(positions, ar, bar_width, color='r', zorder=1)
    plt.show()
|
||||
|
||||
|
||||
def merge_lines(lines):
    """Collapse runs of consecutive integers into their integer midpoints.

    Args:
        lines: ascending sequence of integer positions.

    Returns:
        List with one entry per run of consecutive values: the floor of
        the mean of the run's first and last value.
    """
    merged = []
    # Inside a run of consecutive values, (index - value) stays constant,
    # so grouping on it splits the sequence into those runs.
    for _, run in groupby(enumerate(lines), lambda pair: pair[0] - pair[1]):
        values = [value for _, value in run]
        # Floor division keeps the midpoint an integer on Python 2 and 3.
        merged.append((values[0] + values[-1]) // 2)
    return merged
|
||||
|
||||
|
||||
def plot_lines(image, lines):
    """Draw a full-width horizontal line at each y in ``lines`` over ``image``."""
    image_width = image.shape[1]
    for row in lines:
        plt.plot([0, image_width], [row, row])
    plt.imshow(image)
    plt.show()
|
||||
|
||||
|
||||
@timeit
def main():
    """Mark the image rows whose summed thresholded intensity is low.

    ``sys.argv[1]`` is the image and ``sys.argv[2]`` the integer threshold;
    rows whose sum is below it are merged into runs and plotted.
    """
    image = cv2.imread(sys.argv[1])
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binarised = cv2.adaptiveThreshold(
        np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 15, -2)
    # Per-row sum of the thresholded image.
    row_sums = np.sum(binarised, axis=1)
    line_threshold = int(sys.argv[2])
    candidate_rows = np.where(row_sums < line_threshold)[0]
    plot_lines(image, merge_lines(candidate_rows))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
"""
|
||||
usage: python print_text.py file.pdf
|
||||
|
||||
prints horizontal and vertical text lines present in a pdf file.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pprint import pprint
|
||||
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.pdfinterp import PDFPageInterpreter
|
||||
from pdfminer.pdfinterp import PDFResourceManager
|
||||
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
||||
from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal,
|
||||
LTTextLineHorizontal, LTTextLineVertical, LTLine)
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
def extract_text_objects(layout, LTObject, t=None):
    """Recursively collect every ``LTObject`` instance under ``layout``.

    Children are read from ``layout._objs``. A child that is an instance of
    ``LTObject`` is collected without descending into it; any other child
    is searched recursively. Objects without ``_objs`` contribute nothing.

    Args:
        layout: object whose ``_objs`` attribute lists its children.
        LTObject: class (or tuple of classes) to collect.
        t: optional list to append to; a new list is created when None.

    Returns:
        The list ``t`` with the matches appended in traversal order.
    """
    if t is None:
        t = []
    # Only the attribute lookup is guarded, so an AttributeError raised by
    # unrelated code is not silently swallowed.
    try:
        children = layout._objs
    except AttributeError:
        return t
    for obj in children:
        if isinstance(obj, LTObject):
            t.append(obj)
        else:
            t += extract_text_objects(obj, LTObject)
    return t
|
||||
|
||||
|
||||
@timeit
def main():
    """Print the horizontal and vertical text lines of the PDF in argv[1].

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    with open(sys.argv[1], 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # 2.0, 0.5, 0.1
        # NOTE(review): the triple above presumably lists alternative margin
        # values - confirm.
        laparams = LAParams(char_margin=1.0, line_margin=0.5,
                            word_margin=0.1, detect_vertical=True)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # NOTE(review): assumes the per-page report sits inside this loop -
        # confirm against the intended nesting.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            horizontal_lines = extract_text_objects(layout, LTTextLineHorizontal)
            vertical_lines = extract_text_objects(layout, LTTextLineVertical)
            print("number of horizontal text lines -> {0}".format(
                len(horizontal_lines)))
            print("horizontal text lines ->")
            pprint([text_line.get_text() for text_line in horizontal_lines])
            print("number of vertical text lines -> {0}".format(
                len(vertical_lines)))
            print("vertical text lines ->")
            pprint([text_line.get_text() for text_line in vertical_lines])
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
"""
|
||||
usage: python threshold.py file.png blocksize threshold_constant
|
||||
|
||||
shows thresholded image.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||
|
||||
|
||||
@timeit
def main():
    """Adaptive-threshold the image given on the command line and show it.

    ``sys.argv[1]`` is the image, ``sys.argv[2]`` the block size and
    ``sys.argv[3]`` the constant passed to ``cv2.adaptiveThreshold``.
    """
    img = cv2.imread(sys.argv[1])
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blocksize = int(sys.argv[2])
    threshold_constant = float(sys.argv[3])
    threshold = cv2.adaptiveThreshold(
        np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, blocksize, threshold_constant)
    # Show the thresholded result, as the usage text promises, rather than
    # the untouched input image.
    plt.imshow(threshold, cmap='gray')
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # With no command-line arguments show the usage text; otherwise run.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Lattice
|
||||
|
||||
|
||||
# Extract and print the tables of the two column-span sample PDFs.
for pdf_path in ("files/column_span_1.pdf", "files/column_span_2.pdf"):
    # `clean` is passed to Pdf, matching the first call; the second call
    # originally handed it to Lattice instead.
    extractor = Lattice(Pdf(pdf_path, clean=True), scale=30)
    tables = extractor.get_tables()
    print(tables)
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Lattice
|
||||
|
||||
|
||||
# Extract and print the tables of the two row-span sample PDFs; each file
# uses its own scale.
for pdf_path, scale in (("files/row_span_1.pdf", 40),
                        ("files/row_span_2.pdf", 30)):
    extractor = Lattice(Pdf(pdf_path, clean=True), fill='v', scale=scale)
    tables = extractor.get_tables()
    print(tables)
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Lattice
|
||||
|
||||
|
||||
# Extract and print the tables of the two lines-in-background sample PDFs.
for pdf_path in ("files/lines_in_background_1.pdf",
                 "files/lines_in_background_2.pdf"):
    extractor = Lattice(Pdf(pdf_path, clean=True), scale=30, invert=True)
    tables = extractor.get_tables()
    print(tables)
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Lattice
|
||||
|
||||
|
||||
# Extract and print the tables of the left- and right-rotated sample PDFs.
for pdf_path in ("files/left_rotated_table.pdf",
                 "files/right_rotated_table.pdf"):
    extractor = Lattice(Pdf(pdf_path, clean=True), scale=30)
    tables = extractor.get_tables()
    print(tables)
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Lattice
|
||||
|
||||
|
||||
# Extract and print the tables of the two multi-table sample PDFs; each
# file uses its own scale.
for pdf_path, scale in (("files/twotables_1.pdf", 40),
                        ("files/twotables_2.pdf", 30)):
    extractor = Lattice(Pdf(pdf_path, clean=True), scale=scale)
    tables = extractor.get_tables()
    print(tables)
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Stream
|
||||
|
||||
|
||||
# Extract and print the tables of the budget sample PDF with Stream.
pdf = Pdf("files/budget_2014-15.pdf", char_margin=1.0, clean=True)
extractor = Stream(pdf)
tables = extractor.get_tables()
print(tables)
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from camelot import Pdf
|
||||
from camelot import Stream
|
||||
|
||||
|
||||
# Extract and print the tables of two sample PDFs with Stream, each with
# its own column separators and ytol.
for pdf_path, columns, ytol in (
        ("files/inconsistent_rows.pdf", "65,95,285,640,715,780", 10),
        ("files/consistent_rows.pdf", "28,67,180,230,425,475,700", 5)):
    extractor = Stream(Pdf(pdf_path, char_margin=1.0),
                       columns=columns, ytol=ytol)
    tables = extractor.get_tables()
    print(tables)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue