diff --git a/camelot/imgproc.py b/camelot/imgproc.py index 363f59f..eba296b 100644 --- a/camelot/imgproc.py +++ b/camelot/imgproc.py @@ -2,7 +2,7 @@ import cv2 import numpy as np -def adaptive_threshold(imagename, invert=False): +def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): """Thresholds an image using OpenCV's adaptiveThreshold. Parameters @@ -15,6 +15,15 @@ def adaptive_threshold(imagename, invert=False): tables with lines in background. (optional, default: False) + blocksize: int + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + + c: float + Constant subtracted from the mean or weighted mean + (see the details below). Normally, it is positive but may be + zero or negative as well. + Returns ------- img : object @@ -27,14 +36,11 @@ def adaptive_threshold(imagename, invert=False): gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if invert: - threshold = cv2.adaptiveThreshold( - gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, - 15, -0.2) + threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, blocksize, c) else: - threshold = cv2.adaptiveThreshold( - np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, - 15, -0.2) + threshold = cv2.adaptiveThreshold(np.invert(gray), 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c) return img, threshold @@ -137,7 +143,7 @@ def find_table_contours(vertical, horizontal): x, y, w, h = cv2.boundingRect(c_poly) cont.append((x, y, w, h)) return cont - + def find_table_joints(contours, vertical, horizontal): """Finds joints/intersections present inside each table boundary. 
diff --git a/camelot/lattice.py b/camelot/lattice.py index f28c09e..45d15ee 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -1,6 +1,7 @@ from __future__ import division import os import sys +import copy import types import logging import copy_reg @@ -269,7 +270,9 @@ class Lattice: table_bbox = find_table_joints(contours, vmask, hmask) if len(self.mtol) == 1 and self.mtol[0] == 2: - mtolerance = self.mtol * len(table_bbox) + mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) + else: + mtolerance = copy.deepcopy(self.mtol) if self.debug: self.debug_images = (img, table_bbox) diff --git a/camelot/ocr.py b/camelot/ocr.py index 57ddb54..16c6631 100644 --- a/camelot/ocr.py +++ b/camelot/ocr.py @@ -1,4 +1,5 @@ import os +import copy import subprocess import pyocr @@ -100,7 +101,9 @@ class OCR: self.debug_tables = [] if len(self.mtol) == 1 and self.mtol[0] == 2: - self.mtol = self.mtol * len(table_bbox) + mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) + else: + mtolerance = copy.deepcopy(self.mtol) page = {} tables = {} @@ -111,8 +114,8 @@ class OCR: cols, rows = list(cols), list(rows) cols.extend([k[0], k[2]]) rows.extend([k[1], k[3]]) - cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no]) - rows = merge_close_values(sorted(rows, reverse=True), mtol=self.mtol[table_no]) + cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no]) + rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no]) cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] rows = [(rows[i], rows[i + 1]) diff --git a/camelot/stream.py b/camelot/stream.py index e272421..5195ba8 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -1,5 +1,6 @@ from __future__ import division import os +import copy import types import logging import copy_reg @@ -332,9 +333,13 @@ class Stream: table_bbox = {(0, 0, width, height): None} if len(self.ytol) == 1 and self.ytol[0] == 2: - ytolerance = self.ytol * len(table_bbox) + ytolerance = 
copy.deepcopy(self.ytol) * len(table_bbox) + else: + ytolerance = copy.deepcopy(self.ytol) if len(self.mtol) == 1 and self.mtol[0] == 0: - mtolerance = self.mtol * len(table_bbox) + mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) + else: + mtolerance = copy.deepcopy(self.mtol) page = {} tables = {} diff --git a/debug/camelot_scripts/hough_opencv.py b/debug/camelot_scripts/hough_opencv.py new file mode 100644 index 0000000..619dda8 --- /dev/null +++ b/debug/camelot_scripts/hough_opencv.py @@ -0,0 +1,53 @@ +""" +usage: python hough_opencv.py file.png + +find lines present in an image using opencv's hough transform. +""" + +import sys +import time + +import cv2 +import numpy as np +import matplotlib.pyplot as plt + + +def timeit(func): + def timed(*args, **kw): + start = time.time() + result = func(*args, **kw) + end = time.time() + print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) + return result + return timed + + +@timeit +def main(): + image = cv2.imread(sys.argv[1]) + print "image dimensions -> {0}".format(image.shape) + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + edges = cv2.Canny(gray, 50, 150, apertureSize=3) + + lines = cv2.HoughLines(edges, 1, np.pi / 180, 200) + print "found {0} lines".format(len(lines)) + for line in lines: + r, theta = line[0] + # filter horizontal and vertical lines + if theta == 0 or np.isclose(theta, np.pi / 2): + x0 = r * np.cos(theta) + y0 = r * np.sin(theta) + x1 = int(x0 + 10000 * (-np.sin(theta))) + y1 = int(y0 + 10000 * (np.cos(theta))) + x2 = int(x0 - 10000 * (-np.sin(theta))) + y2 = int(y0 - 10000 * (np.cos(theta))) + cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 5) + plt.imshow(image) + plt.show() + + +if __name__ == '__main__': + if len(sys.argv) == 1: + print __doc__ + else: + main() \ No newline at end of file diff --git a/debug/camelot_scripts/hough_skimage.py b/debug/camelot_scripts/hough_skimage.py new file mode 100644 index 0000000..de0f67e --- /dev/null +++ 
b/debug/camelot_scripts/hough_skimage.py @@ -0,0 +1,75 @@ +""" +usage: python hough_skimage.py file.png + +find lines present in an image using scikit-image's hough transform. +""" + +import sys +import time + +import cv2 +import numpy as np +from scipy.misc import imread +import matplotlib.pyplot as plt +from skimage.transform import hough_line, hough_line_peaks + + +def timeit(func): + def timed(*args, **kw): + start = time.time() + result = func(*args, **kw) + end = time.time() + print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) + return result + return timed + + +@timeit +def main(): + image = cv2.imread(sys.argv[1]) + print "image dimensions -> {0}".format(image.shape) + ret, binary = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY) + binary = np.min(binary, axis=2) + binary = np.where(binary == 255, 0, 255) + rows, cols = binary.shape + pixel = np.zeros(binary.shape) + + fig, ax = plt.subplots(1, 1, figsize=(8,4)) + ax.imshow(image, cmap=plt.cm.gray) + + theta_in = np.linspace(0, np.pi / 2, 10) + h, theta, d = hough_line(binary, theta_in) + for _, angle, dist in zip(*hough_line_peaks(h, theta, d)): + x0 = dist * np.cos(angle) + y0 = dist * np.sin(angle) + x1 = int(x0 + 1000 * (-np.sin(angle))) + y1 = int(y0 + 1000 * (np.cos(angle))) + x2 = int(x0 - 1000 * (-np.sin(angle))) + y2 = int(y0 - 1000 * (np.cos(angle))) + ax.plot((x1, x2), (y1, y2), '-r') + a = np.cos(angle) + b = np.sin(angle) + x = np.arange(binary.shape[1]) + y = np.arange(binary.shape[0]) + x = a * x + y = b * y + R = np.round(np.add(y.reshape((binary.shape[0], 1)), x.reshape((1, binary.shape[1])))) + pixel += np.isclose(R, np.round(dist)) + + pixel = np.clip(pixel, 0, 1) + pixel = np.where(pixel == 1, 0, 1) + binary = np.where(binary == 0, 255, 0) + binary *= pixel.astype(np.int64) + ax.imshow(binary, cmap=plt.cm.gray) + ax.axis((0, cols, rows, 0)) + ax.set_title('Detected lines') + ax.set_axis_off() + ax.set_adjustable('box-forced') + plt.show() + + +if __name__ == 
'__main__': + if len(sys.argv) == 1: + print __doc__ + else: + main() \ No newline at end of file diff --git a/debug/camelot_scripts/houghp_skimage.py b/debug/camelot_scripts/houghp_skimage.py new file mode 100644 index 0000000..b1680d2 --- /dev/null +++ b/debug/camelot_scripts/houghp_skimage.py @@ -0,0 +1,49 @@ +""" +usage: python houghp_skimage.py file.png + +find line segments present in an image using scikit-image's probabilistic hough transform. +""" + +import sys +import time + +from scipy.misc import imread +import matplotlib.pyplot as plt +from skimage.feature import canny +from skimage.transform import probabilistic_hough_line + + +def timeit(func): + def timed(*args, **kw): + start = time.time() + result = func(*args, **kw) + end = time.time() + print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) + return result + return timed + + +@timeit +def main(): + image = imread(sys.argv[1], mode='L') + edges = canny(image, 2, 1, 25) + lines = probabilistic_hough_line(edges, threshold=1000) + + fig, ax = plt.subplots(1, 1, figsize=(8,4), sharex=True, sharey=True) + ax.imshow(edges * 0) + + for line in lines: + p0, p1 = line + ax.plot((p0[0], p1[0]), (p0[1], p1[1])) + + ax.set_title('Probabilistic Hough') + ax.set_axis_off() + ax.set_adjustable('box-forced') + plt.show() + + +if __name__ == '__main__': + if len(sys.argv) == 1: + print __doc__ + else: + main() \ No newline at end of file diff --git a/debug/camelot_scripts/morph_transform.py b/debug/camelot_scripts/morph_transform.py new file mode 100644 index 0000000..98c6563 --- /dev/null +++ b/debug/camelot_scripts/morph_transform.py @@ -0,0 +1,103 @@ +""" +usage: python morph_transform.py file.png + +find lines present in an image using opencv's morph transform. 
+""" + +import sys +import time + +import cv2 +import numpy as np +import matplotlib.pyplot as plt + + +def timeit(func): + def timed(*args, **kw): + start = time.time() + result = func(*args, **kw) + end = time.time() + print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) + return result + return timed + + +def mt(imagename, scale=40): + img = cv2.imread(imagename) + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2) + vertical = threshold + horizontal = threshold + + verticalsize = vertical.shape[0] / scale + horizontalsize = horizontal.shape[1] / scale + + ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) + hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) + + vertical = cv2.erode(vertical, ver, (-1, -1)) + vertical = cv2.dilate(vertical, ver, (-1, -1)) + + horizontal = cv2.erode(horizontal, hor, (-1, -1)) + horizontal = cv2.dilate(horizontal, hor, (-1, -1)) + + mask = vertical + horizontal + joints = np.bitwise_and(vertical, horizontal) + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] + + tables = {} + for c in contours: + x, y, w, h = cv2.boundingRect(c) + x1, x2 = x, x + w + y1, y2 = y, y + h + # find number of non-zero values in joints using what boundingRect returns + roi = joints[y:y+h, x:x+w] + jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + if len(jc) <= 4: # remove contours with less than <=4 joints + continue + joint_coords = [] + for j in jc: + jx, jy, jw, jh = cv2.boundingRect(j) + c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2 + joint_coords.append((c1, c2)) + tables[(x1, y2, x2, y1)] = joint_coords + + vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for vc in vcontours: + x, y, w, h = cv2.boundingRect(vc) + x1, x2 = 
x, x + w + y1, y2 = y, y + h + plt.plot([(x1 + x2) / 2, (x1 + x2) / 2], [y2, y1]) + + hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for hc in hcontours: + x, y, w, h = cv2.boundingRect(hc) + x1, x2 = x, x + w + y1, y2 = y, y + h + plt.plot([x1, x2], [(y1 + y2) / 2, (y1 + y2) / 2]) + + x_coord = [] + y_coord = [] + for k in tables.keys(): + for coord in tables[k]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) + plt.plot(x_coord, y_coord, 'ro') + + plt.imshow(img) + plt.show() + return tables + + +@timeit +def main(): + t = mt(sys.argv[1]) + print 'tables found: ', len(t.keys()) + + +if __name__ == '__main__': + if len(sys.argv) == 1: + print __doc__ + else: + main() \ No newline at end of file diff --git a/debug/camelot_scripts/plot_geo.py b/debug/camelot_scripts/plot_geo.py new file mode 100644 index 0000000..a0216b4 --- /dev/null +++ b/debug/camelot_scripts/plot_geo.py @@ -0,0 +1,167 @@ +""" +usage: python plot_geo.py file.pdf + python plot_geo.py file.pdf file.png + +print lines and rectangles present in a pdf file. 
+""" + +import sys +import time + +import cv2 +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfdevice import PDFDevice +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.converter import PDFPageAggregator +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.layout import LAParams, LTLine, LTRect +from pdfminer.pdfpage import PDFTextExtractionNotAllowed + + +MIN_LENGTH = 1 +pdf_x, pdf_y, image_x, image_y = [0] * 4 + + +def timeit(func): + def timed(*args, **kw): + start = time.time() + result = func(*args, **kw) + end = time.time() + print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) + return result + return timed + + +def remove_coords(coords): + merged = [] + for coord in coords: + if not merged: + merged.append(coord) + else: + last = merged[-1] + if np.isclose(last, coord, atol=2): + pass + else: + merged.append(coord) + return merged + + +def parse_layout(pdfname): + global pdf_x, pdf_y + def is_horizontal(line): + if line[1] == line[3]:  # line is (x0, y0, x1, y1); horizontal means y0 == y1 + return True + return False + + def is_vertical(line): + if line[0] == line[2]:  # line is (x0, y0, x1, y1); vertical means x0 == x1 + return True + return False + + vertical, horizontal = [], [] + with open(pdfname, 'rb') as f: + parser = PDFParser(f) + document = PDFDocument(parser) + if not document.is_extractable: + raise PDFTextExtractionNotAllowed + laparams = LAParams() + rsrcmgr = PDFResourceManager() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + for page in PDFPage.create_pages(document): + interpreter.process_page(page) + layout = device.get_result() + pdf_x, pdf_y = layout.bbox[2], layout.bbox[3] + for obj in layout._objs: + if isinstance(obj, LTLine): + line = (obj.x0, obj.y0, obj.x1, obj.y1) + if is_vertical(line): + vertical.append(line) + elif is_horizontal(line): + 
horizontal.append(line) + elif isinstance(obj, LTRect): + vertical.append((obj.x0, obj.y1, obj.x0, obj.y0)) + vertical.append((obj.x1, obj.y1, obj.x1, obj.y0)) + horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1)) + horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0)) + return vertical, horizontal + + +def hough_transform(imagename): + global pdf_x, pdf_y, image_x, image_y + img = cv2.imread(imagename) + image_x, image_y = img.shape[1], img.shape[0] + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + edges = cv2.Canny(gray, 50, 150, apertureSize=3) + lines = cv2.HoughLines(edges, 1, np.pi/180, 1000) + x = [] + for line in lines: + r, theta = line[0] + x0 = r * np.cos(theta) + x0 *= pdf_x / float(image_x) + x.append(x0) + y = [] + for line in lines: + r, theta = line[0] + y0 = r * np.sin(theta) + y0 = abs(y0 - image_y) + y0 *= pdf_y / float(image_y) + y.append(y0) + x = remove_coords(sorted(set([x0 for x0 in x if x0 > 0]))) + y = remove_coords(sorted(set(y), reverse=True)) + return x, y + + +def plot_lines1(vertical, horizontal): + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + ax.set_xlim(0, 1000) + ax.set_ylim(0, 1000) + + vertical = filter(lambda x: abs(x[1] - x[3]) > MIN_LENGTH, vertical) + horizontal = filter(lambda x: abs(x[0] - x[2]) > MIN_LENGTH, horizontal) + for v in vertical: + ax.plot([v[0], v[2]], [v[1], v[3]]) + for h in horizontal: + ax.plot([h[0], h[2]], [h[1], h[3]]) + plt.show() + + +def plot_lines2(imagename, vertical, horizontal): + x, y = hough_transform(imagename) + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + ax.set_xlim(0, 1000) + ax.set_ylim(0, 1000) + + for x0 in x: + for v in vertical: + if np.isclose(x0, v[0], atol=2): + ax.plot([v[0], v[2]], [v[1], v[3]]) + for y0 in y: + for h in horizontal: + if np.isclose(y0, h[1], atol=2): + ax.plot([h[0], h[2]], [h[1], h[3]]) + plt.show() + + +@timeit +def main(): + vertical, horizontal = parse_layout(sys.argv[1]) + if len(sys.argv) == 2: + plot_lines1(vertical, 
horizontal) + elif len(sys.argv) == 3: + plot_lines1(vertical, horizontal) + plot_lines2(sys.argv[2], vertical, horizontal) + + +if __name__ == '__main__': + if len(sys.argv) == 1: + print __doc__ + else: + main() \ No newline at end of file diff --git a/debug/camelot_scripts/print_text.py b/debug/camelot_scripts/print_text.py new file mode 100644 index 0000000..1ab83d2 --- /dev/null +++ b/debug/camelot_scripts/print_text.py @@ -0,0 +1,83 @@ +""" +usage: python print_text.py file.pdf + +prints horizontal and vertical text lines present in a pdf file. +""" + +import sys +import time +from pprint import pprint + +from pdfminer.layout import LAParams +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfdevice import PDFDevice +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.converter import PDFPageAggregator +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfpage import PDFTextExtractionNotAllowed +from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal, + LTTextLineHorizontal, LTTextLineVertical, LTLine) + + +def timeit(func): + def timed(*args, **kw): + start = time.time() + result = func(*args, **kw) + end = time.time() + print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) + return result + return timed + + +def extract_text_objects(layout, LTObject, t=None): + if t is None: + t = [] + try: + for obj in layout._objs: + if isinstance(obj, LTObject): + t.append(obj) + else: + t += extract_text_objects(obj, LTObject) + except AttributeError: + pass + return t + + +@timeit +def main(): + with open(sys.argv[1], 'rb') as f: + parser = PDFParser(f) + document = PDFDocument(parser) + if not document.is_extractable: + raise PDFTextExtractionNotAllowed + # 2.0, 0.5, 0.1 + kwargs = { + 'char_margin': 1.0, + 'line_margin': 0.5, + 'word_margin': 0.1, + 'detect_vertical': True + } + laparams = LAParams(**kwargs) + 
rsrcmgr = PDFResourceManager() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + for page in PDFPage.create_pages(document): + interpreter.process_page(page) + layout = device.get_result() + lh = extract_text_objects(layout, LTTextLineHorizontal) + lv = extract_text_objects(layout, LTTextLineVertical) + print "number of horizontal text lines -> {0}".format(len(lh)) + print "horizontal text lines ->" + pprint([t.get_text() for t in lh]) + print "number of vertical text lines -> {0}".format(len(lv)) + print "vertical text lines ->" + pprint([t.get_text() for t in lv]) + + +if __name__ == '__main__': + if len(sys.argv) == 1: + print __doc__ + else: + main() \ No newline at end of file diff --git a/tools/camelot b/tools/camelot index 91ddfe9..d282e96 100755 --- a/tools/camelot +++ b/tools/camelot @@ -389,7 +389,7 @@ if __name__ == '__main__': parallel=args['--parallel'], clean=True) data = manager.extract() - + processing_time = time.time() - start_time logger.info("Finished processing in " + str(processing_time) + " seconds") @@ -413,7 +413,7 @@ if __name__ == '__main__': if 'rc' in plot_type: plot_rc_piechart(data, pngname) - + if args['--print-stats']: print_stats(data, processing_time) @@ -455,10 +455,10 @@ if __name__ == '__main__': parallel=args['--parallel'], clean=True) data = manager.extract() - + processing_time = time.time() - start_time logger.info("Finished processing in " + str(processing_time) + " seconds") - + if args['--plot']: if args['--output']: pngname = os.path.join(args['--output'], os.path.basename(pngname)) @@ -482,7 +482,7 @@ if __name__ == '__main__': if args['--print-stats']: print_stats(data, processing_time) - + if args['--save-stats']: if args['--output']: scorename = os.path.join(args['--output'], os.path.basename(scorename)) @@ -498,7 +498,7 @@ if __name__ == '__main__': table['ncols'], table['empty_p'], table['score'])) - + if args['--debug']: manager.debug_plot() except 
Exception as e: