Add deepcopy and debug scripts

2017-04-10 18:59:48 +05:30 · 2017-04-10 18:59:48 +05:30 · 84d354ba10
parent 4dd0d2330e
commit 84d354ba10
11 changed files with 568 additions and 21 deletions
--- a/camelot/imgproc.py
+++ b/camelot/imgproc.py
@ -2,7 +2,7 @@ import cv2
 import numpy as np
-def adaptive_threshold(imagename, invert=False):
+def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
    """Thresholds an image using OpenCV's adaptiveThreshold.
    Parameters
@ -15,6 +15,15 @@ def adaptive_threshold(imagename, invert=False):
        tables with lines in background.
        (optional, default: False)
    blocksize: int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
    c: float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
    Returns
    -------
    img : object
@ -27,14 +36,11 @@ def adaptive_threshold(imagename, invert=False):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if invert:
-        threshold = cv2.adaptiveThreshold(
+        threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
+            cv2.THRESH_BINARY, blocksize, c)
            15, -0.2)
    else:
-        threshold = cv2.adaptiveThreshold(
+        threshold = cv2.adaptiveThreshold(np.invert(gray), 255,
-            np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c)
            cv2.THRESH_BINARY,
            15, -0.2)
    return img, threshold
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@ -1,6 +1,7 @@
 from __future__ import division
 import os
 import sys
 import copy
 import types
 import logging
 import copy_reg
@ -269,7 +270,9 @@ class Lattice:
            table_bbox = find_table_joints(contours, vmask, hmask)
        if len(self.mtol) == 1 and self.mtol[0] == 2:
-            mtolerance = self.mtol * len(table_bbox)
+            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
        else:
            mtolerance = copy.deepcopy(self.mtol)
        if self.debug:
            self.debug_images = (img, table_bbox)
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@ -1,4 +1,5 @@
 import os
 import copy
 import subprocess
 import pyocr
@ -100,7 +101,9 @@ class OCR:
            self.debug_tables = []
        if len(self.mtol) == 1 and self.mtol[0] == 2:
-            self.mtol = self.mtol * len(table_bbox)
+            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
        else:
            mtolerance = copy.deepcopy(self.mtol)
        page = {}
        tables = {}
@ -111,8 +114,8 @@ class OCR:
            cols, rows = list(cols), list(rows)
            cols.extend([k[0], k[2]])
            rows.extend([k[1], k[3]])
-            cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
+            cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
-            rows = merge_close_values(sorted(rows, reverse=True), mtol=self.mtol[table_no])
+            rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no])
            cols = [(cols[i], cols[i + 1])
                    for i in range(0, len(cols) - 1)]
            rows = [(rows[i], rows[i + 1])
--- a/camelot/stream.py
+++ b/camelot/stream.py
@ -1,5 +1,6 @@
 from __future__ import division
 import os
 import copy
 import types
 import logging
 import copy_reg
@ -332,9 +333,13 @@ class Stream:
            table_bbox = {(0, 0, width, height): None}
        if len(self.ytol) == 1 and self.ytol[0] == 2:
-            ytolerance = self.ytol * len(table_bbox)
+            ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
        else:
            ytolerance = copy.deepcopy(self.ytol)
        if len(self.mtol) == 1 and self.mtol[0] == 0:
-            mtolerance = self.mtol * len(table_bbox)
+            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
        else:
            mtolerance = copy.deepcopy(self.mtol)
        page = {}
        tables = {}
--- a/debug/camelot_scripts/hough_opencv.py
+++ b/debug/camelot_scripts/hough_opencv.py
@ -0,0 +1,53 @@
 """
 usage: python hough_opencv.py file.png
 find lines present in an image using opencv's hough transform.
 """
 import sys
 import time
 import cv2
 import numpy as np
 import matplotlib.pyplot as plt
 def timeit(func):
    """Decorate ``func`` so that each call prints how long it took.

    Parameters
    ----------
    func : callable
        Function to wrap; its ``__name__`` is used in the printed report.

    Returns
    -------
    timed : callable
        Wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock time and returns ``func``'s result unchanged. Nothing is
        printed when ``func`` raises.
    """
    # imported locally so the decorator does not depend on the script's import block
    import functools
    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        elapsed = time.time() - start
        # parenthesised single argument prints identically on Python 2 and 3
        print('Function: %r took: %2.4f seconds' % (func.__name__, elapsed))
        return result
    return timed
@timeit
def main():
    """Draw the horizontal and vertical Hough lines found in an image.

    Reads the image named by ``sys.argv[1]``, runs Canny edge detection and
    OpenCV's standard Hough transform, draws every detected line whose angle
    is 0 or close to pi / 2 onto the image, and shows the result with
    matplotlib.
    """
    image = cv2.imread(sys.argv[1])
    print("image dimensions -> {0}".format(image.shape))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
    # HoughLines hands back None (not an empty array) when no line reaches
    # the accumulator threshold; treat that as "zero lines" instead of crashing.
    if lines is None:
        lines = []
    print("found {0} lines".format(len(lines)))
    for line in lines:
        r, theta = line[0]
        # keep only lines whose normal angle is 0 or about pi / 2
        if theta == 0 or np.isclose(theta, np.pi / 2):
            cos_t = np.cos(theta)
            sin_t = np.sin(theta)
            # (x0, y0) is the point of the line closest to the origin; extend
            # far in both directions along the line so it spans the image
            x0 = r * cos_t
            y0 = r * sin_t
            x1 = int(x0 + 10000 * (-sin_t))
            y1 = int(y0 + 10000 * (cos_t))
            x2 = int(x0 - 10000 * (-sin_t))
            y2 = int(y0 - 10000 * (cos_t))
            cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 5)
    plt.imshow(image)
    plt.show()
 if __name__ == '__main__':
    # With an argument, process it; with none, fall back to the usage text
    # held in the module docstring.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
--- a/debug/camelot_scripts/hough_skimage.py
+++ b/debug/camelot_scripts/hough_skimage.py
@ -0,0 +1,75 @@
 """
 usage: python hough_skimage.py file.png
 find lines present in an image using scikit-image's hough transform.
 """
 import sys
 import time
 import cv2
 import numpy as np
 from scipy.misc import imread
 import matplotlib.pyplot as plt
 from skimage.transform import hough_line, hough_line_peaks
 def timeit(func):
    """Decorate ``func`` so that each call prints how long it took.

    Parameters
    ----------
    func : callable
        Function to wrap; its ``__name__`` is used in the printed report.

    Returns
    -------
    timed : callable
        Wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock time and returns ``func``'s result unchanged. Nothing is
        printed when ``func`` raises.
    """
    # imported locally so the decorator does not depend on the script's import block
    import functools
    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        elapsed = time.time() - start
        # parenthesised single argument prints identically on Python 2 and 3
        print('Function: %r took: %2.4f seconds' % (func.__name__, elapsed))
        return result
    return timed
@timeit
def main():
    """Erase the Hough-detected lines from an image and display the result.

    The image named by ``sys.argv[1]`` is binarised, scikit-image's Hough
    transform is evaluated on ten angles between 0 and pi / 2, every peak is
    drawn in red, and the pixels lying on a detected line are blanked out of
    the binary image before it is shown.
    """
    image = cv2.imread(sys.argv[1])
    print("image dimensions -> {0}".format(image.shape))
    _ret, thresholded = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    # a pixel becomes 0 only when all three channels thresholded to 255,
    # every other pixel becomes 255
    binary = np.where(np.min(thresholded, axis=2) == 255, 0, 255)
    rows, cols = binary.shape
    on_line = np.zeros(binary.shape)
    fig, ax = plt.subplots(1, 1, figsize=(8,4))
    ax.imshow(image, cmap=plt.cm.gray)
    h, theta, d = hough_line(binary, np.linspace(0, np.pi / 2, 10))
    col_index = np.arange(cols)
    row_index = np.arange(rows)
    for _, angle, dist in zip(*hough_line_peaks(h, theta, d)):
        cos_a = np.cos(angle)
        sin_a = np.sin(angle)
        x0 = dist * cos_a
        y0 = dist * sin_a
        # segment through (x0, y0) extended 1000 units in both directions
        xs = (int(x0 + 1000 * (-sin_a)), int(x0 - 1000 * (-sin_a)))
        ys = (int(y0 + 1000 * (cos_a)), int(y0 - 1000 * (cos_a)))
        ax.plot(xs, ys, '-r')
        # rounded x*cos + y*sin for every pixel; pixels whose value matches
        # the peak's rounded distance are counted as lying on this line
        rho = np.round((sin_a * row_index)[:, np.newaxis] +
                       (cos_a * col_index)[np.newaxis, :])
        on_line += np.isclose(rho, np.round(dist))
    # 0 where at least one line passes through the pixel, 1 elsewhere
    keep = np.where(np.clip(on_line, 0, 1) == 1, 0, 1)
    binary = np.where(binary == 0, 255, 0)
    binary *= keep.astype(np.int64)
    ax.imshow(binary, cmap=plt.cm.gray)
    ax.axis((0, cols, rows, 0))
    ax.set_title('Detected lines')
    ax.set_axis_off()
    ax.set_adjustable('box-forced')
    plt.show()
 if __name__ == '__main__':
    # With an argument, process it; with none, fall back to the usage text
    # held in the module docstring.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
--- a/debug/camelot_scripts/houghp_skimage.py
+++ b/debug/camelot_scripts/houghp_skimage.py
@ -0,0 +1,49 @@
 """
 usage: python houghp_skimage.py file.png
 find lines present in an image using scikit-image's probabilistic hough transform.
 """
 import sys
 import time
 from scipy.misc import imread
 import matplotlib.pyplot as plt
 from skimage.feature import canny
 from skimage.transform import probabilistic_hough_line
 def timeit(func):
    """Decorate ``func`` so that each call prints how long it took.

    Parameters
    ----------
    func : callable
        Function to wrap; its ``__name__`` is used in the printed report.

    Returns
    -------
    timed : callable
        Wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock time and returns ``func``'s result unchanged. Nothing is
        printed when ``func`` raises.
    """
    # imported locally so the decorator does not depend on the script's import block
    import functools
    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        elapsed = time.time() - start
        # parenthesised single argument prints identically on Python 2 and 3
        print('Function: %r took: %2.4f seconds' % (func.__name__, elapsed))
        return result
    return timed
@timeit
def main():
    """Plot the segments found by scikit-image's probabilistic Hough transform.

    The image named by ``sys.argv[1]`` is loaded with ``mode='L'``, passed
    through Canny edge detection, and every segment returned by
    ``probabilistic_hough_line`` is drawn on a blank canvas of the same shape.
    """
    gray = imread(sys.argv[1], mode='L')
    edge_map = canny(gray, 2, 1, 25)
    segments = probabilistic_hough_line(edge_map, threshold=1000)
    fig, ax = plt.subplots(1, 1, figsize=(8,4), sharex=True, sharey=True)
    # an all-zero image of the right shape serves as the background
    ax.imshow(edge_map * 0)
    for start, end in segments:
        ax.plot((start[0], end[0]), (start[1], end[1]))
    ax.set_title('Probabilistic Hough')
    ax.set_axis_off()
    ax.set_adjustable('box-forced')
    plt.show()
 if __name__ == '__main__':
    # With an argument, process it; with none, fall back to the usage text
    # held in the module docstring.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
--- a/debug/camelot_scripts/morph_transform.py
+++ b/debug/camelot_scripts/morph_transform.py
@ -0,0 +1,103 @@
 """
 usage: python morph_transform.py file.png
 find lines present in an image using opencv's morph transform.
 """
 import sys
 import time
 import cv2
 import numpy as np
 import matplotlib.pyplot as plt
 def timeit(func):
    """Decorate ``func`` so that each call prints how long it took.

    Parameters
    ----------
    func : callable
        Function to wrap; its ``__name__`` is used in the printed report.

    Returns
    -------
    timed : callable
        Wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock time and returns ``func``'s result unchanged. Nothing is
        printed when ``func`` raises.
    """
    # imported locally so the decorator does not depend on the script's import block
    import functools
    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        elapsed = time.time() - start
        # parenthesised single argument prints identically on Python 2 and 3
        print('Function: %r took: %2.4f seconds' % (func.__name__, elapsed))
        return result
    return timed
 def mt(imagename, scale=40):
    """Find table regions and their line joints with morphological transforms.

    The image is thresholded, vertical and horizontal line masks are obtained
    by eroding and dilating with 1-pixel-wide rectangular kernels, and the
    contours of the combined mask are treated as table candidates. The
    detected lines and joints are plotted over the image with matplotlib.

    Parameters
    ----------
    imagename : str
        Path passed to cv2.imread.
    scale : int
        Divisor applied to the image height/width to obtain the length of
        the vertical/horizontal structuring elements.
        (optional, default: 40)

    Returns
    -------
    tables : dict
        Maps (x1, y2, x2, y1) of each kept bounding rectangle to the list of
        (x, y) centres of the joints found inside it.
    """
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # threshold the inverted grayscale image
    threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)
    vertical = threshold
    horizontal = threshold
    # NOTE(review): presumably relies on Python 2 integer division so the
    # kernel sizes below are ints -- confirm before running on Python 3
    verticalsize = vertical.shape[0] / scale
    horizontalsize = horizontal.shape[1] / scale
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
    # erode then dilate with each kernel to produce the two line masks
    vertical = cv2.erode(vertical, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))
    horizontal = cv2.erode(horizontal, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
    # mask combines both line masks; joints keeps only where they overlap
    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)
    # NOTE(review): unpacking two values assumes an OpenCV version whose
    # findContours returns (contours, hierarchy) -- confirm
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # only the ten largest contours by area are considered
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
    tables = {}
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        # find the joint contours inside the region that boundingRect returns
        roi = joints[y:y+h, x:x+w]
        jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(jc) <= 4: # skip regions with 4 or fewer joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            # centre of the joint's bounding box, shifted from roi coordinates
            # back to full-image coordinates
            c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2
            joint_coords.append((c1, c2))
        tables[(x1, y2, x2, y1)] = joint_coords
    # draw each vertical line contour as a segment through its box's mid x
    vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for vc in vcontours:
        x, y, w, h = cv2.boundingRect(vc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([(x1 + x2) / 2, (x1 + x2) / 2], [y2, y1])
    # draw each horizontal line contour as a segment through its box's mid y
    hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for hc in hcontours:
        x, y, w, h = cv2.boundingRect(hc)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        plt.plot([x1, x2], [(y1 + y2) / 2, (y1 + y2) / 2])
    # gather every joint of every kept table and mark them with red dots
    x_coord = []
    y_coord = []
    for k in tables.keys():
        for coord in tables[k]:
            x_coord.append(coord[0])
            y_coord.append(coord[1])
    plt.plot(x_coord, y_coord, 'ro')
    plt.imshow(img)
    plt.show()
    return tables
@timeit
 def main():
    t = mt(sys.argv[1])
    print 'tables found: ', len(t.keys())
 if __name__ == '__main__':
    # With an argument, process it; with none, fall back to the usage text
    # held in the module docstring.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
--- a/debug/camelot_scripts/plot_geo.py
+++ b/debug/camelot_scripts/plot_geo.py
@ -0,0 +1,167 @@
 """
 usage:  python plot_geo.py file.pdf
        python plot_geo.py file.pdf file.png
 print lines and rectangles present in a pdf file.
 """
 import sys
 import time
 import cv2
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.layout import LAParams, LTLine, LTRect
 from pdfminer.pdfpage import PDFTextExtractionNotAllowed
 # segments whose extent does not exceed this value are dropped by plot_lines1
 MIN_LENGTH = 1
 # page size (set by parse_layout from layout.bbox) and image size (set by
 # hough_transform from img.shape); hough_transform uses their ratio for scaling
 pdf_x, pdf_y, image_x, image_y = [0] * 4
 def timeit(func):
    """Decorate ``func`` so that each call prints how long it took.

    Parameters
    ----------
    func : callable
        Function to wrap; its ``__name__`` is used in the printed report.

    Returns
    -------
    timed : callable
        Wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock time and returns ``func``'s result unchanged. Nothing is
        printed when ``func`` raises.
    """
    # imported locally so the decorator does not depend on the script's import block
    import functools
    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        elapsed = time.time() - start
        # parenthesised single argument prints identically on Python 2 and 3
        print('Function: %r took: %2.4f seconds' % (func.__name__, elapsed))
        return result
    return timed
 def remove_coords(coords):
    """Drop coordinates that are close to the previously kept one.

    Parameters
    ----------
    coords : iterable of float
        Coordinates, visited in the order given.

    Returns
    -------
    kept : list
        The first coordinate plus every later one that np.isclose (atol=2)
        does not consider close to the most recently kept value.
    """
    kept = []
    for value in coords:
        # compare against the last kept value, not the previous input value
        if kept and np.isclose(kept[-1], value, atol=2):
            continue
        kept.append(value)
    return kept
 def parse_layout(pdfname):
    """Collect the straight line segments drawn on the pages of a PDF.

    Every LTLine that is exactly vertical or exactly horizontal is kept, and
    every LTRect contributes its four edges. As a side effect the module-level
    ``pdf_x`` / ``pdf_y`` are set to ``layout.bbox[2]`` / ``layout.bbox[3]``
    of the last page processed.

    Parameters
    ----------
    pdfname : str
        Path of the PDF file to read.

    Returns
    -------
    vertical, horizontal : list of tuple
        Segments as 4-tuples of (x, y, x, y) end-point coordinates.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    global pdf_x, pdf_y
    def is_horizontal(line):
        # both end points share the same y
        return line[1] == line[3]
    def is_vertical(line):
        # both end points share the same x
        return line[0] == line[2]
    vertical, horizontal = [], []
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
            for obj in layout._objs:
                if isinstance(obj, LTLine):
                    line = (obj.x0, obj.y0, obj.x1, obj.y1)
                    # slanted lines match neither test and are ignored
                    if is_vertical(line):
                        vertical.append(line)
                    elif is_horizontal(line):
                        horizontal.append(line)
                elif isinstance(obj, LTRect):
                    # left and right edges, then top and bottom edges
                    vertical.append((obj.x0, obj.y1, obj.x0, obj.y0))
                    vertical.append((obj.x1, obj.y1, obj.x1, obj.y0))
                    horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1))
                    horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0))
    return vertical, horizontal
 def hough_transform(imagename):
    """Locate Hough lines in an image and express them in PDF coordinates.

    Sets the module-level ``image_x`` / ``image_y`` to the image width and
    height, and reads ``pdf_x`` / ``pdf_y`` to scale pixel positions to PDF
    units (the y axis is flipped by measuring from ``image_y``).

    Parameters
    ----------
    imagename : str
        Path passed to cv2.imread.

    Returns
    -------
    x : list of float
        Positive scaled x positions in ascending order, with values close to
        the previously kept one removed.
    y : list of float
        Scaled y positions in descending order, filtered the same way.
        Both lists are empty when no line is detected.
    """
    global pdf_x, pdf_y, image_x, image_y
    img = cv2.imread(imagename)
    image_x, image_y = img.shape[1], img.shape[0]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi/180, 1000)
    # HoughLines hands back None (not an empty array) when no line reaches
    # the accumulator threshold; report "nothing found" instead of crashing.
    if lines is None:
        return [], []
    x = []
    y = []
    for line in lines:
        r, theta = line[0]
        x0 = r * np.cos(theta)
        x0 *= pdf_x / float(image_x)
        x.append(x0)
        y0 = r * np.sin(theta)
        # flip the y axis before scaling to PDF units
        y0 = abs(y0 - image_y)
        y0 *= pdf_y / float(image_y)
        y.append(y0)
    x = remove_coords(sorted(set([x0 for x0 in x if x0 > 0])))
    y = remove_coords(sorted(set(y), reverse=True))
    return x, y
 def plot_lines1(vertical, horizontal):
    """Plot every segment whose extent exceeds MIN_LENGTH.

    Parameters
    ----------
    vertical : list of tuple
        Segments filtered on the difference of their second and fourth values.
    horizontal : list of tuple
        Segments filtered on the difference of their first and third values.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    ax.set_xlim(0, 1000)
    ax.set_ylim(0, 1000)
    tall = [seg for seg in vertical if abs(seg[1] - seg[3]) > MIN_LENGTH]
    wide = [seg for seg in horizontal if abs(seg[0] - seg[2]) > MIN_LENGTH]
    # vertical segments are drawn first, then horizontal ones
    for segments in (tall, wide):
        for seg in segments:
            ax.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()
 def plot_lines2(imagename, vertical, horizontal):
    """Plot only the segments that coincide with a Hough line of the image.

    Parameters
    ----------
    imagename : str
        Image handed to hough_transform to obtain the x and y positions.
    vertical : list of tuple
        Segments matched on their first value against the x positions.
    horizontal : list of tuple
        Segments matched on their second value against the y positions.
    """
    x, y = hough_transform(imagename)
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    ax.set_xlim(0, 1000)
    ax.set_ylim(0, 1000)
    def draw_matching(positions, segments, index):
        # plot each segment once per position that lies within atol=2 of it
        for position in positions:
            for seg in segments:
                if np.isclose(position, seg[index], atol=2):
                    ax.plot([seg[0], seg[2]], [seg[1], seg[3]])
    draw_matching(x, vertical, 0)
    draw_matching(y, horizontal, 1)
    plt.show()
@timeit
def main():
    """Plot the line segments of the PDF named by ``sys.argv[1]``.

    With one argument only the PDF's own segments are plotted; with a second
    argument (an image) the segments matching that image's Hough lines are
    plotted afterwards as well.
    """
    vertical, horizontal = parse_layout(sys.argv[1])
    argc = len(sys.argv)
    if argc in (2, 3):
        plot_lines1(vertical, horizontal)
    if argc == 3:
        plot_lines2(sys.argv[2], vertical, horizontal)
 if __name__ == '__main__':
    # With an argument, process it; with none, fall back to the usage text
    # held in the module docstring.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)
--- a/debug/camelot_scripts/print_text.py
+++ b/debug/camelot_scripts/print_text.py
@ -0,0 +1,83 @@
 """
 usage: python print_text.py file.pdf
 prints horizontal and vertical text lines present in a pdf file.
 """
 import sys
 import time
 from pprint import pprint
 from pdfminer.layout import LAParams
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfpage import PDFTextExtractionNotAllowed
 from pdfminer.layout import (LAParams, LTChar, LTAnno, LTTextBoxHorizontal,
                             LTTextLineHorizontal, LTTextLineVertical, LTLine)
 def timeit(func):
    """Decorate ``func`` so that each call prints how long it took.

    Parameters
    ----------
    func : callable
        Function to wrap; its ``__name__`` is used in the printed report.

    Returns
    -------
    timed : callable
        Wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock time and returns ``func``'s result unchanged. Nothing is
        printed when ``func`` raises.
    """
    # imported locally so the decorator does not depend on the script's import block
    import functools
    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        elapsed = time.time() - start
        # parenthesised single argument prints identically on Python 2 and 3
        print('Function: %r took: %2.4f seconds' % (func.__name__, elapsed))
        return result
    return timed
 def extract_text_objects(layout, LTObject, t=None):
    """Recursively gather the objects of a given type from a layout tree.

    Parameters
    ----------
    layout : object
        Node whose ``_objs`` children are scanned; a node without ``_objs``
        contributes nothing.
    LTObject : type
        Class to look for. A matching child is collected and not descended
        into; any other child is searched recursively.
    t : list
        List to append the matches to.
        (optional, default: None, meaning a new list)

    Returns
    -------
    t : list
        The list that received the matches, in traversal order.
    """
    found = [] if t is None else t
    try:
        for child in layout._objs:
            if not isinstance(child, LTObject):
                found += extract_text_objects(child, LTObject)
                continue
            found.append(child)
    except AttributeError:
        # leaf nodes have no _objs attribute; nothing to collect from them
        pass
    return found
@timeit
def main():
    """Print the horizontal and vertical text lines of each page of a PDF.

    The PDF named by ``sys.argv[1]`` is laid out page by page with vertical
    text detection enabled; for every page the count and the text of the
    horizontal lines are printed, followed by the same for vertical lines.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    with open(sys.argv[1], 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # original note: 2.0, 0.5, 0.1 (presumably the margins being
        # replaced -- confirm against pdfminer's LAParams)
        laparams = LAParams(char_margin=1.0, line_margin=0.5,
                            word_margin=0.1, detect_vertical=True)
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            horizontal = extract_text_objects(layout, LTTextLineHorizontal)
            vertical = extract_text_objects(layout, LTTextLineVertical)
            print("number of horizontal text lines -> {0}".format(len(horizontal)))
            print("horizontal text lines ->")
            pprint([line.get_text() for line in horizontal])
            print("number of vertical text lines -> {0}".format(len(vertical)))
            print("vertical text lines ->")
            pprint([line.get_text() for line in vertical])
 if __name__ == '__main__':
    # With an argument, process it; with none, fall back to the usage text
    # held in the module docstring.
    if len(sys.argv) != 1:
        main()
    else:
        print(__doc__)