camelot-py/debug/plot_geo.py

"""
usage:  python plot_geo.py file.pdf
        python plot_geo.py file.pdf file.png

prints lines and rectangles present in a pdf file.
"""

import sys
import time

import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams, LTLine, LTRect
from pdfminer.pdfpage import PDFTextExtractionNotAllowed


# plot_lines1() only draws segments whose extent along their own axis is
# strictly greater than this value.
MIN_LENGTH = 1
# Module-level state shared through ``global`` statements:
#   pdf_x, pdf_y     - written by parse_layout() from layout.bbox[2] and
#                      layout.bbox[3] of the page processed last.
#   image_x, image_y - written by hough_transform() from img.shape[1] and
#                      img.shape[0] of the image it loads.
# hough_transform() reads all four to rescale image coordinates.
pdf_x, pdf_y, image_x, image_y = [0] * 4


def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapper forwards all positional and keyword arguments to *func*,
    prints the elapsed wall-clock time (measured with ``time.time()``) and
    returns whatever *func* returned.
    """
    # Imported locally so the decorator does not depend on the file-level
    # import block.
    import functools

    # functools.wraps keeps func's __name__ and __doc__ on the wrapper.
    @functools.wraps(func)
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single parenthesised argument prints the same text on Python 2
        # and Python 3.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed


def remove_coords(coords):
    """Drop every coordinate that is close to the last one that was kept.

    Walks *coords* in order and keeps a value only when it is not within
    ``np.isclose(..., atol=2)`` of the most recently kept value. Returns the
    kept values as a new list.
    """
    kept = []
    for value in coords:
        # Compare against the last kept value, not the previous input value.
        if kept and np.isclose(kept[-1], value, atol=2):
            continue
        kept.append(value)
    return kept


def parse_layout(pdfname):
    """Collect the vertical and horizontal segments found in a PDF.

    Every page of *pdfname* is run through pdfminer. Each ``LTLine`` is
    classified by its end points, and each ``LTRect`` contributes its two
    vertical and two horizontal edges. Segments are ``(x0, y0, x1, y1)``
    style 4-tuples and are accumulated across all pages.

    Side effect: the module globals ``pdf_x`` and ``pdf_y`` are set to
    ``layout.bbox[2]`` and ``layout.bbox[3]`` of the page processed last.

    Returns a ``(vertical, horizontal)`` pair of lists.
    Raises ``PDFTextExtractionNotAllowed`` when the document reports that
    it is not extractable.
    """
    global pdf_x, pdf_y

    def is_horizontal(line):
        # A horizontal segment has the same y at both end points.
        return line[1] == line[3]

    def is_vertical(line):
        # A vertical segment has the same x at both end points.
        return line[0] == line[2]

    vertical, horizontal = [], []
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
            # Layout containers are iterable; no need to reach into _objs.
            for obj in layout:
                if isinstance(obj, LTLine):
                    line = (obj.x0, obj.y0, obj.x1, obj.y1)
                    # Lines that are neither vertical nor horizontal are
                    # ignored.
                    if is_vertical(line):
                        vertical.append(line)
                    elif is_horizontal(line):
                        horizontal.append(line)
                elif isinstance(obj, LTRect):
                    # Left and right edges, then top and bottom edges.
                    vertical.append((obj.x0, obj.y1, obj.x0, obj.y0))
                    vertical.append((obj.x1, obj.y1, obj.x1, obj.y0))
                    horizontal.append((obj.x0, obj.y1, obj.x1, obj.y1))
                    horizontal.append((obj.x0, obj.y0, obj.x1, obj.y0))
    return vertical, horizontal


def hough_transform(imagename):
    """Detect straight lines in an image and return their x / y positions.

    The image is converted to grayscale, passed through Canny edge
    detection and then ``cv2.HoughLines``. For every detected
    ``(r, theta)`` pair, ``r * cos(theta)`` and ``r * sin(theta)`` are
    rescaled by ``pdf_x / image_x`` and ``pdf_y / image_y``; the y value is
    first measured from the bottom of the image (presumably to match the
    PDF coordinate origin; confirm against the caller).

    Side effect: the module globals ``image_x`` and ``image_y`` are set to
    the image width and height. ``pdf_x`` and ``pdf_y`` are read, so
    parse_layout() has to run first for the scaling to be meaningful.

    Returns ``(x, y)``: *x* holds the positive x positions in ascending
    order and *y* the y positions in descending order, both reduced with
    remove_coords(). Two empty lists are returned when no line is detected.
    Raises ``IOError`` when the image cannot be read.
    """
    global pdf_x, pdf_y, image_x, image_y
    img = cv2.imread(imagename)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise IOError('could not read image: %s' % imagename)
    image_x, image_y = img.shape[1], img.shape[0]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi/180, 1000)
    # cv2.HoughLines returns None when nothing reaches the threshold.
    if lines is None:
        return [], []
    x, y = [], []
    for line in lines:
        r, theta = line[0]
        x0 = r * np.cos(theta)
        x0 *= pdf_x / float(image_x)
        x.append(x0)
        y0 = r * np.sin(theta)
        # Measure from the bottom edge of the image instead of the top.
        y0 = abs(y0 - image_y)
        y0 *= pdf_y / float(image_y)
        y.append(y0)
    x = remove_coords(sorted(set(x0 for x0 in x if x0 > 0)))
    y = remove_coords(sorted(set(y), reverse=True))
    return x, y


def plot_lines1(vertical, horizontal):
    """Draw all sufficiently long segments and show the figure.

    Vertical segments are kept when their y extent exceeds MIN_LENGTH,
    horizontal ones when their x extent does. The axes are fixed to the
    range 0..1000 in both directions with an equal aspect ratio.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111, aspect='equal')
    axes.set_xlim(0, 1000)
    axes.set_ylim(0, 1000)

    # (segments, index of start coordinate, index of end coordinate) along
    # the axis on which the segment length is measured.
    for segments, lo, hi in ((vertical, 1, 3), (horizontal, 0, 2)):
        for seg in segments:
            if abs(seg[lo] - seg[hi]) > MIN_LENGTH:
                axes.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()


def plot_lines2(imagename, vertical, horizontal):
    """Draw only the PDF segments that line up with lines found in an image.

    hough_transform() supplies x and y positions detected in *imagename*.
    A vertical segment is drawn for every detected x within
    ``np.isclose(..., atol=2)`` of its first x coordinate, and a horizontal
    segment for every detected y close to its first y coordinate. The axes
    are fixed to the range 0..1000 with an equal aspect ratio.
    """
    x, y = hough_transform(imagename)
    figure = plt.figure()
    axes = figure.add_subplot(111, aspect='equal')
    axes.set_xlim(0, 1000)
    axes.set_ylim(0, 1000)

    # (detected positions, candidate segments, index of the coordinate to
    # compare); vertical matches are drawn before horizontal ones.
    for positions, segments, axis in ((x, vertical, 0), (y, horizontal, 1)):
        for pos in positions:
            for seg in segments:
                if np.isclose(pos, seg[axis], atol=2):
                    axes.plot([seg[0], seg[2]], [seg[1], seg[3]])
    plt.show()


@timeit
def main():
    """Parse the PDF named on the command line and plot its segments.

    With one argument (a PDF) all segments are plotted. With two arguments
    (a PDF and an image) a second figure shows only the segments matching
    lines detected in the image. With any other number of arguments the
    PDF is still parsed but nothing is plotted.
    """
    argc = len(sys.argv)
    vertical, horizontal = parse_layout(sys.argv[1])
    if argc not in (2, 3):
        return
    plot_lines1(vertical, horizontal)
    if argc == 3:
        plot_lines2(sys.argv[2], vertical, horizontal)

if __name__ == '__main__':
    # Without arguments, show the usage text from the module docstring.
    if len(sys.argv) == 1:
        # A single parenthesised argument prints the same text on Python 2
        # and Python 3.
        print(__doc__)
    else:
        main()