camelot-py/morph_transform.py

import cv2
import sys
import subprocess
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar

def transform(x, y, img_x, img_y, pdf_x, pdf_y):
	x *= pdf_x / float(img_x)
	y = abs(y - img_y)
	y *= pdf_y / float(img_y)
	return x, y

# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
def morph(imagename, p_x, p_y, s):
	img = cv2.imread(imagename)
	img_x, img_y = img.shape[1], img.shape[0]
	pdf_x, pdf_y = p_x, p_y
	gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2)
	vertical = th1
	horizontal = th1

	scale = s
	verticalsize = vertical.shape[0] / scale
	horizontalsize = horizontal.shape[1] / scale

	ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
	hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

	vertical = cv2.erode(vertical, ver, (-1, -1))
	vertical = cv2.dilate(vertical, ver, (-1, -1))

	horizontal = cv2.erode(horizontal, hor, (-1, -1))
	horizontal = cv2.dilate(horizontal, hor, (-1, -1))

	mask = vertical + horizontal
	joints = np.bitwise_and(vertical, horizontal)
	_, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
	contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

	tables = {}
	for c in contours:
		x, y, w, h = cv2.boundingRect(c)
		jmask = joints[y:y+h, x:x+w]
		_, jc, _ = cv2.findContours(jmask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)

		if len(jc) <= 4: # remove contours with less than <=4 joints
			continue
		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
		tables[(x1, y2)] = (x2, y1)

	v_segments, h_segments = [], []
	_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
	for vc in vcontours:
		x, y, w, h = cv2.boundingRect(vc)
		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
		v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))

	_, hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
	for hc in hcontours:
		x, y, w, h = cv2.boundingRect(hc)
		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
		h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))

	return tables, v_segments, h_segments