79 lines
2.8 KiB
Python
79 lines
2.8 KiB
Python
import cv2
|
|
import sys
|
|
import subprocess
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.patches as patches
|
|
import numpy as np
|
|
|
|
from pdfminer.pdfparser import PDFParser
|
|
from pdfminer.pdfdocument import PDFDocument
|
|
from pdfminer.pdfpage import PDFPage
|
|
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
|
from pdfminer.pdfinterp import PDFResourceManager
|
|
from pdfminer.pdfinterp import PDFPageInterpreter
|
|
from pdfminer.pdfdevice import PDFDevice
|
|
from pdfminer.converter import PDFPageAggregator
|
|
from pdfminer.layout import LAParams, LTChar
|
|
|
|
def transform(x, y, img_x, img_y, pdf_x, pdf_y):
    """Map an image-space point onto the PDF page.

    The x coordinate is rescaled by ``pdf_x / img_x``. The y coordinate is
    first measured from the opposite edge of the image (``abs(y - img_y)``)
    and then rescaled by ``pdf_y / img_y``.

    Args:
        x: Horizontal position in the image.
        y: Vertical position in the image.
        img_x: Image extent along x.
        img_y: Image extent along y.
        pdf_x: PDF page extent along x.
        pdf_y: PDF page extent along y.

    Returns:
        Tuple ``(x, y)`` holding the rescaled coordinates.
    """
    # float() keeps the ratios fractional even when both operands are ints.
    x_ratio = pdf_x / float(img_x)
    y_ratio = pdf_y / float(img_y)
    flipped_y = abs(y - img_y)
    return x * x_ratio, flipped_y * y_ratio
|
|
|
|
# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
|
|
def _find_contours(image, mode):
    """Return the contour list of *image* for any OpenCV major version.

    ``cv2.findContours`` returns ``(image, contours, hierarchy)`` on
    OpenCV 3.x but ``(contours, hierarchy)`` on 2.x and 4.x; the contour
    list is the second-to-last element in both layouts.
    """
    return cv2.findContours(image, mode, cv2.CHAIN_APPROX_SIMPLE)[-2]


def _pdf_corners(contour, img_x, img_y, pdf_x, pdf_y):
    """Return the contour's pixel bounding box and its two PDF-space corners.

    The result is ``((x, y, w, h), (x1, y1, x2, y2))`` where ``(x1, y1)`` and
    ``(x2, y2)`` are the box's ``(x, y)`` and ``(x + w, y + h)`` corners passed
    through ``transform``.
    """
    x, y, w, h = cv2.boundingRect(contour)
    x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
    x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
    return (x, y, w, h), (x1, y1, x2, y2)


def morph(imagename, p_x, p_y, s):
    """Find table regions and ruling lines in a page image.

    The image is binarized, then opened (erode followed by dilate) with a
    tall 1-pixel-wide kernel and a wide 1-pixel-high kernel so that only long
    vertical and horizontal strokes survive. Regions where those strokes
    connect and cross often enough are reported as tables.

    Args:
        imagename: Path of the image, read with ``cv2.imread``.
        p_x: PDF page extent along x, used to rescale image x coordinates.
        p_y: PDF page extent along y, used to rescale image y coordinates.
        s: Divisor applied to the image height/width to obtain the length
            (in pixels) of the vertical/horizontal line kernels; a larger
            value keeps shorter strokes.

    Returns:
        A tuple ``(tables, v_segments, h_segments)``:

        * ``tables`` maps ``(x1, y2)`` to ``(x2, y1)`` for each of the (at
          most ten largest) line regions that contain more than four joint
          contours.
        * ``v_segments`` is a list of ``(x_mid, y2, x_mid, y1)`` tuples, one
          per vertical stroke.
        * ``h_segments`` is a list of ``(x1, y_mid, x2, y_mid)`` tuples, one
          per horizontal stroke.

        All coordinates have been passed through ``transform``.

    Note:
        ``cv2.imread`` returns ``None`` when the file cannot be read, in
        which case the ``img.shape`` access below raises ``AttributeError``.
    """
    img = cv2.imread(imagename)
    img_x, img_y = img.shape[1], img.shape[0]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Threshold the inverted image so dark strokes end up as 255 (foreground).
    threshold = cv2.adaptiveThreshold(
        np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY, 15, -2)

    # Kernel sizes must be integers; plain `/` yields a float on Python 3 and
    # makes getStructuringElement raise.
    vertical_size = int(img_y // s)
    horizontal_size = int(img_x // s)

    # ksize is (width, height): a 1-px-wide column and a 1-px-high row.
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))

    # Erode then dilate with the same kernel: strokes shorter than the kernel
    # vanish, longer ones are restored. The anchor is given by keyword because
    # the third positional parameter of erode/dilate is `dst`.
    vertical = cv2.erode(threshold, ver, anchor=(-1, -1))
    vertical = cv2.dilate(vertical, ver, anchor=(-1, -1))

    horizontal = cv2.erode(threshold, hor, anchor=(-1, -1))
    horizontal = cv2.dilate(horizontal, hor, anchor=(-1, -1))

    # Union of both stroke images. bitwise_or avoids the uint8 wrap-around
    # (255 + 255 -> 254) that plain addition produces where strokes cross.
    mask = np.bitwise_or(vertical, horizontal)
    # Pixels that belong to both a vertical and a horizontal stroke.
    joints = np.bitwise_and(vertical, horizontal)

    contours = _find_contours(mask, cv2.RETR_EXTERNAL)
    # Only the ten largest regions are considered as table candidates.
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        (x, y, w, h), (x1, y1, x2, y2) = _pdf_corners(c, img_x, img_y, p_x, p_y)
        joint_contours = _find_contours(joints[y:y + h, x:x + w], cv2.RETR_CCOMP)
        if len(joint_contours) <= 4:  # skip regions with four or fewer joints
            continue
        tables[(x1, y2)] = (x2, y1)

    v_segments = []
    for vc in _find_contours(vertical, cv2.RETR_EXTERNAL):
        _, (x1, y1, x2, y2) = _pdf_corners(vc, img_x, img_y, p_x, p_y)
        x_mid = (x1 + x2) / 2
        v_segments.append((x_mid, y2, x_mid, y1))

    h_segments = []
    for hc in _find_contours(horizontal, cv2.RETR_EXTERNAL):
        _, (x1, y1, x2, y2) = _pdf_corners(hc, img_x, img_y, p_x, p_y)
        y_mid = (y1 + y2) / 2
        h_segments.append((x1, y_mid, x2, y_mid))

    return tables, v_segments, h_segments