Improve grid detection and add more options

pull/2/head
Vinayak Mehta 2016-06-23 18:30:05 +05:30
parent 47da8606a6
commit f6869a9af4
6 changed files with 111 additions and 74 deletions

.gitignore vendored 100644 (+3)
View File

@@ -0,0 +1,3 @@
__pycache__/
*.py[cod]
.camelot/

View File

@@ -12,14 +12,17 @@ optional arguments:
-h, --help show this help message and exit
-p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be
parsed. Example: -p="1 3-5 9". (default: -p="1")
parsed. Example: -p="1 3-5 9", -p="all" (default:
-p="1")
-f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default:
-f="csv")
-spreadsheet Extract data stored in pdfs with ruling lines.
(default: False)
-guess [Experimental] Guess the values in empty cells.
-F ORIENTATION Fill the values in empty cells. Example: -F="h",
-F="v", -F="hv" (default: None)
-s [SCALE] Scaling factor. Large scaling factor leads to smaller
lines being detected. (default: 15)
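
Taken together, the new options can be combined in one run; a hypothetical invocation (report.pdf is a placeholder filename) would look like:

    python2 camelot.py -p="all" -spreadsheet -F="h" -s 30 report.pdf

i.e. split out every page, use the ruling-line (spreadsheet) method, fill empty cells from the cell to their left, and raise the scaling factor from 15 to 30 so that shorter ruling lines are picked up.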

View File

@@ -1,7 +1,9 @@
import os
import re
import glob
import time
import shutil
import logging
import subprocess
import argparse
@@ -16,22 +18,30 @@ def mkdir(directory):
def filesort(filename):
filename = filename.split('/')[-1]
return int(pno.findall(filename)[0])
num = pno.findall(filename)
if len(num) == 2:
return (int(num[0]), int(num[1]))
else:
return (int(num[0]), 0)
start_time = time.time()
CAMELOT_DIR = '.camelot/'
mkdir(CAMELOT_DIR)
parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file')
parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9". (default: -p="1")')
parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")')
parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines.')
parser.add_argument('-guess', action='store_true', dest='guess', help='[Experimental] Guess the values in empty cells.')
parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")')
parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"])
parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)')
parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
parser.add_argument('file', nargs=1)
result = parser.parse_args()
if result.pages:
if result.pages == ['all']:
p = result.pages
else:
p = []
for r in result.pages[0].split(' '):
if '-' in r:
@@ -44,22 +54,19 @@ else:
p = ['1']
p = sorted(set(p))
if result.format:
f = result.format
else:
f = ['csv']
if result.spreadsheet:
s = True
else:
s = False
s = result.spreadsheet
pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
mkdir(pdf_dir)
filename = result.file[0].split('/')[-1]
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG)
shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
print "separating pdf into pages"
print
if p == ['all']:
subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
else:
for page in p:
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
@@ -68,10 +75,7 @@ if s:
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
print "converting", g.split('/')[-1], "to image"
os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png']))
try:
spreadsheet(pdf_dir, g.split('/')[-1], result.guess, result.scale)
except:
pass
spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale)
else:
print "using the basic method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
@@ -92,3 +96,6 @@ if result.format == ['xlsx']:
save_data(xlsxpath, data)
print
print "saved as", xlsxname
print "finished in", time.time() - start_time, "seconds"
logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds")
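
The reworked filesort key returns a tuple of the numbers found in a filename, so pages and per-page tables sort numerically rather than lexically. A minimal, self-contained sketch of the idea; the compiled pattern pno is defined outside this hunk, so the regex below is an assumption:

    import re

    pno = re.compile(r'\d+')  # assumed; pno is declared elsewhere in camelot.py

    def filesort(filename):
        num = pno.findall(filename.split('/')[-1])
        # 'pg-2.pdf' yields one number, 'pg-2_table_1.csv' yields two
        return (int(num[0]), int(num[1])) if len(num) == 2 else (int(num[0]), 0)

    print(sorted(['pg-10.pdf', 'pg-2.pdf', 'pg-1.pdf'], key=filesort))
    # ['pg-1.pdf', 'pg-2.pdf', 'pg-10.pdf']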

View File

@@ -1,20 +1,6 @@
import cv2
import sys
import subprocess
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar
def transform(x, y, img_x, img_y, pdf_x, pdf_y):
x *= pdf_x / float(img_x)
y = abs(y - img_y)
@@ -27,9 +13,10 @@ def morph(imagename, p_x, p_y, s):
img_x, img_y = img.shape[1], img.shape[0]
pdf_x, pdf_y = p_x, p_y
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2)
vertical = th1
horizontal = th1
# empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
vertical = threshold
horizontal = threshold
scale = s
verticalsize = vertical.shape[0] / scale
@@ -51,15 +38,22 @@ def morph(imagename, p_x, p_y, s):
tables = {}
for c in contours:
x, y, w, h = cv2.boundingRect(c)
jmask = joints[y:y+h, x:x+w]
_, jc, _ = cv2.findContours(jmask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
c_poly = cv2.approxPolyDP(c, 3, True)
x, y, w, h = cv2.boundingRect(c_poly)
# crop the joints image to the bounding rect and count the joint contours inside it
roi = joints[y:y+h, x:x+w]
_, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4: # skip table candidates with four or fewer joints
continue
joint_coords = []
for j in jc:
jx, jy, jw, jh = cv2.boundingRect(j)
c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2
c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y)
joint_coords.append((c1, c2))
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
tables[(x1, y2)] = (x2, y1)
tables[(x1, y2, x2, y1)] = joint_coords
v_segments, h_segments = [], []
_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
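
Only fragments of the image-processing pipeline appear in the hunks above. For context, here is a minimal sketch of the technique being tuned: adaptive thresholding, then erosion and dilation with long, thin kernels so that only ruling lines survive, with their intersections kept as joints. The input filename and the kernel construction are assumptions, since those lines fall outside the hunks:

    import cv2
    import numpy as np

    img = cv2.imread('page.png')  # assumed input: a 300-dpi render of one pdf page
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    threshold = cv2.adaptiveThreshold(np.invert(gray), 255,
                                      cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                      cv2.THRESH_BINARY, 15, -0.2)

    scale = 15  # larger scale -> shorter kernels -> smaller lines detected
    vertical, horizontal = threshold.copy(), threshold.copy()

    # keep long vertical runs: erode then dilate with a 1 x N kernel
    verticalsize = vertical.shape[0] // scale
    vkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    vertical = cv2.dilate(cv2.erode(vertical, vkernel), vkernel)

    # keep long horizontal runs with an N x 1 kernel
    horizontalsize = horizontal.shape[1] // scale
    hkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
    horizontal = cv2.dilate(cv2.erode(horizontal, hkernel), hkernel)

    # joints: pixels where a vertical and a horizontal line cross
    joints = cv2.bitwise_and(vertical, horizontal)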

View File

@@ -15,12 +15,26 @@ def remove_close_values(ar):
ret.append(a)
else:
temp = ret[-1]
if np.isclose(temp, a, atol=1):
if np.isclose(temp, a, atol=2):
pass
else:
ret.append(a)
return ret
def merge_close_values(ar):
ret = []
for a in ar:
if not ret:
ret.append(a)
else:
temp = ret[-1]
if np.isclose(temp, a, atol=2):
temp = (temp + a) / 2.0
ret[-1] = temp
else:
ret.append(a)
return ret
def get_row_idx(t, rows):
for r in range(len(rows)):
if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]:
@@ -40,7 +54,20 @@ def reduce_index(t, r_idx, c_idx):
r_idx -= 1
return r_idx, c_idx
def fill(t):
def fill(t, orientation):
if orientation == "h":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_h:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif orientation == "v":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
elif orientation == "hv":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
@@ -50,24 +77,23 @@ def fill(t):
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
return t
def spreadsheet(pdf_dir, filename, guess, scale):
def spreadsheet(pdf_dir, filename, orientation, scale):
print "working on", filename
imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)
num_tables = 0
for k in sorted(tables.keys(), reverse=True):
for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord
# find rows and columns that lie in table
lb = k
rt = tables[k]
lb = (k[0], k[1])
rt = (k[2], k[3])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
columns = [v[0] for v in v_s]
rows = [h[1] for h in h_s]
columns, rows = zip(*tables[k])
# sort and merge close row and column coordinates
columns = remove_close_values(sorted(columns))
rows = remove_close_values(sorted(rows, reverse=True))
columns = merge_close_values(sorted(list(columns)))
rows = merge_close_values(sorted(list(rows), reverse=True))
# make grid using x and y coord of shortlisted rows and columns
columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
@@ -89,8 +115,8 @@ def spreadsheet(pdf_dir, filename, guess, scale):
r_idx, c_idx = reduce_index(table, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
if guess:
table = fill(table)
if orientation:
table = fill(table, orientation)
csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
csvpath = os.path.join(pdf_dir, csvname)
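
To make the new table dictionary concrete: each detected table now maps its pdf-space bounding box (x1, y2, x2, y1) to the joint coordinates found inside it, and rows and columns are derived from those joints instead of from raw line segments. A small illustration with made-up joint coordinates, reusing the merge_close_values helper defined above (and numpy imported as np, as in this file):

    joints = [(72.0, 700.1), (200.3, 700.0), (330.0, 699.8),
              (72.1, 650.2), (200.0, 650.0), (330.2, 650.1)]
    xs, ys = zip(*joints)
    cols = merge_close_values(sorted(xs))                # ~[72, 200, 330]
    rows = merge_close_values(sorted(ys, reverse=True))  # ~[700, 650]
    # pair consecutive values into cell boundaries: one row, two columns
    cols = [(cols[i], cols[i + 1]) for i in range(len(cols) - 1)]
    rows = [(rows[i], rows[i + 1]) for i in range(len(rows) - 1)]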

View File

@@ -11,9 +11,11 @@ class Table:
for v in vertical:
# find closest x coord
# iterate over y coords and find closest points
i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0])]
i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=2)]
j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)]
k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)]
if not j:
continue
if i == [0]: # only left edge
if k:
I = i[0]
@@ -65,9 +67,11 @@ class Table:
for h in horizontal:
# find closest y coord
# iterate over x coords and find closest points
i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0])]
i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=2)]
j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)]
k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)]
if not j:
continue
if i == [0]: # only top edge
if k:
I = i[0]
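
With atol=2 and the new guards that skip segments matching no row or column, a segment endpoint is snapped to a grid boundary only when it lies within two units of it, and unmatched segments no longer break the edge assignment. A small sketch with assumed coordinates:

    import numpy as np

    columns = [(72.05, 200.15), (200.15, 330.1)]  # (left, right) x-bounds of each column
    v = (201.0, 650.0, 201.2, 700.0)              # x0, y0, x1, y1 of one vertical segment
    i = [k for k, t in enumerate(columns) if np.isclose(v[0], t[0], atol=2)]
    print(i)  # [1]: the segment sits on the second column's left edge (within 2 units)
    # an endpoint farther than 2 units from every boundary yields [] and is skipped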