Improve grid detection and add more options

pull/2/head
Vinayak Mehta 2016-06-23 18:30:05 +05:30
parent 47da8606a6
commit f6869a9af4
6 changed files with 111 additions and 74 deletions

3
.gitignore vendored 100644
View File

@ -0,0 +1,3 @@
__pycache__/
*.py[cod]
.camelot/

View File

@ -12,14 +12,17 @@ optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
-p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be -p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be
parsed. Example: -p="1 3-5 9". (default: -p="1") parsed. Example: -p="1 3-5 9", -p="all" (default:
-p="1")
-f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default: -f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default:
-f="csv") -f="csv")
-spreadsheet Extract data stored in pdfs with ruling lines. -spreadsheet Extract data stored in pdfs with ruling lines.
(default: False)
-guess [Experimental] Guess the values in empty cells. -F ORIENTATION Fill the values in empty cells. Example: -F="h",
-F="v", -F="hv" (default: None)
-s [SCALE] Scaling factor. Large scaling factor leads to smaller -s [SCALE] Scaling factor. Large scaling factor leads to smaller
lines being detected. (default: 15) lines being detected. (default: 15)

View File

@ -1,7 +1,9 @@
import os import os
import re import re
import glob import glob
import time
import shutil import shutil
import logging
import subprocess import subprocess
import argparse import argparse
@ -16,62 +18,64 @@ def mkdir(directory):
def filesort(filename): def filesort(filename):
filename = filename.split('/')[-1] filename = filename.split('/')[-1]
return int(pno.findall(filename)[0]) num = pno.findall(filename)
if len(num) == 2:
return (int(num[0]), int(num[1]))
else:
return (int(num[0]), 0)
start_time = time.time()
CAMELOT_DIR = '.camelot/' CAMELOT_DIR = '.camelot/'
mkdir(CAMELOT_DIR) mkdir(CAMELOT_DIR)
parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file') parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file')
parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9". (default: -p="1")') parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")')
parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")') parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"])
parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines.') parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)')
parser.add_argument('-guess', action='store_true', dest='guess', help='[Experimental] Guess the values in empty cells.') parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
parser.add_argument('file', nargs=1) parser.add_argument('file', nargs=1)
result = parser.parse_args() result = parser.parse_args()
if result.pages: if result.pages:
p = [] if result.pages == ['all']:
for r in result.pages[0].split(' '): p = result.pages
if '-' in r: else:
a, b = r.split('-') p = []
a, b = int(a), int(b) for r in result.pages[0].split(' '):
p.extend([str(i) for i in range(a, b + 1)]) if '-' in r:
else: a, b = r.split('-')
p.extend([str(r)]) a, b = int(a), int(b)
p.extend([str(i) for i in range(a, b + 1)])
else:
p.extend([str(r)])
else: else:
p = ['1'] p = ['1']
p = sorted(set(p)) p = sorted(set(p))
if result.format: s = result.spreadsheet
f = result.format
else:
f = ['csv']
if result.spreadsheet:
s = True
else:
s = False
pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex')) pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
mkdir(pdf_dir) mkdir(pdf_dir)
filename = result.file[0].split('/')[-1] filename = result.file[0].split('/')[-1]
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG)
shutil.copy(result.file[0], os.path.join(pdf_dir, filename)) shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
print "separating pdf into pages" print "separating pdf into pages"
print print
for page in p: if p == ['all']:
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
else:
for page in p:
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
if s: if s:
print "using the spreadsheet method" print "using the spreadsheet method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
print "converting", g.split('/')[-1], "to image" print "converting", g.split('/')[-1], "to image"
os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png'])) os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png']))
try: spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale)
spreadsheet(pdf_dir, g.split('/')[-1], result.guess, result.scale)
except:
pass
else: else:
print "using the basic method" print "using the basic method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
@ -91,4 +95,7 @@ if result.format == ['xlsx']:
xlsxpath = os.path.join(pdf_dir, xlsxname) xlsxpath = os.path.join(pdf_dir, xlsxname)
save_data(xlsxpath, data) save_data(xlsxpath, data)
print print
print "saved as", xlsxname print "saved as", xlsxname
print "finished in", time.time() - start_time, "seconds"
logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds")

View File

@ -1,20 +1,6 @@
import cv2 import cv2
import sys
import subprocess
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar
def transform(x, y, img_x, img_y, pdf_x, pdf_y): def transform(x, y, img_x, img_y, pdf_x, pdf_y):
x *= pdf_x / float(img_x) x *= pdf_x / float(img_x)
y = abs(y - img_y) y = abs(y - img_y)
@ -27,9 +13,10 @@ def morph(imagename, p_x, p_y, s):
img_x, img_y = img.shape[1], img.shape[0] img_x, img_y = img.shape[1], img.shape[0]
pdf_x, pdf_y = p_x, p_y pdf_x, pdf_y = p_x, p_y
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) # empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
vertical = th1 threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
horizontal = th1 vertical = threshold
horizontal = threshold
scale = s scale = s
verticalsize = vertical.shape[0] / scale verticalsize = vertical.shape[0] / scale
@ -51,15 +38,22 @@ def morph(imagename, p_x, p_y, s):
tables = {} tables = {}
for c in contours: for c in contours:
x, y, w, h = cv2.boundingRect(c) c_poly = cv2.approxPolyDP(c, 3, True)
jmask = joints[y:y+h, x:x+w] x, y, w, h = cv2.boundingRect(c_poly)
_, jc, _ = cv2.findContours(jmask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) # find number of non-zero values in joints using what boundingRect returns
roi = joints[y:y+h, x:x+w]
_, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4: # remove contours with less than <=4 joints if len(jc) <= 4: # remove contours with less than <=4 joints
continue continue
joint_coords = []
for j in jc:
jx, jy, jw, jh = cv2.boundingRect(j)
c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2
c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y)
joint_coords.append((c1, c2))
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
tables[(x1, y2)] = (x2, y1) tables[(x1, y2, x2, y1)] = joint_coords
v_segments, h_segments = [], [] v_segments, h_segments = [], []
_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) _, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

View File

@ -15,12 +15,26 @@ def remove_close_values(ar):
ret.append(a) ret.append(a)
else: else:
temp = ret[-1] temp = ret[-1]
if np.isclose(temp, a, atol=1): if np.isclose(temp, a, atol=2):
pass pass
else: else:
ret.append(a) ret.append(a)
return ret return ret
def merge_close_values(ar, atol=2):
    """Merge neighbouring values that lie close together into one value.

    Walks ``ar`` in the order given and keeps a list of merged values.
    Each incoming value is compared with the most recent merged value;
    if the two are close, that merged value is replaced by the mean of
    the two, otherwise the incoming value starts a new entry.

    Args:
        ar: Iterable of numbers. Callers in this file pass it already
            sorted (ascending or descending); no sorting is done here.
        atol: Absolute tolerance handed to ``np.isclose`` when deciding
            whether two values belong together. Defaults to 2, the value
            that used to be hard-coded.

    Returns:
        A new list of merged values. Values that were never merged are
        returned unchanged; merged values are floats.
    """
    ret = []
    for a in ar:
        # Compare against the running merged value, not the raw previous
        # input, so a chain of close values keeps folding into one entry.
        # np.isclose also applies its default relative tolerance on top
        # of atol.
        if ret and np.isclose(ret[-1], a, atol=atol):
            ret[-1] = (ret[-1] + a) / 2.0
        else:
            ret.append(a)
    return ret
def get_row_idx(t, rows): def get_row_idx(t, rows):
for r in range(len(rows)): for r in range(len(rows)):
if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]: if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]:
@ -40,34 +54,46 @@ def reduce_index(t, r_idx, c_idx):
r_idx -= 1 r_idx -= 1
return r_idx, c_idx return r_idx, c_idx
def fill(t): def fill(t, orientation):
for i in range(len(t.cells)): if orientation == "h":
for j in range(len(t.cells[i])): for i in range(len(t.cells)):
if t.cells[i][j].get_text().strip() == '': for j in range(len(t.cells[i])):
if t.cells[i][j].spanning_h: if t.cells[i][j].get_text().strip() == '':
t.cells[i][j].add_text(t.cells[i][j - 1].get_text()) if t.cells[i][j].spanning_h:
elif t.cells[i][j].spanning_v: t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
t.cells[i][j].add_text(t.cells[i - 1][j].get_text()) elif orientation == "v":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
elif orientation == "hv":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_h:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
return t return t
def spreadsheet(pdf_dir, filename, guess, scale): def spreadsheet(pdf_dir, filename, orientation, scale):
print "working on", filename print "working on", filename
imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet') text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale) tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)
num_tables = 0 num_tables = 0
for k in sorted(tables.keys(), reverse=True): for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord
# find rows and columns that lie in table # find rows and columns that lie in table
lb = k lb = (k[0], k[1])
rt = tables[k] rt = (k[2], k[3])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
columns = [v[0] for v in v_s] columns, rows = zip(*tables[k])
rows = [h[1] for h in h_s]
# sort horizontal and vertical segments # sort horizontal and vertical segments
columns = remove_close_values(sorted(columns)) columns = merge_close_values(sorted(list(columns)))
rows = remove_close_values(sorted(rows, reverse=True)) rows = merge_close_values(sorted(list(rows), reverse=True))
# make grid using x and y coord of shortlisted rows and columns # make grid using x and y coord of shortlisted rows and columns
columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)] columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
@ -89,8 +115,8 @@ def spreadsheet(pdf_dir, filename, guess, scale):
r_idx, c_idx = reduce_index(table, r_idx, c_idx) r_idx, c_idx = reduce_index(table, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
if guess: if orientation:
table = fill(table) table = fill(table, orientation)
csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv' csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
csvpath = os.path.join(pdf_dir, csvname) csvpath = os.path.join(pdf_dir, csvname)

View File

@ -11,9 +11,11 @@ class Table:
for v in vertical: for v in vertical:
# find closest x coord # find closest x coord
# iterate over y coords and find closest points # iterate over y coords and find closest points
i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0])] i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=2)]
j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)] j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)]
k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)] k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)]
if not j:
continue
if i == [0]: # only left edge if i == [0]: # only left edge
if k: if k:
I = i[0] I = i[0]
@ -65,9 +67,11 @@ class Table:
for h in horizontal: for h in horizontal:
# find closest y coord # find closest y coord
# iterate over x coords and find closest points # iterate over x coords and find closest points
i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0])] i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=2)]
j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)] j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)]
k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)] k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)]
if not j:
continue
if i == [0]: # only top edge if i == [0]: # only top edge
if k: if k:
I = i[0] I = i[0]