Improve grid detection and add more options

2016-06-23 18:30:05 +05:30 · 2016-06-23 18:30:05 +05:30 · f6869a9af4
parent 47da8606a6
commit f6869a9af4
6 changed files with 111 additions and 74 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 __pycache__/
 *.py[cod]
 .camelot/
--- a/README.md
+++ b/README.md
@ -12,14 +12,17 @@ optional arguments:
  -h, --help            show this help message and exit
  -p PAGES [PAGES ...]  Specify the page numbers and/or page ranges to be
-                        parsed. Example: -p="1 3-5 9". (default: -p="1")
+                        parsed. Example: -p="1 3-5 9", -p="all" (default:
                        -p="1")
  -f FORMAT             Output format (csv/xlsx). Example: -f="xlsx" (default:
                        -f="csv")
  -spreadsheet          Extract data stored in pdfs with ruling lines.
                        (default: False)
-  -guess                [Experimental] Guess the values in empty cells.
+  -F ORIENTATION        Fill the values in empty cells. Example: -F="h",
                        -F="v", -F="hv" (default: None)
  -s [SCALE]            Scaling factor. Large scaling factor leads to smaller
                        lines being detected. (default: 15)
--- a/camelot.py
+++ b/camelot.py
@ -1,7 +1,9 @@
 import os
 import re
 import glob
 import time
 import shutil
 import logging
 import subprocess
 import argparse
@ -16,62 +18,64 @@ def mkdir(directory):
 def filesort(filename):
 	filename = filename.split('/')[-1]
-	return int(pno.findall(filename)[0])
+	num = pno.findall(filename)
 	if len(num) == 2:
 		return (int(num[0]), int(num[1]))
 	else:
 		return (int(num[0]), 0)
 start_time = time.time()
 CAMELOT_DIR = '.camelot/'
 mkdir(CAMELOT_DIR)
 parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file')
-parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9". (default: -p="1")')
+parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")')
-parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")')
+parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"])
-parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines.')
+parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)')
-parser.add_argument('-guess', action='store_true', dest='guess', help='[Experimental] Guess the values in empty cells.')
+parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
 parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
 parser.add_argument('file', nargs=1)
 result = parser.parse_args()
 if result.pages:
-	p = []
+	if result.pages == ['all']:
-	for r in result.pages[0].split(' '):
+		p = result.pages
-		if '-' in r:
+	else:
-			a, b = r.split('-')
+		p = []
-			a, b = int(a), int(b)
+		for r in result.pages[0].split(' '):
-			p.extend([str(i) for i in range(a, b + 1)])
+			if '-' in r:
-		else:
+				a, b = r.split('-')
-			p.extend([str(r)])
+				a, b = int(a), int(b)
 				p.extend([str(i) for i in range(a, b + 1)])
 			else:
 				p.extend([str(r)])
 else:
 	p = ['1']
 p = sorted(set(p))
-if result.format:
+s = result.spreadsheet
 	f = result.format
 else:
 	f = ['csv']
 if result.spreadsheet:
 	s = True
 else:
 	s = False
 pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
 mkdir(pdf_dir)
 filename = result.file[0].split('/')[-1]
 logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG)
 shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
 print "separating pdf into pages"
 print
-for page in p:
+if p == ['all']:
-	subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
+	subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
 else:
 	for page in p:
 		subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
 if s:
 	print "using the spreadsheet method"
 	for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
 		print "converting", g.split('/')[-1], "to image"
 		os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png']))
-		try:
+		spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale)
 			spreadsheet(pdf_dir, g.split('/')[-1], result.guess, result.scale)
 		except:
 			pass
 else:
 	print "using the basic method"
 	for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
@ -91,4 +95,7 @@ if result.format == ['xlsx']:
 	xlsxpath = os.path.join(pdf_dir, xlsxname)
 	save_data(xlsxpath, data)
 	print
-	print "saved as", xlsxname
+	print "saved as", xlsxname
 print "finished in", time.time() - start_time, "seconds"
 logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds")
--- a/morph_transform.py
+++ b/morph_transform.py
@ -1,20 +1,6 @@
 import cv2
 import sys
 import subprocess
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 import numpy as np
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfpage import PDFTextExtractionNotAllowed
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar
 def transform(x, y, img_x, img_y, pdf_x, pdf_y):
 	x *= pdf_x / float(img_x)
 	y = abs(y - img_y)
@ -27,9 +13,10 @@ def morph(imagename, p_x, p_y, s):
 	img_x, img_y = img.shape[1], img.shape[0]
 	pdf_x, pdf_y = p_x, p_y
 	gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-	th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2)
+	# empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-	vertical = th1
+	threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-	horizontal = th1
+	vertical = threshold
 	horizontal = threshold
 	scale = s
 	verticalsize = vertical.shape[0] / scale
@ -51,15 +38,22 @@ def morph(imagename, p_x, p_y, s):
 	tables = {}
 	for c in contours:
-		x, y, w, h = cv2.boundingRect(c)
+		c_poly = cv2.approxPolyDP(c, 3, True)
-		jmask = joints[y:y+h, x:x+w]
+		x, y, w, h = cv2.boundingRect(c_poly)
-		_, jc, _ = cv2.findContours(jmask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+		# find number of non-zero values in joints using what boundingRect returns
-		
+		roi = joints[y:y+h, x:x+w]
 		_, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
 		if len(jc) <= 4: # remove contours with less than <=4 joints
 			continue
 		joint_coords = []
 		for j in jc:
 			jx, jy, jw, jh = cv2.boundingRect(j)
 			c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2
 			c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y)
 			joint_coords.append((c1, c2))
 		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
 		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
-		tables[(x1, y2)] = (x2, y1)
+		tables[(x1, y2, x2, y1)] = joint_coords
 	v_segments, h_segments = [], []
 	_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
--- a/spreadsheet.py
+++ b/spreadsheet.py
@ -15,12 +15,26 @@ def remove_close_values(ar):
 			ret.append(a)
 		else:
 			temp = ret[-1]
-			if np.isclose(temp, a, atol=1):
+			if np.isclose(temp, a, atol=2):
 				pass
 			else:
 				ret.append(a)
 	return ret
 def merge_close_values(ar):
 	"""Collapse runs of nearby values in `ar` into single entries.
 	Values are visited in the given order; a value that np.isclose
 	reports as within atol=2 of the last kept entry replaces that entry
 	with the average of the two, otherwise it is appended as a new entry.
 	Returns the resulting list.
 	"""
 	merged = []
 	for value in ar:
 		if merged and np.isclose(merged[-1], value, atol=2):
 			# fold into the previous entry (pairwise average of the running
 			# entry and the new value, not a mean over the whole run)
 			merged[-1] = (merged[-1] + value) / 2.0
 			continue
 		merged.append(value)
 	return merged
 def get_row_idx(t, rows):
 	for r in range(len(rows)):
 		if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]:
@ -40,34 +54,46 @@ def reduce_index(t, r_idx, c_idx):
 			r_idx -= 1
 	return r_idx, c_idx
-def fill(t):
+def fill(t, orientation):
-	for i in range(len(t.cells)):
+	if orientation == "h":
-		for j in range(len(t.cells[i])):
+		for i in range(len(t.cells)):
-			if t.cells[i][j].get_text().strip() == '':
+			for j in range(len(t.cells[i])):
-				if t.cells[i][j].spanning_h:
+				if t.cells[i][j].get_text().strip() == '':
-					t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
+					if t.cells[i][j].spanning_h:
-				elif t.cells[i][j].spanning_v:
+						t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
-					t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
+	elif orientation == "v":
 		for i in range(len(t.cells)):
 			for j in range(len(t.cells[i])):
 				if t.cells[i][j].get_text().strip() == '':
 					if t.cells[i][j].spanning_v:
 						t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
 	elif orientation == "hv":
 		for i in range(len(t.cells)):
 			for j in range(len(t.cells[i])):
 				if t.cells[i][j].get_text().strip() == '':
 					if t.cells[i][j].spanning_h:
 						t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
 					elif t.cells[i][j].spanning_v:
 						t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
 	return t
-def spreadsheet(pdf_dir, filename, guess, scale):
+def spreadsheet(pdf_dir, filename, orientation, scale):
 	print "working on", filename
 	imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
 	text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
 	tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)
 	num_tables = 0
-	for k in sorted(tables.keys(), reverse=True):
+	for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord
 		# find rows and columns that lie in table
-		lb = k
+		lb = (k[0], k[1])
-		rt = tables[k]
+		rt = (k[2], k[3])
 		v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
 		h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
-		columns = [v[0] for v in v_s]
+		columns, rows = zip(*tables[k])
 		rows = [h[1] for h in h_s]
 		# sort horizontal and vertical segments
-		columns = remove_close_values(sorted(columns))
+		columns = merge_close_values(sorted(list(columns)))
-		rows = remove_close_values(sorted(rows, reverse=True))
+		rows = merge_close_values(sorted(list(rows), reverse=True))
 		# make grid using x and y coord of shortlisted rows and columns
 		columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)]
 		rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
@ -89,8 +115,8 @@ def spreadsheet(pdf_dir, filename, guess, scale):
 				r_idx, c_idx = reduce_index(table, r_idx, c_idx)
 				table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
-		if guess:
+		if orientation:
-			table = fill(table)
+			table = fill(table, orientation)
 		csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
 		csvpath = os.path.join(pdf_dir, csvname)
--- a/table.py
+++ b/table.py
@ -11,9 +11,11 @@ class Table:
 		for v in vertical:
 			# find closest x coord
 			# iterate over y coords and find closest points
-			i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0])]
+			i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=2)]
 			j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)]
 			k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)]
 			if not j:
 				continue
 			if i == [0]: # only left edge
 				if k:
 					I = i[0]
@ -65,9 +67,11 @@ class Table:
 		for h in horizontal:
 			#  find closest y coord
 			# iterate over x coords and find closest points
-			i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0])]
+			i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=2)]
 			j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)]
 			k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)]
 			if not j:
 				continue
 			if i == [0]: # only top edge
 				if k:
 					I = i[0]