First commit 🔥

2016-06-18 17:20:42 +05:30
commit eef07a86c6
14 changed files with 585 additions and 0 deletions
@@ -0,0 +1,27 @@
 Camelot
 -------
 usage: python2 camelot.py [options] pdf_file
 Parse yo pdf!
 positional arguments:
  file
 optional arguments:
  -h, --help            show this help message and exit
  -p PAGES [PAGES ...]  Specify the page numbers and/or page ranges to be
                        parsed. Example: -p="1 3-5 9". (default: -p="1")
  -f FORMAT             Output format (csv/xlsx). Example: -f="xlsx" (default:
                        -f="csv")
  -spreadsheet          Extract data stored in pdfs with ruling lines.
  -guess                [Experimental] Guess the values in empty cells.
  -s [SCALE]            Scaling factor. Large scaling factor leads to smaller
                        lines being detected. (default: 15)
 Under construction...
@@ -0,0 +1,58 @@
 import os
 import csv
 import numpy as np
 from pdf import get_pdf_info
 def overlap(l):
 	"""Collapse spans that lie completely inside the previously kept span.

 	l -- sequence of (start, end) pairs; the caller passes them sorted.

 	Returns a new list. A span is absorbed only when it is fully contained
 	in the last kept span; partially overlapping spans are kept separately.
 	"""
 	merged = []
 	for span in l:
 		if merged:
 			previous = merged[-1]
 			contained = span[0] >= previous[0] and span[1] <= previous[1]
 			if contained:
 				start = min(previous[0], span[0])
 				end = max(previous[1], span[1])
 				merged[-1] = (start, end)
 				continue
 		merged.append(span)
 	return merged
 def get_row_idx(t, rows):
 	"""Return the index of the first (top, bottom) row that encloses t vertically.

 	t    -- object with y0 and y1 attributes.
 	rows -- sequence of (top, bottom) pairs.

 	Returns None when no row encloses t.
 	"""
 	matches = (idx for idx, row in enumerate(rows)
 		if t.y1 <= row[0] and t.y0 >= row[1])
 	return next(matches, None)
 def get_column_idx(t, columns):
 	"""Return the index of the first (left, right) column that encloses t horizontally.

 	t       -- object with x0 and x1 attributes.
 	columns -- sequence of (left, right) pairs.

 	Returns None when no column encloses t.
 	"""
 	matches = (idx for idx, column in enumerate(columns)
 		if t.x0 >= column[0] and t.x1 <= column[1])
 	return next(matches, None)
 def basic(pdf_dir, filename):
 	print "working on", filename
 	text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
 	rows, columns = [], []
 	for t in text:
 		rows.append((t.y1, t.y0))
 		columns.append((t.x0, t.x1))
 	rows = list(set(rows))
 	rows = sorted(rows, reverse=True)
 	columns = list(set(columns))
 	columns = sorted(columns)
 	columns = overlap(columns)
 	table = [['' for c in columns] for r in rows]
 	for t in text:
 		r_idx = get_row_idx(t, rows)
 		c_idx = get_column_idx(t, columns)
 		if None in [r_idx, c_idx]:
 			print t
 		else:
 			table[r_idx][c_idx] = t.get_text().strip('\n')
 	csvname = filename.split('.')[0] + '.csv'
 	csvpath = os.path.join(pdf_dir, csvname)
 	with open(csvpath, 'w') as outfile:
 		writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
 	 	for cell in table:
 			writer.writerow([ce for ce in cell])
@@ -0,0 +1,94 @@
 import os
 import re
 import glob
 import shutil
 import subprocess
 import argparse
 from basic import basic
 from spreadsheet import spreadsheet
 # Matches a run of digits; filesort() uses the first match in a file name as its sort key.
 pno = re.compile(r'\d+')
 def mkdir(directory):
    """Create directory, including missing parents, unless it already exists."""
    if os.path.isdir(directory):
        return
    os.makedirs(directory)
 def filesort(filename):
 	"""Sort key: the first run of digits in the last '/'-separated part of filename.

 	Returns that run as an int. Raises IndexError when the name has no digits.
 	"""
 	basename = filename.rsplit('/', 1)[-1]
 	first_number = pno.findall(basename)[0]
 	return int(first_number)
 CAMELOT_DIR = '.camelot/'
 mkdir(CAMELOT_DIR)
 parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file')
 parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9". (default: -p="1")')
 parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")')
 parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines.')
 parser.add_argument('-guess', action='store_true', dest='guess', help='[Experimental] Guess the values in empty cells.')
 parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
 parser.add_argument('file', nargs=1)
 result = parser.parse_args()
 if result.pages:
 	p = []
 	for r in result.pages[0].split(' '):
 		if '-' in r:
 			a, b = r.split('-')
 			a, b = int(a), int(b)
 			p.extend([str(i) for i in range(a, b + 1)])
 		else:
 			p.extend([str(r)])
 else:
 	p = ['1']
 p = sorted(set(p))
 if result.format:
 	f = result.format
 else:
 	f = ['csv']
 if result.spreadsheet:
 	s = True
 else:
 	s = False
 pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
 mkdir(pdf_dir)
 filename = result.file[0].split('/')[-1]
 shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
 print "separating pdf into pages"
 print
 for page in p:
 	subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
 if s:
 	print "using the spreadsheet method"
 	for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
 		print "converting", g.split('/')[-1], "to image"
 		os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png']))
 		try:
 			spreadsheet(pdf_dir, g.split('/')[-1], result.guess, result.scale)
 		except:
 			pass
 else:
 	print "using the basic method"
 	for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
 		basic(pdf_dir, g.split('/')[-1])
 if result.format == ['xlsx']:
 	import csv
 	from pyexcel_xlsx import save_data
 	from collections import OrderedDict
 	data = OrderedDict()
 	for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
 		print "adding", c.split('/')[-1], "to excel file"
 		with open(c, 'r') as csvfile:
 			reader = csv.reader(csvfile)
 			data.update({c.split('/')[-1].split('.')[0]: [row for row in reader]})
 	xlsxname = filename.split('.')[0] + '.xlsx'
 	xlsxpath = os.path.join(pdf_dir, xlsxname)
 	save_data(xlsxpath, data)
 	print
 	print "saved as", xlsxname
@@ -0,0 +1,23 @@
 class Cell:
 	"""One rectangular table cell: corner points, edge flags and collected text.

 	lb/lt/rb/rt hold the left-bottom, left-top, right-bottom and right-top
 	corners as (x, y) pairs and bbox holds (x1, y1, x2, y2). The left, right,
 	top and bottom flags and the spanning_h / spanning_v flags all start out
 	False; text starts out empty.
 	"""
 	def __init__(self, x1, y1, x2, y2):
 		self.bbox = (x1, y1, x2, y2)
 		self.lb, self.lt = (x1, y1), (x1, y2)
 		self.rb, self.rt = (x2, y1), (x2, y2)
 		self.left = self.right = self.top = self.bottom = False
 		self.spanning_h = self.spanning_v = False
 		self.text = ''
 	def add_text(self, text):
 		"""Append text to what the cell already holds."""
 		self.text = self.text + text
 	def get_text(self):
 		"""Return the text collected so far."""
 		return self.text
 	def get_bounded_edges(self):
 		"""Return how many of the four edge flags are set."""
 		return sum((self.top, self.bottom, self.left, self.right))
@@ -0,0 +1,79 @@
 import cv2
 import sys
 import subprocess
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 import numpy as np
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfpage import PDFTextExtractionNotAllowed
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar
 def transform(x, y, img_x, img_y, pdf_x, pdf_y):
 	"""Rescale an image point to the pdf_x by pdf_y extent, flipping the y axis.

 	x is multiplied by pdf_x / img_x; y is first replaced by its distance
 	from img_y and then multiplied by pdf_y / img_y.

 	Returns the converted (x, y) pair.
 	"""
 	x_ratio = pdf_x / float(img_x)
 	y_ratio = pdf_y / float(img_y)
 	flipped_y = abs(y - img_y)
 	return x * x_ratio, flipped_y * y_ratio
 # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
 def morph(imagename, p_x, p_y, s):
 	"""Find table regions and ruling-line segments in a page image.

 	imagename -- path handed to cv2.imread.
 	p_x, p_y  -- target width and height that image pixel coordinates are
 	             rescaled to through transform().
 	s         -- divisor applied to the image height and width to get the
 	             length of the vertical and horizontal structuring elements.

 	Returns (tables, v_segments, h_segments):
 	tables     -- dict mapping the transformed (x1, y2) of each kept
 	              contour's bounding box to its transformed (x2, y1).
 	v_segments -- list of (x, y2, x, y1) where x is the horizontal midpoint
 	              of a vertical contour's transformed bounding box.
 	h_segments -- list of (x1, y, x2, y) where y is the vertical midpoint
 	              of a horizontal contour's transformed bounding box.
 	"""
 	img = cv2.imread(imagename)
 	img_x, img_y = img.shape[1], img.shape[0]
 	pdf_x, pdf_y = p_x, p_y
 	gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
 	# threshold the inverted grayscale image
 	th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2)
 	vertical = th1
 	horizontal = th1
 	scale = s
 	# NOTE(review): these sizes are passed to getStructuringElement; this
 	# presumably relies on Python 2 integer division with an int s - confirm
 	verticalsize = vertical.shape[0] / scale
 	horizontalsize = horizontal.shape[1] / scale
 	ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
 	hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
 	# erode then dilate with the 1-wide / 1-high elements; presumably this
 	# keeps only long vertical / horizontal strokes - TODO confirm
 	vertical = cv2.erode(vertical, ver, (-1, -1))
 	vertical = cv2.dilate(vertical, ver, (-1, -1))
 	horizontal = cv2.erode(horizontal, hor, (-1, -1))
 	horizontal = cv2.dilate(horizontal, hor, (-1, -1))
 	mask = vertical + horizontal
 	# pixels set in both the vertical and the horizontal image
 	joints = np.bitwise_and(vertical, horizontal)
 	# NOTE(review): the three-value unpacking assumes a findContours that
 	# returns three items - confirm against the installed OpenCV version
 	_, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 	# keep only the ten contours with the largest area
 	contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
 	tables = {}
 	for c in contours:
 		x, y, w, h = cv2.boundingRect(c)
 		jmask = joints[y:y+h, x:x+w]
 		_, jc, _ = cv2.findContours(jmask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
 		if len(jc) <= 4: # skip regions with four or fewer joint contours
 			continue
 		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
 		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
 		tables[(x1, y2)] = (x2, y1)
 	v_segments, h_segments = [], []
 	_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 	for vc in vcontours:
 		x, y, w, h = cv2.boundingRect(vc)
 		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
 		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
 		# collapse the bounding box to a line at its horizontal midpoint
 		v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
 	_, hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 	for hc in hcontours:
 		x, y, w, h = cv2.boundingRect(hc)
 		x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y)
 		x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
 		# collapse the bounding box to a line at its vertical midpoint
 		h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
 	return tables, v_segments, h_segments
@@ -0,0 +1,54 @@
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfpage import PDFTextExtractionNotAllowed
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
 # Module-level accumulator that parse_text_basic / parse_text_spreadsheet append to;
 # get_pdf_info rebinds it to a fresh list for every page it processes.
 text = []
 def parse_text_basic(layout):
 	"""Append every LTTextLineHorizontal found under layout to the global text list.

 	Walks layout._objs recursively. An AttributeError raised during the walk
 	(for instance by an object that has no _objs) ends the walk of that object.
 	"""
 	global text
 	try:
 		children = layout._objs
 		for child in children:
 			if type(child) is LTTextLineHorizontal:
 				text.append(child)
 			parse_text_basic(child)
 	except AttributeError:
 		return
 def parse_text_spreadsheet(layout):
 	"""Append every LTChar found under layout to the global text list.

 	Walks layout._objs recursively. An AttributeError raised during the walk
 	(for instance by an object that has no _objs) ends the walk of that object.
 	"""
 	global text
 	try:
 		children = layout._objs
 		for child in children:
 			if type(child) is LTChar:
 				text.append(child)
 			parse_text_spreadsheet(child)
 	except AttributeError:
 		return
 def get_pdf_info(pdfname, method):
 	"""Collect the text objects and layout extent of a PDF.

 	pdfname -- path of the PDF file.
 	method  -- 'basic' collects LTTextLineHorizontal objects, 'spreadsheet'
 	           collects LTChar objects; any other value collects nothing.

 	Returns (text, pdf_x, pdf_y). text is sorted by descending y0 and then
 	ascending x0; pdf_x and pdf_y are layout.bbox[2] and layout.bbox[3].
 	The list is reset for every page, so for a multi-page document only the
 	last page's text and extent are returned.

 	Raises PDFTextExtractionNotAllowed when the document is not extractable
 	and ValueError when the document yields no pages.
 	"""
 	global text
 	# start clean so a document without pages cannot return a previous call's text
 	text = []
 	pdf_x = pdf_y = None
 	# a PDF is binary data, so it must not be opened in text mode
 	with open(pdfname, 'rb') as f:
 		parser = PDFParser(f)
 		document = PDFDocument(parser)
 		if not document.is_extractable:
 			raise PDFTextExtractionNotAllowed
 		laparams = LAParams()
 		rsrcmgr = PDFResourceManager()
 		device = PDFPageAggregator(rsrcmgr, laparams=laparams)
 		interpreter = PDFPageInterpreter(rsrcmgr, device)
 		for page in PDFPage.create_pages(document):
 			interpreter.process_page(page)
 			layout = device.get_result()
 			text = []
 			if method == 'basic':
 				parse_text_basic(layout)
 			elif method == 'spreadsheet':
 				parse_text_spreadsheet(layout)
 			pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
 		text.sort(key=lambda x: (-x.y0, x.x0))
 	if pdf_x is None:
 		# previously surfaced as an unbound-local error on the return line
 		raise ValueError('no pages found in %s' % pdfname)
 	return text, pdf_x, pdf_y
@@ -0,0 +1,103 @@
 import os
 import csv
 import glob
 import numpy as np
 import matplotlib.pyplot as plt
 from table import Table
 from pdf import get_pdf_info
 from morph_transform import morph
 def remove_close_values(ar):
 	"""Drop every value that np.isclose (atol=1) judges close to the last kept value.

 	ar -- sequence of numbers; the caller passes it sorted.

 	Returns a new list containing the surviving values in their original order.
 	"""
 	kept = []
 	for value in ar:
 		if kept and np.isclose(kept[-1], value, atol=1):
 			continue
 		kept.append(value)
 	return kept
 def get_row_idx(t, rows):
 	"""Return the index of the first (top, bottom) row strictly containing t's vertical midpoint.

 	t    -- object with y0 and y1 attributes.
 	rows -- sequence of (top, bottom) pairs.

 	Returns None when no row matches.
 	"""
 	for idx, row in enumerate(rows):
 		mid_y = abs(t.y0 + t.y1) / 2.0
 		if mid_y < row[0] and mid_y > row[1]:
 			return idx
 	return None
 def get_column_idx(t, columns):
 	"""Return the index of the first (left, right) column strictly containing t's horizontal midpoint.

 	t       -- object with x0 and x1 attributes.
 	columns -- sequence of (left, right) pairs.

 	Returns None when no column matches.
 	"""
 	for idx, column in enumerate(columns):
 		mid_x = abs(t.x0 + t.x1) / 2.0
 		if mid_x > column[0] and mid_x < column[1]:
 			return idx
 	return None
 def reduce_index(t, r_idx, c_idx):
 	"""Move a cell index to the start of the spanning cell it belongs to.

 	t            -- table whose cells[r][c] expose spanning_h, spanning_v,
 	                left and top flags.
 	r_idx, c_idx -- row and column index of the cell that was hit.

 	For a horizontally spanning cell the column index moves left until a
 	cell with its left flag set is reached; then, for a vertically spanning
 	cell, the row index moves up until a cell with its top flag set is
 	reached. Both walks stop at index 0, so a span without a closing edge
 	resolves to the first column/row instead of wrapping round to negative
 	indices.

 	Returns the adjusted (r_idx, c_idx).
 	"""
 	if t.cells[r_idx][c_idx].spanning_h:
 		while c_idx > 0 and not t.cells[r_idx][c_idx].left:
 			c_idx -= 1
 	if t.cells[r_idx][c_idx].spanning_v:
 		while r_idx > 0 and not t.cells[r_idx][c_idx].top:
 			r_idx -= 1
 	return r_idx, c_idx
 def fill(t):
 	"""Copy a neighbour's text into empty spanning cells.

 	t -- table whose cells[r][c] expose get_text(), add_text() and the
 	     spanning_h / spanning_v flags.

 	A cell whose text is blank receives the text of the cell to its left
 	when it spans horizontally, otherwise the text of the cell above when
 	it spans vertically. Cells in the first column / first row have no such
 	neighbour and are not filled from that direction, so the lookup can no
 	longer wrap round to the last column / row.

 	Returns t, modified in place.
 	"""
 	for i, row in enumerate(t.cells):
 		for j, cell in enumerate(row):
 			if cell.get_text().strip() != '':
 				continue
 			if cell.spanning_h and j > 0:
 				cell.add_text(row[j - 1].get_text())
 			elif cell.spanning_v and i > 0:
 				cell.add_text(t.cells[i - 1][j].get_text())
 	return t
 def spreadsheet(pdf_dir, filename, guess, scale):
 	print "working on", filename
 	imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
 	text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
 	tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)
 	num_tables = 0
 	for k in sorted(tables.keys(), reverse=True):
 		# find rows and columns that lie in table
 		lb = k
 		rt = tables[k]
 		v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
 		h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
 		columns = [v[0] for v in v_s]
 		rows = [h[1] for h in h_s]
 		# sort horizontal and vertical segments
 		columns = remove_close_values(sorted(columns))
 		rows = remove_close_values(sorted(rows, reverse=True))
 		# make grid using x and y coord of shortlisted rows and columns
 		columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)]
 		rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
 		table = Table(columns, rows)
 		# pass row and column line segments to table method and light up cell edges
 		table = table.set_edges(v_s, h_s)
 		# table set span method
 		table = table.set_spanning()
 		# fill text after sorting it
 		text.sort(key=lambda x: (-x.y0, x.x0))
 		for t in text:
 			r_idx = get_row_idx(t, rows)
 			c_idx = get_column_idx(t, columns)
 			if None in [r_idx, c_idx]:
 				pass
 			else:
 				r_idx, c_idx = reduce_index(table, r_idx, c_idx)
 				table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
 		if guess:
 			table = fill(table)
 		csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
 		csvpath = os.path.join(pdf_dir, csvname)
 		with open(csvpath, 'w') as outfile:
 			writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
 		 	for i in range(len(table.cells)):
 				writer.writerow([table.cells[i][j].get_text().strip().encode('utf-8') for j in range(len(table.cells[i]))])
 			print "saved as", csvname
 			print
 		num_tables += 1
@@ -0,0 +1,147 @@
 import numpy as np
 from cell import Cell
 class Table:
 	"""Grid of Cell objects built from column and row boundaries.

 	columns -- list of (left, right) x pairs, one per column.
 	rows    -- list of (top, bottom) y pairs, one per row.

 	cells[r][c] is the Cell for row r and column c.
 	"""
 	def __init__(self, columns, rows):
 		self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in columns] for r in rows]
 		self.columns = columns
 		self.rows = rows
 	def set_edges(self, vertical, horizontal, joint_tol=2):
 		"""Set the left/right/top/bottom flags of the cells touched by line segments.

 		vertical   -- segments (x, y_a, x, y_b); y_b is matched against row
 		              tops to find the first row, y_a to find the row at
 		              which the segment stops.
 		horizontal -- segments (x_a, y, x_b, y); x_a is matched against
 		              column left edges to find the first column, x_b to
 		              find the column at which the segment stops.
 		joint_tol  -- absolute tolerance for every coordinate match.

 		A segment whose position matches no column left edge (row top) is
 		taken to be the right (bottom) border of the table. A segment whose
 		stop coordinate matches nothing runs to the end of the table. A
 		segment whose start coordinate matches nothing is ignored.

 		Returns self.
 		"""
 		n_rows = len(self.rows)
 		n_cols = len(self.columns)
 		if not n_rows or not n_cols:
 			# no cells, so there is nothing to flag
 			return self
 		for v in vertical:
 			# column whose left edge the segment sits on
 			i = [n for n, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=joint_tol)]
 			# first row covered, and the row at which the segment stops
 			j = [n for n, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=joint_tol)]
 			k = [n for n, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=joint_tol)]
 			if not j:
 				continue
 			stop = k[0] if k else n_rows
 			for r in range(j[0], stop):
 				if not i: # only right edge
 					self.cells[r][n_cols - 1].right = True
 				elif i[0] == 0: # only left edge
 					self.cells[r][0].left = True
 				else: # both left and right edges
 					self.cells[r][i[0]].left = True
 					self.cells[r][i[0] - 1].right = True
 		for h in horizontal:
 			# row whose top edge the segment sits on
 			i = [n for n, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=joint_tol)]
 			# first column covered, and the column at which the segment stops
 			j = [n for n, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=joint_tol)]
 			k = [n for n, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=joint_tol)]
 			if not j:
 				continue
 			stop = k[0] if k else n_cols
 			for c in range(j[0], stop):
 				if not i: # only bottom edge
 					self.cells[n_rows - 1][c].bottom = True
 				elif i[0] == 0: # only top edge
 					self.cells[0][c].top = True
 				else: # both top and bottom edges
 					self.cells[i[0]][c].top = True
 					self.cells[i[0] - 1][c].bottom = True
 		return self
 	def set_spanning(self):
 		"""Flag cells that are part of a merged cell, judging by their missing edges.

 		With three edges set, a missing left or right edge marks the cell
 		spanning_h and a missing top or bottom edge marks it spanning_v.
 		With two edges set, left+right only marks spanning_v and top+bottom
 		only marks spanning_h. Other cells are left untouched.

 		Returns self.
 		"""
 		for row in self.cells:
 			for cell in row:
 				bound = cell.get_bounded_edges()
 				if bound == 3:
 					if not cell.left or not cell.right:
 						cell.spanning_h = True
 					elif not cell.top or not cell.bottom:
 						cell.spanning_v = True
 				elif bound == 2:
 					if cell.left and cell.right:
 						if not cell.top and not cell.bottom:
 							cell.spanning_v = True
 					elif cell.top and cell.bottom:
 						if not cell.left and not cell.right:
 							cell.spanning_h = True
 		return self