First commit 🔥

pull/2/head
Vinayak Mehta 2016-06-18 17:20:42 +05:30
commit eef07a86c6
14 changed files with 585 additions and 0 deletions

27
README.md 100644
View File

@ -0,0 +1,27 @@
Camelot
-------
usage: python2 camelot.py [options] pdf_file
Parse yo pdf!
positional arguments:
file
optional arguments:
-h, --help show this help message and exit
-p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be
parsed. Example: -p="1 3-5 9". (default: -p="1")
-f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default:
-f="csv")
-spreadsheet Extract data stored in pdfs with ruling lines.
-guess [Experimental] Guess the values in empty cells.
-s [SCALE] Scaling factor. Large scaling factor leads to smaller
lines being detected. (default: 15)
Under construction...

58
basic.py 100644
View File

@ -0,0 +1,58 @@
import os
import csv
import numpy as np
from pdf import get_pdf_info
def overlap(l):
    """Merge overlapping (start, end) intervals.

    The intervals are expected to be sorted by their start value (the
    caller in this file passes ``sorted(...)`` output).  Two neighbouring
    intervals are merged when the later one starts at or before the end of
    the previously merged one.

    Args:
        l: iterable of 2-item sequences ``(start, end)``.

    Returns:
        A new list of intervals in which no two entries overlap.  Intervals
        that were not merged are returned as the very same objects that
        were passed in; merged ones are tuples.
    """
    merged = []
    for higher in l:
        if merged and higher[0] <= merged[-1][1]:
            lower = merged[-1]
            # Grow the last interval so that it covers both of them.
            merged[-1] = (min(lower[0], higher[0]), max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged
def get_row_idx(t, rows):
    """Return the index of the first row band that contains ``t`` vertically.

    A band ``rows[i]`` contains ``t`` when ``t.y1 <= rows[i][0]`` and
    ``t.y0 >= rows[i][1]``.  Returns None when no band matches.
    """
    for idx, band in enumerate(rows):
        if t.y1 <= band[0] and t.y0 >= band[1]:
            return idx
    return None
def get_column_idx(t, columns):
    """Return the index of the first column band that contains ``t`` horizontally.

    A band ``columns[i]`` contains ``t`` when ``t.x0 >= columns[i][0]`` and
    ``t.x1 <= columns[i][1]``.  Returns None when no band matches.
    """
    for idx, band in enumerate(columns):
        if t.x0 >= band[0] and t.x1 <= band[1]:
            return idx
    return None
def basic(pdf_dir, filename):
    """Write the text of one PDF file to a CSV grid inferred from text positions.

    The text objects returned by ``get_pdf_info(..., 'basic')`` define the
    grid: every distinct ``(y1, y0)`` pair becomes a row (sorted descending)
    and the distinct ``(x0, x1)`` pairs, passed through ``overlap()``,
    become the columns.  Each text object is then placed in the first
    row/column that contains it.

    Args:
        pdf_dir: directory that holds ``filename``; the CSV is written here.
        filename: name of the PDF file inside ``pdf_dir``.  The CSV is named
            after the part of ``filename`` before its first '.'.

    Returns:
        None.  Text objects that fit no row or column are printed instead
        of being stored.
    """
    print("working on %s" % filename)
    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')

    rows = sorted(set((t.y1, t.y0) for t in text), reverse=True)
    columns = overlap(sorted(set((t.x0, t.x1) for t in text)))

    table = [['' for _ in columns] for _ in rows]
    for t in text:
        r_idx = get_row_idx(t, rows)
        c_idx = get_column_idx(t, columns)
        if r_idx is None or c_idx is None:
            print(t)
        else:
            table[r_idx][c_idx] = t.get_text().strip('\n')

    csvname = filename.split('.')[0] + '.csv'
    csvpath = os.path.join(pdf_dir, csvname)
    with open(csvpath, 'w') as outfile:
        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
        for row in table:
            # The Python 2 csv writer cannot handle non-ASCII unicode
            # objects, so encode explicitly (spreadsheet() does the same).
            writer.writerow([cell.encode('utf-8') for cell in row])

BIN
basic.pyc 100644

Binary file not shown.

94
camelot.py 100644
View File

@ -0,0 +1,94 @@
import os
import re
import glob
import shutil
import subprocess
import argparse
from basic import basic
from spreadsheet import spreadsheet
# Matches a run of decimal digits; filesort() uses the first match found in a
# file name as its numeric sort key.
pno = re.compile(r'\d+')
def mkdir(directory):
    """Create ``directory`` (and missing parents) unless it already is a directory."""
    if os.path.isdir(directory):
        return
    os.makedirs(directory)
def filesort(filename):
    """Sort key: the first integer found in the last '/'-separated path component.

    Raises IndexError when that component contains no digits.
    """
    digit_runs = pno.findall(filename.split('/')[-1])
    return int(digit_runs[0])
# Working area: every run copies the input PDF into a fresh, randomly named
# sub-directory of this folder and writes all of its output there.
CAMELOT_DIR = '.camelot/'
mkdir(CAMELOT_DIR)

parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file')
parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9". (default: -p="1")')
parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")')
parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines.')
parser.add_argument('-guess', action='store_true', dest='guess', help='[Experimental] Guess the values in empty cells.')
# const=15 keeps a bare "-s" (no value) from turning the scale into None.
parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, const=15, type=int)
parser.add_argument('file', nargs=1)
result = parser.parse_args()

# Expand the page specification into a list of page-number strings.  Both
# -p="1 3-5 9" (one quoted token) and -p 1 3-5 9 (several tokens) are
# accepted, so every token is inspected, not just the first one.
if result.pages:
    p = []
    try:
        for spec in result.pages:
            for r in spec.split():
                if '-' in r:
                    a, b = r.split('-')
                    p.extend(str(i) for i in range(int(a), int(b) + 1))
                else:
                    p.append(str(int(r)))
    except ValueError:
        parser.error('invalid page specification: %r' % (result.pages,))
else:
    p = ['1']
# Sort numerically; a plain string sort would put '10' before '2'.
p = sorted(set(p), key=int)

f = result.format if result.format else ['csv']
s = bool(result.spreadsheet)

pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
mkdir(pdf_dir)
filename = result.file[0].split('/')[-1]
shutil.copy(result.file[0], os.path.join(pdf_dir, filename))

print("separating pdf into pages")
print("")
for page in p:
    subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])

if s:
    print("using the spreadsheet method")
    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
        page_name = g.split('/')[-1]
        print("converting %s to image" % page_name)
        # Argument list instead of a shell string: no quoting problems.
        subprocess.call(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png'])
        try:
            spreadsheet(pdf_dir, page_name, result.guess, result.scale)
        except Exception as e:
            # Best effort: a failing page must not stop the remaining pages,
            # but the failure is reported instead of being silently dropped.
            print("skipping %s: %s" % (page_name, e))
else:
    print("using the basic method")
    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
        basic(pdf_dir, g.split('/')[-1])

if result.format == ['xlsx']:
    # Imported lazily so that plain csv runs do not need pyexcel_xlsx.
    import csv
    from pyexcel_xlsx import save_data
    from collections import OrderedDict
    data = OrderedDict()
    for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
        csv_name = c.split('/')[-1]
        print("adding %s to excel file" % csv_name)
        with open(c, 'r') as csvfile:
            # One sheet per csv file, named after the file without extension.
            data[csv_name.split('.')[0]] = list(csv.reader(csvfile))
    xlsxname = filename.split('.')[0] + '.xlsx'
    xlsxpath = os.path.join(pdf_dir, xlsxname)
    save_data(xlsxpath, data)
    print("")
    print("saved as %s" % xlsxname)

23
cell.py 100644
View File

@ -0,0 +1,23 @@
class Cell:
    """A rectangular table cell with edge flags, span flags and text.

    The corner attributes are built from the constructor arguments as
    ``lb=(x1, y1)``, ``lt=(x1, y2)``, ``rb=(x2, y1)`` and ``rt=(x2, y2)``
    (presumably left/right + bottom/top, i.e. y1 is the lower edge --
    confirm against the caller).
    """

    def __init__(self, x1, y1, x2, y2):
        self.bbox = (x1, y1, x2, y2)
        self.lb, self.lt = (x1, y1), (x1, y2)
        self.rb, self.rt = (x2, y1), (x2, y2)
        # Edge flags start out unset; they are switched on from outside.
        self.left = self.right = self.top = self.bottom = False
        self.spanning_h = self.spanning_v = False
        self.text = ''

    def add_text(self, text):
        """Append ``text`` to the text already stored in the cell."""
        self.text = self.text + text

    def get_text(self):
        """Return the text accumulated so far."""
        return self.text

    def get_bounded_edges(self):
        """Return how many of the four edge flags are set (0-4)."""
        return sum((self.top, self.bottom, self.left, self.right))

BIN
cell.pyc 100644

Binary file not shown.

79
morph_transform.py 100644
View File

@ -0,0 +1,79 @@
import cv2
import sys
import subprocess
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar
def transform(x, y, img_x, img_y, pdf_x, pdf_y):
    """Rescale a point from image space to PDF space, flipping the y axis.

    x is multiplied by ``pdf_x / img_x``; y is first replaced by its distance
    from ``img_y`` and then multiplied by ``pdf_y / img_y``.

    Returns:
        The converted ``(x, y)`` tuple.
    """
    x_ratio = pdf_x / float(img_x)
    y_ratio = pdf_y / float(img_y)
    flipped_y = abs(y - img_y)
    return x * x_ratio, flipped_y * y_ratio
# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
def morph(imagename, p_x, p_y, s):
    """Detect table outlines and ruling lines in a page image.

    The image is thresholded, then eroded and dilated with a tall 1-pixel
    wide kernel (vertical lines) and a wide 1-pixel tall kernel (horizontal
    lines).  All coordinates are converted with ``transform()`` before they
    are returned.

    Args:
        imagename: path of the image to read with ``cv2.imread``.
        p_x, p_y: width and height passed to ``transform()`` as the PDF size.
        s: scale divisor; kernel lengths are image height / s and image
            width / s, so a larger value detects shorter lines.

    Returns:
        ``(tables, v_segments, h_segments)`` where

        * ``tables`` maps ``(x1, y2)`` to ``(x2, y1)`` for each of the (at
          most 10) largest external contours of the combined line mask that
          contain more than 4 joint contours,
        * ``v_segments`` holds ``(x, y2, x, y1)`` tuples, x being the
          horizontal centre of each vertical line's bounding box,
        * ``h_segments`` holds ``(x1, y, x2, y)`` tuples, y being the
          vertical centre of each horizontal line's bounding box.

    Raises:
        IOError: if the image cannot be read.
    """
    img = cv2.imread(imagename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("could not read image: %s" % imagename)
    img_x, img_y = img.shape[1], img.shape[0]
    pdf_x, pdf_y = p_x, p_y

    def to_pdf(px, py):
        # Image pixel -> PDF coordinate for this particular page.
        return transform(px, py, img_x, img_y, pdf_x, pdf_y)

    def find_contours(image, mode):
        # findContours returns 2 or 3 values depending on the OpenCV
        # version; the contour list is always the second-to-last one.
        return cv2.findContours(image, mode, cv2.CHAIN_APPROX_SIMPLE)[-2]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    th1 = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2)

    scale = s
    # Kernel sizes must be integers, hence floor division.
    verticalsize = th1.shape[0] // scale
    horizontalsize = th1.shape[1] // scale
    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))

    vertical = cv2.erode(th1, ver, (-1, -1))
    vertical = cv2.dilate(vertical, ver, (-1, -1))
    horizontal = cv2.erode(th1, hor, (-1, -1))
    horizontal = cv2.dilate(horizontal, hor, (-1, -1))

    mask = vertical + horizontal
    joints = np.bitwise_and(vertical, horizontal)

    contours = find_contours(mask, cv2.RETR_EXTERNAL)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

    tables = {}
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        # Work on a copy: older OpenCV versions modify the array passed to
        # findContours, which would corrupt `joints` for later contours.
        jmask = joints[y:y + h, x:x + w].copy()
        jc = find_contours(jmask, cv2.RETR_CCOMP)
        if len(jc) <= 4:  # remove contours with <= 4 joints
            continue
        x1, y1 = to_pdf(x, y)
        x2, y2 = to_pdf(x + w, y + h)
        tables[(x1, y2)] = (x2, y1)

    v_segments, h_segments = [], []
    for vc in find_contours(vertical, cv2.RETR_EXTERNAL):
        x, y, w, h = cv2.boundingRect(vc)
        x1, y1 = to_pdf(x, y)
        x2, y2 = to_pdf(x + w, y + h)
        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))

    for hc in find_contours(horizontal, cv2.RETR_EXTERNAL):
        x, y, w, h = cv2.boundingRect(hc)
        x1, y1 = to_pdf(x, y)
        x2, y2 = to_pdf(x + w, y + h)
        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))

    return tables, v_segments, h_segments

BIN
morph_transform.pyc 100644

Binary file not shown.

54
pdf.py 100644
View File

@ -0,0 +1,54 @@
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
# Module-level accumulator shared by the parse_text_* helpers, which append
# to it; get_pdf_info() rebinds it to a fresh list before parsing.
text = []
def parse_text_basic(layout):
    """Collect every LTTextLineHorizontal below ``layout`` into the global ``text``.

    Walks ``layout._objs`` recursively, depth first.  Objects without an
    ``_objs`` attribute end the recursion.
    """
    global text
    try:
        children = layout._objs
    except AttributeError:
        # Leaf object: nothing to descend into.
        return
    for child in children:
        if type(child) is LTTextLineHorizontal:
            text.append(child)
        parse_text_basic(child)
def parse_text_spreadsheet(layout):
    """Collect every LTChar below ``layout`` into the global ``text``.

    Walks ``layout._objs`` recursively, depth first.  Objects without an
    ``_objs`` attribute end the recursion.
    """
    global text
    try:
        children = layout._objs
    except AttributeError:
        # Leaf object: nothing to descend into.
        return
    for child in children:
        if type(child) is LTChar:
            text.append(child)
        parse_text_spreadsheet(child)
def get_pdf_info(pdfname, method):
    """Extract positioned text objects and the page size from a PDF file.

    Every page of the document is processed, and the values of the last page
    processed are the ones returned (the callers in this project pass
    single-page files).

    Args:
        pdfname: path of the PDF file.
        method: 'basic' collects LTTextLineHorizontal objects, 'spreadsheet'
            collects LTChar objects; any other value collects nothing.

    Returns:
        ``(text, pdf_x, pdf_y)``: the collected objects sorted top to bottom
        (descending ``y0``) and then left to right (ascending ``x0``), plus
        ``layout.bbox[2]`` and ``layout.bbox[3]`` of the page.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
        ValueError: if the document contains no pages.
    """
    global text
    # Start clean so results of an earlier call can never leak through.
    text = []
    pdf_x = pdf_y = None
    # PDF is a binary format, so the file must be opened in binary mode.
    with open(pdfname, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # The parse helpers append to the module-level list.
            text = []
            if method == 'basic':
                parse_text_basic(layout)
            elif method == 'spreadsheet':
                parse_text_spreadsheet(layout)
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
            text.sort(key=lambda obj: (-obj.y0, obj.x0))
    if pdf_x is None:
        raise ValueError("no pages found in %s" % pdfname)
    return text, pdf_x, pdf_y

BIN
pdf.pyc 100644

Binary file not shown.

103
spreadsheet.py 100644
View File

@ -0,0 +1,103 @@
import os
import csv
import glob
import numpy as np
import matplotlib.pyplot as plt
from table import Table
from pdf import get_pdf_info
from morph_transform import morph
def remove_close_values(ar):
    """Drop every value that is close to the most recently kept value.

    Closeness is decided by ``np.isclose(kept, value, atol=1)``.  The first
    value is always kept; input order is preserved.
    """
    kept = []
    for value in ar:
        if kept and np.isclose(kept[-1], value, atol=1):
            continue
        kept.append(value)
    return kept
def get_row_idx(t, rows):
    """Return the index of the first row band holding the vertical midpoint of ``t``.

    The midpoint ``abs(t.y0 + t.y1) / 2.0`` must lie strictly below
    ``rows[i][0]`` and strictly above ``rows[i][1]``.  Returns None when no
    band matches.
    """
    for idx, band in enumerate(rows):
        mid_y = abs(t.y0 + t.y1) / 2.0
        if mid_y < band[0] and mid_y > band[1]:
            return idx
    return None
def get_column_idx(t, columns):
    """Return the index of the first column band holding the horizontal midpoint of ``t``.

    The midpoint ``abs(t.x0 + t.x1) / 2.0`` must lie strictly right of
    ``columns[i][0]`` and strictly left of ``columns[i][1]``.  Returns None
    when no band matches.
    """
    for idx, band in enumerate(columns):
        mid_x = abs(t.x0 + t.x1) / 2.0
        if mid_x > band[0] and mid_x < band[1]:
            return idx
    return None
def reduce_index(t, r_idx, c_idx):
    """Move a cell index to the cell where its span starts.

    If the cell is flagged ``spanning_h`` the column index moves left until
    a cell with its ``left`` flag set is reached; if the resulting cell is
    flagged ``spanning_v`` the row index moves up until a cell with its
    ``top`` flag set is reached.  Both walks stop at index 0, so a missing
    edge flag can never make the index wrap around to the far side of the
    table through negative indexing.

    Args:
        t: object whose ``cells[row][col]`` entries expose ``spanning_h``,
            ``spanning_v``, ``left`` and ``top``.
        r_idx, c_idx: starting row and column index.

    Returns:
        The adjusted ``(r_idx, c_idx)`` tuple.
    """
    if t.cells[r_idx][c_idx].spanning_h:
        while c_idx > 0 and not t.cells[r_idx][c_idx].left:
            c_idx -= 1
    if t.cells[r_idx][c_idx].spanning_v:
        while r_idx > 0 and not t.cells[r_idx][c_idx].top:
            r_idx -= 1
    return r_idx, c_idx
def fill(t):
    """Copy neighbouring text into empty spanning cells.

    Cells are visited row by row, left to right.  A cell whose stripped text
    is empty receives the text of its left neighbour when it is flagged
    ``spanning_h``, otherwise the text of the cell above when it is flagged
    ``spanning_v``.  Cells in the first column / first row have no such
    neighbour and are left untouched (index -1 would silently read from the
    opposite side of the table).

    Args:
        t: object whose ``cells[row][col]`` entries expose ``get_text()``,
            ``add_text()``, ``spanning_h`` and ``spanning_v``.

    Returns:
        ``t`` itself, modified in place.
    """
    for i, row in enumerate(t.cells):
        for j, cell in enumerate(row):
            if cell.get_text().strip() != '':
                continue
            if cell.spanning_h:
                if j > 0:
                    cell.add_text(row[j - 1].get_text())
            elif cell.spanning_v:
                if i > 0:
                    cell.add_text(t.cells[i - 1][j].get_text())
    return t
def spreadsheet(pdf_dir, filename, guess, scale):
    """Extract every ruled table found on one PDF page into its own CSV file.

    The page image (same name as ``filename`` with a .png extension, expected
    to exist already in ``pdf_dir``) is analysed with ``morph()`` to find
    table outlines and line segments.  For each table a grid is built from
    the segments lying inside it, the characters returned by
    ``get_pdf_info(..., 'spreadsheet')`` are dropped into the grid cells,
    and the result is written to ``<name>_table_<n>.csv`` in ``pdf_dir``.

    Args:
        pdf_dir: directory holding the PDF page and its image; output
            directory as well.
        filename: name of the PDF page file inside ``pdf_dir``.
        guess: when true, empty spanning cells are filled via ``fill()``.
        scale: forwarded to ``morph()``.

    Returns:
        None.
    """
    print("working on %s" % filename)
    stem = filename.split('.')[0]
    imagename = os.path.join(pdf_dir, stem + '.png')
    text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
    tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)

    # The text order does not depend on the table, so sort it only once.
    text.sort(key=lambda obj: (-obj.y0, obj.x0))

    for num_tables, lb in enumerate(sorted(tables, reverse=True)):
        # find rows and columns that lie in table (2 units of slack)
        rt = tables[lb]
        v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
        h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]

        # x coords of vertical lines ascending, y coords of horizontal lines
        # descending, with near-duplicates removed
        columns = remove_close_values(sorted(v[0] for v in v_s))
        rows = remove_close_values(sorted((h[1] for h in h_s), reverse=True))

        # make grid: consecutive line positions delimit one column / row
        columns = [(columns[i], columns[i + 1]) for i in range(len(columns) - 1)]
        rows = [(rows[i], rows[i + 1]) for i in range(len(rows) - 1)]

        table = Table(columns, rows)
        # pass row and column line segments to table method and light up cell edges
        table = table.set_edges(v_s, h_s)
        # table set span method
        table = table.set_spanning()

        for t in text:
            r_idx = get_row_idx(t, rows)
            c_idx = get_column_idx(t, columns)
            if r_idx is None or c_idx is None:
                # character lies outside this table's grid
                continue
            r_idx, c_idx = reduce_index(table, r_idx, c_idx)
            table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))

        if guess:
            table = fill(table)

        csvname = stem + ('_table_%d' % (num_tables + 1)) + '.csv'
        csvpath = os.path.join(pdf_dir, csvname)
        with open(csvpath, 'w') as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
            for row in table.cells:
                writer.writerow([cell.get_text().strip().encode('utf-8') for cell in row])
        print("saved as %s" % csvname)
        print("")

BIN
spreadsheet.pyc 100644

Binary file not shown.

147
table.py 100644
View File

@ -0,0 +1,147 @@
import numpy as np
from cell import Cell
class Table:
    """Grid of Cell objects built from column and row bands.

    ``columns`` is a list of ``(c0, c1)`` pairs and ``rows`` a list of
    ``(r0, r1)`` pairs; the cell at ``cells[row][col]`` is created as
    ``Cell(c0, r1, c1, r0)``.
    """

    def __init__(self, columns, rows):
        self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in columns] for r in rows]
        self.columns = columns
        self.rows = rows

    def set_edges(self, vertical, horizontal):
        """Switch on the edge flags of all cells touched by the line segments.

        Args:
            vertical: segments indexed as ``v[0]`` = x, ``v[3]`` = y where
                the segment starts (matched against row starts ``rows[i][0]``)
                and ``v[1]`` = y where it ends.
            horizontal: segments indexed as ``h[1]`` = y, ``h[0]`` = x where
                the segment starts (matched against column starts
                ``columns[i][0]``) and ``h[2]`` = x where it ends.

        A segment whose position matches no column/row start is treated as
        the far (right/bottom) border of the table.  A segment whose end
        matches no start runs to the end of the table.  A segment whose
        *start* matches nothing cannot be placed and is skipped.

        Returns:
            self, so calls can be chained.
        """
        n_rows = len(self.rows)
        n_cols = len(self.columns)

        for v in vertical:
            # find closest x coord, then the rows the segment runs through
            i = [idx for idx, t in enumerate(self.columns) if np.isclose(v[0], t[0])]
            j = [idx for idx, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)]
            k = [idx for idx, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)]
            if not j:
                # start of the segment is not aligned with any row
                continue
            covered = range(j[0], k[0] if k else n_rows)
            if i == [0]:  # only left edge
                for r in covered:
                    self.cells[r][0].left = True
            elif i == []:  # only right edge
                for r in covered:
                    self.cells[r][n_cols - 1].right = True
            else:  # both left and right edges
                col = i[0]
                for r in covered:
                    self.cells[r][col].left = True
                    if col > 0:
                        # col - 1 == -1 would address the last column
                        self.cells[r][col - 1].right = True

        for h in horizontal:
            # find closest y coord, then the columns the segment runs through
            i = [idx for idx, t in enumerate(self.rows) if np.isclose(h[1], t[0])]
            j = [idx for idx, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)]
            k = [idx for idx, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)]
            if not j:
                # start of the segment is not aligned with any column
                continue
            covered = range(j[0], k[0] if k else n_cols)
            if i == [0]:  # only top edge
                for c in covered:
                    self.cells[0][c].top = True
            elif i == []:  # only bottom edge
                for c in covered:
                    self.cells[n_rows - 1][c].bottom = True
            else:  # both top and bottom edges
                row = i[0]
                for c in covered:
                    self.cells[row][c].top = True
                    if row > 0:
                        # row - 1 == -1 would address the last row
                        self.cells[row - 1][c].bottom = True
        return self

    def set_spanning(self):
        """Flag cells that are part of a horizontal or vertical span.

        Based on which edge flags are set:

        * 4 edges: ordinary cell, nothing is flagged.
        * 3 edges: missing left or right edge -> ``spanning_h``;
          missing top or bottom edge -> ``spanning_v``.
        * 2 edges: only left and right -> ``spanning_v``;
          only top and bottom -> ``spanning_h``.

        Returns:
            self, so calls can be chained.
        """
        for row in self.cells:
            for cell in row:
                bound = cell.get_bounded_edges()
                if bound == 3:
                    if not cell.left or not cell.right:
                        cell.spanning_h = True
                    else:
                        cell.spanning_v = True
                elif bound == 2:
                    if cell.left and cell.right:
                        cell.spanning_v = True
                    elif cell.top and cell.bottom:
                        cell.spanning_h = True
        return self

BIN
table.pyc 100644

Binary file not shown.