Make code PEP8 compliant

pull/2/head
Vinayak Mehta 2016-07-11 15:19:38 +05:30
parent f6869a9af4
commit b87d2350dc
9 changed files with 765 additions and 489 deletions

View File

@ -1,30 +1,70 @@
Camelot Camelot
------- -------
usage: python2 camelot.py [options] pdf_file Description: Parse tables from pdfs!
Parse yo pdf! Dependencies
Install
Usage: python2 camelot.py [options] file
positional arguments: positional arguments:
file file
optional arguments: optional arguments:
-h, --help show this help message and exit
-p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be -h, --help
parsed. Example: -p="1 3-5 9", -p="all" (default:
-p="1")
-f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default: show this help message and exit
-f="csv")
-spreadsheet Extract data stored in pdfs with ruling lines. -p, --pages PAGES [PAGES ...]
(default: False)
-F ORIENTATION Fill the values in empty cells. Example: -F="h", Specify the page numbers and/or page ranges to be
-F="v", -F="hv" (default: None) parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-s [SCALE] Scaling factor. Large scaling factor leads to smaller -f, --format FORMAT
lines being detected. (default: 15)
Under construction... Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-m, --spreadsheet
Extract tables with ruling lines. (default: False)
-F, --fill FILL
Fill the values in empty cells horizontally(h) and/or
vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-s, --scale [SCALE]
Scaling factor. Large scaling factor leads to smaller
lines being detected. (default: 15)
-j, --jtol [JTOL]
Tolerance to account for when comparing joint and line
coordinates. (default: 2)
-M, --mtol [MTOL]
Tolerance to account for when merging lines which are
very close. (default: 2)
-i, --invert
Make sure lines are in foreground. (default: False)
-d, --debug DEBUG
Debug by visualizing contours, lines, joints, tables.
Example: --debug="contours"
-o, --output OUTPUT
Specify output directory.
Development: Code, Contributing, Tests
License

109
basic.py
View File

@ -4,55 +4,76 @@ import numpy as np
from pdf import get_pdf_info from pdf import get_pdf_info
def overlap(l): def overlap(l):
merged = [] merged = []
for higher in l: for higher in l:
if not merged: if not merged:
merged.append(higher) merged.append(higher)
else: else:
lower = merged[-1] lower = merged[-1]
if higher[0] >= lower[0] and higher[1] <= lower[1]: if higher[0] <= lower[1]:
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0]) lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound) merged[-1] = (lower_bound, upper_bound)
else: else:
merged.append(higher) merged.append(higher)
return merged return merged
def get_row_idx(t, rows): def get_row_idx(t, rows):
for r in range(len(rows)): for r in range(len(rows)):
if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]: if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
return r return r
def get_column_idx(t, columns): def get_column_idx(t, columns):
for c in range(len(columns)): for c in range(len(columns)):
if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]: if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
return c return c
def basic(pdf_dir, filename): def basic(pdf_dir, filename):
print "working on", filename print "working on", filename
text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic') text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
rows, columns = [], [] text.sort(key=lambda x: (-x.y0, x.x0))
for t in text: y_last = 0
rows.append((t.y1, t.y0)) data = []
columns.append((t.x0, t.x1)) temp = []
rows = list(set(rows)) elements = []
rows = sorted(rows, reverse=True) for t in text:
columns = list(set(columns)) # is checking for upright necessary?
columns = sorted(columns) # if t.get_text().strip() and all([obj.upright for obj in t._objs if
columns = overlap(columns) # type(obj) is LTChar]):
table = [['' for c in columns] for r in rows] if t.get_text().strip():
for t in text: if not np.isclose(y_last, t.y0, atol=2):
r_idx = get_row_idx(t, rows) y_last = t.y0
c_idx = get_column_idx(t, columns) elements.append(len(temp))
if None in [r_idx, c_idx]: data.append(temp)
print t temp = []
else: temp.append(t)
table[r_idx][c_idx] = t.get_text().strip('\n') # a table can't have just 1 column, can it?
elements = filter(lambda x: x != 1, elements)
# mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count)
mode = max(set(elements), key=elements.count)
columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
columns = overlap(sorted(columns))
columns = [(c[0] + c[1]) / 2.0 for c in columns]
csvname = filename.split('.')[0] + '.csv' output = [['' for c in columns] for d in data]
csvpath = os.path.join(pdf_dir, csvname) for row, d in enumerate(data):
with open(csvpath, 'w') as outfile: for t in d:
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) cog = (t.x0 + t.x1) / 2.0
for cell in table: diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
writer.writerow([ce for ce in cell]) idx = min(diff, key=lambda x: x[1])
if output[row][idx[0]]:
output[row][idx[0]] += ' ' + t.get_text().strip()
else:
output[row][idx[0]] = t.get_text().strip()
csvname = filename.split('.')[0] + '.csv'
csvpath = os.path.join(pdf_dir, csvname)
with open(csvpath, 'w') as outfile:
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
for row in output:
writer.writerow([cell.encode('utf-8') for cell in row])

View File

@ -12,90 +12,118 @@ from spreadsheet import spreadsheet
pno = re.compile(r'\d+') pno = re.compile(r'\d+')
def mkdir(directory): def mkdir(directory):
if not os.path.isdir(directory): if not os.path.isdir(directory):
os.makedirs(directory) os.makedirs(directory)
def filesort(filename): def filesort(filename):
filename = filename.split('/')[-1] filename = filename.split('/')[-1]
num = pno.findall(filename) num = pno.findall(filename)
if len(num) == 2: if len(num) == 2:
return (int(num[0]), int(num[1])) return (int(num[0]), int(num[1]))
else: else:
return (int(num[0]), 0) return (int(num[0]), 0)
start_time = time.time() start_time = time.time()
CAMELOT_DIR = '.camelot/' CAMELOT_DIR = '.camelot/'
mkdir(CAMELOT_DIR) mkdir(CAMELOT_DIR)
parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file') parser = argparse.ArgumentParser(
parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")') description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"]) parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)') help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None) parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet',
help='Extract tables with ruling lines. (default: False)')
parser.add_argument('-F', '--fill', action='store', dest='fill',
help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale',
help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
parser.add_argument('-j', '--jtol', nargs='?', action='store',
dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
parser.add_argument('-M', '--mtol', nargs='?', action='store',
dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
parser.add_argument('-i', '--invert', action='store_true', dest='invert',
help='Make sure lines are in foreground. (default: False)')
parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
help='Specify output directory.')
parser.add_argument('file', nargs=1) parser.add_argument('file', nargs=1)
result = parser.parse_args() result = parser.parse_args()
if result.pages: if result.pages:
if result.pages == ['all']: if result.pages == ['all']:
p = result.pages p = result.pages
else: else:
p = [] p = []
for r in result.pages[0].split(' '): for r in result.pages[0].split(' '):
if '-' in r: if '-' in r:
a, b = r.split('-') a, b = r.split('-')
a, b = int(a), int(b) a, b = int(a), int(b)
p.extend([str(i) for i in range(a, b + 1)]) p.extend([str(i) for i in range(a, b + 1)])
else: else:
p.extend([str(r)]) p.extend([str(r)])
else: else:
p = ['1'] p = ['1']
p = sorted(set(p)) p = sorted(set(p))
s = result.spreadsheet
pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
mkdir(pdf_dir)
filename = result.file[0].split('/')[-1] filename = result.file[0].split('/')[-1]
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG) # pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
mkdir(pdf_dir)
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
0] + '.log'), filemode='w', level=logging.DEBUG)
shutil.copy(result.file[0], os.path.join(pdf_dir, filename)) shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
print "separating pdf into pages" print "separating pdf into pages"
print print
if p == ['all']: if p == ['all']:
subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')]) subprocess.call(['pdfseparate', os.path.join(
pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
else: else:
for page in p: for page in p:
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
if s: if result.spreadsheet:
print "using the spreadsheet method" print "using the spreadsheet method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
print "converting", g.split('/')[-1], "to image" print "converting", g.split('/')[-1], "to image"
os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png'])) os.system(' '.join(['convert', '-density', '300',
spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale) g, '-depth', '8', g[:-4] + '.png']))
try:
spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
result.jtol, result.mtol, result.invert, result.debug)
except:
logging.error("Couldn't parse " + g.split('/')[-1])
print "Couldn't parse", g.split('/')[-1]
else: else:
print "using the basic method" print "using the basic method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
basic(pdf_dir, g.split('/')[-1]) basic(pdf_dir, g.split('/')[-1])
if result.format == ['xlsx']: if result.format == ['xlsx']:
import csv import csv
from pyexcel_xlsx import save_data from pyexcel_xlsx import save_data
from collections import OrderedDict from collections import OrderedDict
data = OrderedDict() data = OrderedDict()
for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort): for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
print "adding", c.split('/')[-1], "to excel file" print "adding", c.split('/')[-1], "to excel file"
with open(c, 'r') as csvfile: with open(c, 'r') as csvfile:
reader = csv.reader(csvfile) reader = csv.reader(csvfile)
data.update({c.split('/')[-1].split('.')[0]: [row for row in reader]}) data.update({c.split('/')[-1].split('.')
xlsxname = filename.split('.')[0] + '.xlsx' [0]: [row for row in reader]})
xlsxpath = os.path.join(pdf_dir, xlsxname) xlsxname = filename.split('.')[0] + '.xlsx'
save_data(xlsxpath, data) xlsxpath = os.path.join(pdf_dir, xlsxname)
print save_data(xlsxpath, data)
print "saved as", xlsxname print
print "saved as", xlsxname
print "finished in", time.time() - start_time, "seconds" print "finished in", time.time() - start_time, "seconds"
logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds") logging.info("Time taken for " + filename + ": " +
str(time.time() - start_time) + " seconds")

39
cell.py
View File

@ -1,23 +1,24 @@
class Cell: class Cell:
def __init__(self, x1, y1, x2, y2):
self.lb = (x1, y1)
self.lt = (x1, y2)
self.rb = (x2, y1)
self.rt = (x2, y2)
self.bbox = (x1, y1, x2, y2)
self.left = False
self.right = False
self.top = False
self.bottom = False
self.text = ''
self.spanning_h = False
self.spanning_v = False
def add_text(self, text): def __init__(self, x1, y1, x2, y2):
self.text += text self.lb = (x1, y1)
self.lt = (x1, y2)
self.rb = (x2, y1)
self.rt = (x2, y2)
self.bbox = (x1, y1, x2, y2)
self.left = False
self.right = False
self.top = False
self.bottom = False
self.text = ''
self.spanning_h = False
self.spanning_v = False
def get_text(self): def add_text(self, text):
return self.text self.text += text
def get_bounded_edges(self): def get_text(self):
return self.top + self.bottom + self.left + self.right return self.text
def get_bounded_edges(self):
return self.top + self.bottom + self.left + self.right

View File

@ -1,73 +1,75 @@
import cv2 import cv2
import numpy as np import numpy as np
def transform(x, y, img_x, img_y, pdf_x, pdf_y):
x *= pdf_x / float(img_x)
y = abs(y - img_y)
y *= pdf_y / float(img_y)
return x, y
# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ def morph_transform(imagename, s, invert):
def morph(imagename, p_x, p_y, s): # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
img = cv2.imread(imagename) img = cv2.imread(imagename)
img_x, img_y = img.shape[1], img.shape[0] img_x, img_y = img.shape[1], img.shape[0]
pdf_x, pdf_y = p_x, p_y gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # empirical result taken from
# empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) if invert:
vertical = threshold threshold = cv2.adaptiveThreshold(
horizontal = threshold gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
else:
threshold = cv2.adaptiveThreshold(np.invert(
gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
vertical = threshold
horizontal = threshold
scale = s scale = s
verticalsize = vertical.shape[0] / scale verticalsize = vertical.shape[0] / scale
horizontalsize = horizontal.shape[1] / scale horizontalsize = horizontal.shape[1] / scale
ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
vertical = cv2.erode(vertical, ver, (-1, -1)) vertical = cv2.erode(vertical, ver, (-1, -1))
vertical = cv2.dilate(vertical, ver, (-1, -1)) vertical = cv2.dilate(vertical, ver, (-1, -1))
horizontal = cv2.erode(horizontal, hor, (-1, -1)) horizontal = cv2.erode(horizontal, hor, (-1, -1))
horizontal = cv2.dilate(horizontal, hor, (-1, -1)) horizontal = cv2.dilate(horizontal, hor, (-1, -1))
mask = vertical + horizontal mask = vertical + horizontal
joints = np.bitwise_and(vertical, horizontal) joints = np.bitwise_and(vertical, horizontal)
_, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) _, contours, _ = cv2.findContours(
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
tables = {} tables = {}
for c in contours: for c in contours:
c_poly = cv2.approxPolyDP(c, 3, True) c_poly = cv2.approxPolyDP(c, 3, True)
x, y, w, h = cv2.boundingRect(c_poly) x, y, w, h = cv2.boundingRect(c_poly)
# find number of non-zero values in joints using what boundingRect returns # find number of non-zero values in joints using what boundingRect
roi = joints[y:y+h, x:x+w] # returns
_, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) roi = joints[y:y + h, x:x + w]
if len(jc) <= 4: # remove contours with less than <=4 joints _, jc, _ = cv2.findContours(
continue roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
joint_coords = [] if len(jc) <= 4: # remove contours with less than <=4 joints
for j in jc: continue
jx, jy, jw, jh = cv2.boundingRect(j) joint_coords = []
c1, c2 = x + (2*jx + jw) / 2, y + (2*jy + jh) / 2 for j in jc:
c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y) jx, jy, jw, jh = cv2.boundingRect(j)
joint_coords.append((c1, c2)) c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) joint_coords.append((c1, c2))
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) tables[(x, y + h, x + w, y)] = joint_coords
tables[(x1, y2, x2, y1)] = joint_coords
v_segments, h_segments = [], [] v_segments, h_segments = [], []
_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) _, vcontours, _ = cv2.findContours(
for vc in vcontours: vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
x, y, w, h = cv2.boundingRect(vc) for vc in vcontours:
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) x, y, w, h = cv2.boundingRect(vc)
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) x1, x2 = x, x + w
v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) y1, y2 = y, y + h
v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
_, hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) _, hcontours, _ = cv2.findContours(
for hc in hcontours: horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
x, y, w, h = cv2.boundingRect(hc) for hc in hcontours:
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) x, y, w, h = cv2.boundingRect(hc)
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) x1, x2 = x, x + w
h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) y1, y2 = y, y + h
h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
return tables, v_segments, h_segments return tables, v_segments, h_segments

84
pdf.py
View File

@ -8,47 +8,51 @@ from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
text = []
def parse_text_basic(layout): def parse_text_basic(layout, t=None):
global text if t is None:
try: t = []
for obj in layout._objs: try:
if type(obj) is LTTextLineHorizontal: for obj in layout._objs:
text.append(obj) if type(obj) is LTTextLineHorizontal:
parse_text_basic(obj) t.append(obj)
except AttributeError: else:
pass t += parse_text_basic(obj)
except AttributeError:
pass
return t
def parse_text_spreadsheet(layout, t=None):
if t is None:
t = []
try:
for obj in layout._objs:
if type(obj) is LTChar:
t.append(obj)
else:
t += parse_text_spreadsheet(obj)
except AttributeError:
pass
return t
def parse_text_spreadsheet(layout):
global text
try:
for obj in layout._objs:
if type(obj) is LTChar:
text.append(obj)
parse_text_spreadsheet(obj)
except AttributeError:
pass
def get_pdf_info(pdfname, method): def get_pdf_info(pdfname, method):
global text with open(pdfname, 'r') as f:
with open(pdfname, 'r') as f: parser = PDFParser(f)
parser = PDFParser(f) document = PDFDocument(parser)
document = PDFDocument(parser) if not document.is_extractable:
if not document.is_extractable: raise PDFTextExtractionNotAllowed
raise PDFTextExtractionNotAllowed laparams = LAParams()
laparams = LAParams() rsrcmgr = PDFResourceManager()
rsrcmgr = PDFResourceManager() device = PDFPageAggregator(rsrcmgr, laparams=laparams)
device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device)
interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.create_pages(document):
for page in PDFPage.create_pages(document): interpreter.process_page(page)
interpreter.process_page(page) layout = device.get_result()
layout = device.get_result() if method == 'basic':
text = [] text = parse_text_basic(layout)
if method == 'basic': elif method == 'spreadsheet':
parse_text_basic(layout) text = parse_text_spreadsheet(layout)
elif method == 'spreadsheet': pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
parse_text_spreadsheet(layout) return text, pdf_x, pdf_y
pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
text.sort(key=lambda x: (-x.y0, x.x0))
return text, pdf_x, pdf_y

View File

@ -1,129 +1,175 @@
import os import os
import csv import csv
import cv2
import glob import glob
import numpy as np import numpy as np
import matplotlib.pyplot as plt
from table import Table from table import Table
from pdf import get_pdf_info from pdf import get_pdf_info
from morph_transform import morph from morph_transform import morph_transform
from utils import (translate, scale, merge_close_values, get_row_idx,
get_column_idx, reduce_index, outline, fill, remove_empty)
def remove_close_values(ar):
ret = []
for a in ar:
if not ret:
ret.append(a)
else:
temp = ret[-1]
if np.isclose(temp, a, atol=2):
pass
else:
ret.append(a)
return ret
def merge_close_values(ar): def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
ret = [] if debug:
for a in ar: import matplotlib.pyplot as plt
if not ret: import matplotlib.patches as patches
ret.append(a) print "working on", filename
else: imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
temp = ret[-1] img = cv2.imread(imagename)
if np.isclose(temp, a, atol=2): img_x, img_y = img.shape[1], img.shape[0]
temp = (temp + a) / 2.0 text, pdf_x, pdf_y = get_pdf_info(
ret[-1] = temp os.path.join(pdf_dir, filename), 'spreadsheet')
else: scaling_factor_x = pdf_x / float(img_x)
ret.append(a) scaling_factor_y = pdf_y / float(img_y)
return ret tables, v_segments, h_segments = morph_transform(imagename, s, invert)
def get_row_idx(t, rows): if debug == ["contours"]:
for r in range(len(rows)): for t in tables.keys():
if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]: cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
return r plt.imshow(img)
if debug == ["joints"]:
x_coord = []
y_coord = []
for k in tables.keys():
for coord in tables[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
def get_column_idx(t, columns): # detect if vertical
for c in range(len(columns)): num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
if abs(t.x0 + t.x1) / 2.0 > columns[c][0] and abs(t.x0 + t.x1) / 2.0 < columns[c][1]: num_h = [t for t in text if t.upright and t.get_text().strip()]
return c vger = len(num_v) / float(len(num_v) + len(num_h))
rotated = ''
if vger > 0.8:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text)
rotated = 'left' if clockwise < anticlockwise else 'right'
def reduce_index(t, r_idx, c_idx): tables_new = {}
if t.cells[r_idx][c_idx].spanning_h: for k in tables.keys():
while not t.cells[r_idx][c_idx].left: x1, y1, x2, y2 = k
c_idx -= 1 x1 = scale(x1, scaling_factor_x)
if t.cells[r_idx][c_idx].spanning_v: y1 = scale(abs(translate(-img_y, y1)), scaling_factor_y)
while not t.cells[r_idx][c_idx].top: x2 = scale(x2, scaling_factor_x)
r_idx -= 1 y2 = scale(abs(translate(-img_y, y2)), scaling_factor_y)
return r_idx, c_idx j_x, j_y = zip(*tables[k])
j_x = [scale(j, scaling_factor_x) for j in j_x]
j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
joints = zip(j_x, j_y)
tables_new[(x1, y1, x2, y2)] = joints
def fill(t, orientation): v_segments_new = []
if orientation == "h": for v in v_segments:
for i in range(len(t.cells)): x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
for j in range(len(t.cells[i])): y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale(
if t.cells[i][j].get_text().strip() == '': abs(translate(-img_y, v[3])), scaling_factor_y)
if t.cells[i][j].spanning_h: v_segments_new.append((x1, y1, x2, y2))
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif orientation == "v":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
elif orientation == "hv":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_h:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
return t
def spreadsheet(pdf_dir, filename, orientation, scale): h_segments_new = []
print "working on", filename for h in h_segments:
imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet') y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale(
tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale) abs(translate(-img_y, h[3])), scaling_factor_y)
h_segments_new.append((x1, y1, x2, y2))
num_tables = 0 num_tables = 0
for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord # sort tables based on y-coord
# find rows and columns that lie in table for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
lb = (k[0], k[1]) # find rows and columns that lie in table
rt = (k[2], k[3]) lb = (k[0], k[1])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] rt = (k[2], k[3])
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] v_s = [v for v in v_segments_new if v[1] > lb[1] - 2 and v[3]
columns, rows = zip(*tables[k]) < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
# sort horizontal and vertical segments h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
columns = merge_close_values(sorted(list(columns))) < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
rows = merge_close_values(sorted(list(rows), reverse=True))
# make grid using x and y coord of shortlisted rows and columns
columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
table = Table(columns, rows) if debug == ["lines"]:
# pass row and column line segments to table method and light up cell edges for v in v_s:
table = table.set_edges(v_s, h_s) plt.plot([v[0], v[2]], [v[1], v[3]])
# table set span method for h in h_s:
table = table.set_spanning() plt.plot([h[0], h[2]], [h[1], h[3]])
# fill text after sorting it
text.sort(key=lambda x: (-x.y0, x.x0))
for t in text: columns, rows = zip(*tables_new[k])
r_idx = get_row_idx(t, rows) columns, rows = list(columns), list(rows)
c_idx = get_column_idx(t, columns) columns.extend([lb[0], rt[0]])
if None in [r_idx, c_idx]: rows.extend([lb[1], rt[1]])
pass # sort horizontal and vertical segments
else: columns = merge_close_values(sorted(columns), mtol)
r_idx, c_idx = reduce_index(table, r_idx, c_idx) rows = merge_close_values(sorted(rows, reverse=True), mtol)
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) # make grid using x and y coord of shortlisted rows and columns
columns = [(columns[i], columns[i + 1])
for i in range(0, len(columns) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
if orientation: table = Table(columns, rows)
table = fill(table, orientation) # light up cell edges
table = table.set_edges(v_s, h_s, jtol)
# table set span method
table = table.set_spanning()
# TODO
table = outline(table)
csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv' if debug == ["tables"]:
csvpath = os.path.join(pdf_dir, csvname) for i in range(len(table.cells)):
with open(csvpath, 'w') as outfile: for j in range(len(table.cells[i])):
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) if table.cells[i][j].left:
for i in range(len(table.cells)): plt.plot([table.cells[i][j].lb[0], table.cells[i][j].lt[0]],
writer.writerow([table.cells[i][j].get_text().strip().encode('utf-8') for j in range(len(table.cells[i]))]) [table.cells[i][j].lb[1], table.cells[i][j].lt[1]])
print "saved as", csvname if table.cells[i][j].right:
print plt.plot([table.cells[i][j].rb[0], table.cells[i][j].rt[0]],
num_tables += 1 [table.cells[i][j].rb[1], table.cells[i][j].rt[1]])
if table.cells[i][j].top:
plt.plot([table.cells[i][j].lt[0], table.cells[i][j].rt[0]],
[table.cells[i][j].lt[1], table.cells[i][j].rt[1]])
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
if debug:
plt.show()
# fill text after sorting it
if not rotated:
text.sort(key=lambda x: (-x.y0, x.x0))
elif rotated == 'left':
text.sort(key=lambda x: (x.x0, x.y0))
elif rotated == 'right':
text.sort(key=lambda x: (-x.x0, -x.y0))
for t in text:
r_idx = get_row_idx(t, rows)
c_idx = get_column_idx(t, columns)
if None in [r_idx, c_idx]:
pass
else:
r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
if fill:
table = fill(table, fill)
data = []
for i in range(len(table.cells)):
data.append([table.cells[i][j].get_text().strip().encode('utf-8')
for j in range(len(table.cells[i]))])
if rotated == 'left':
data = zip(*data[::-1])
elif rotated == 'right':
data = zip(*data[::1])
data.reverse()
data = remove_empty(data)
csvname = filename.split(
'.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
csvpath = os.path.join(pdf_dir, csvname)
with open(csvpath, 'w') as outfile:
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
for d in data:
writer.writerow(d)
print "saved as", csvname
print
num_tables += 1

287
table.py
View File

@ -1,151 +1,152 @@
import numpy as np import numpy as np
from cell import Cell from cell import Cell
class Table: class Table:
def __init__(self, columns, rows):
self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in columns] for r in rows]
self.columns = columns
self.rows = rows
def set_edges(self, vertical, horizontal): def __init__(self, columns, rows):
for v in vertical: self.cells = [[Cell(c[0], r[1], c[1], r[0])
# find closest x coord for c in columns] for r in rows]
# iterate over y coords and find closest points self.columns = columns
i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=2)] self.rows = rows
j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)]
k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)]
if not j:
continue
if i == [0]: # only left edge
if k:
I = i[0]
J = j[0]
K = k[0]
while J < K:
self.cells[J][I].left = True
J += 1
else:
I = i[0]
J = j[0]
K = len(self.rows)
while J < K:
self.cells[J][I].left = True
J += 1
elif i == []: # only right edge
if k:
I = len(self.columns) - 1
J = j[0]
K = k[0]
while J < K:
self.cells[J][I].right = True
J += 1
else:
I = len(self.columns) - 1
J = j[0]
K = len(self.rows)
while J < K:
self.cells[J][I].right = True
J += 1
else: # both left and right edges
if k:
I = i[0]
J = j[0]
K = k[0]
while J < K:
self.cells[J][I].left = True
self.cells[J][I - 1].right = True
J += 1
else:
I = i[0]
J = j[0]
K = len(self.rows)
while J < K:
self.cells[J][I].left = True
self.cells[J][I - 1].right = True
J += 1
for h in horizontal: def set_edges(self, vertical, horizontal, jtol):
# find closest y coord for v in vertical:
# iterate over x coords and find closest points # find closest x coord
i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=2)] # iterate over y coords and find closest points
j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)] i = [i for i, t in enumerate(self.columns)
k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)] if np.isclose(v[0], t[0], atol=jtol)]
if not j: j = [j for j, t in enumerate(self.rows)
continue if np.isclose(v[3], t[0], atol=jtol)]
if i == [0]: # only top edge k = [k for k, t in enumerate(self.rows)
if k: if np.isclose(v[1], t[0], atol=jtol)]
I = i[0] if not j:
J = j[0] continue
K = k[0] J = j[0]
while J < K: if i == [0]: # only left edge
self.cells[I][J].top = True I = i[0]
J += 1 if k:
else: K = k[0]
I = i[0] while J < K:
J = j[0] self.cells[J][I].left = True
K = len(self.columns) J += 1
while J < K: else:
self.cells[I][J].top = True K = len(self.rows)
J += 1 while J < K:
elif i == []: # only bottom edge self.cells[J][I].left = True
if k: J += 1
I = len(self.rows) - 1 elif i == []: # only right edge
J = j[0] I = len(self.columns) - 1
K = k[0] if k:
while J < K: K = k[0]
self.cells[I][J].bottom = True while J < K:
J += 1 self.cells[J][I].right = True
else: J += 1
I = len(self.rows) - 1 else:
J = j[0] K = len(self.rows)
K = len(self.columns) while J < K:
while J < K: self.cells[J][I].right = True
self.cells[I][J].bottom = True J += 1
J += 1 else: # both left and right edges
else: # both top and bottom edges I = i[0]
if k: if k:
I = i[0] K = k[0]
J = j[0] while J < K:
K = k[0] self.cells[J][I].left = True
while J < K: self.cells[J][I - 1].right = True
self.cells[I][J].top = True J += 1
self.cells[I - 1][J].bottom = True else:
J += 1 K = len(self.rows)
else: while J < K:
I = i[0] self.cells[J][I].left = True
J = j[0] self.cells[J][I - 1].right = True
K = len(self.columns) J += 1
while J < K:
self.cells[I][J].top = True
self.cells[I - 1][J].bottom = True
J += 1
return self for h in horizontal:
# find closest y coord
# iterate over x coords and find closest points
i = [i for i, t in enumerate(self.rows)
if np.isclose(h[1], t[0], atol=jtol)]
j = [j for j, t in enumerate(self.columns)
if np.isclose(h[0], t[0], atol=jtol)]
k = [k for k, t in enumerate(self.columns)
if np.isclose(h[2], t[0], atol=jtol)]
if not j:
continue
J = j[0]
if i == [0]: # only top edge
I = i[0]
if k:
K = k[0]
while J < K:
self.cells[I][J].top = True
J += 1
else:
K = len(self.columns)
while J < K:
self.cells[I][J].top = True
J += 1
elif i == []: # only bottom edge
I = len(self.rows) - 1
if k:
K = k[0]
while J < K:
self.cells[I][J].bottom = True
J += 1
else:
K = len(self.columns)
while J < K:
self.cells[I][J].bottom = True
J += 1
else: # both top and bottom edges
I = i[0]
if k:
K = k[0]
while J < K:
self.cells[I][J].top = True
self.cells[I - 1][J].bottom = True
J += 1
else:
K = len(self.columns)
while J < K:
self.cells[I][J].top = True
self.cells[I - 1][J].bottom = True
J += 1
def set_spanning(self): return self
for i in range(len(self.cells)):
for j in range(len(self.cells[i])): def set_spanning(self):
bound = self.cells[i][j].get_bounded_edges() for i in range(len(self.cells)):
if bound == 4: for j in range(len(self.cells[i])):
continue bound = self.cells[i][j].get_bounded_edges()
elif bound == 3: if bound == 4:
if not self.cells[i][j].left: continue
if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
self.cells[i][j].spanning_h = True elif bound == 3:
elif not self.cells[i][j].right: if not self.cells[i][j].left:
if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom: if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
self.cells[i][j].spanning_h = True self.cells[i][j].spanning_h = True
elif not self.cells[i][j].top:
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom: elif not self.cells[i][j].right:
self.cells[i][j].spanning_v = True if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
elif not self.cells[i][j].bottom: self.cells[i][j].spanning_h = True
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
self.cells[i][j].spanning_v = True elif not self.cells[i][j].top:
elif bound == 2: if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
if self.cells[i][j].left and self.cells[i][j].right: self.cells[i][j].spanning_v = True
if not self.cells[i][j].top and not self.cells[i][j].bottom:
self.cells[i][j].spanning_v = True elif not self.cells[i][j].bottom:
elif self.cells[i][j].top and self.cells[i][j].bottom: if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
if not self.cells[i][j].left and not self.cells[i][j].right: self.cells[i][j].spanning_v = True
self.cells[i][j].spanning_h = True
return self elif bound == 2:
if self.cells[i][j].left and self.cells[i][j].right:
if not self.cells[i][j].top and not self.cells[i][j].bottom:
self.cells[i][j].spanning_v = True
elif self.cells[i][j].top and self.cells[i][j].bottom:
if not self.cells[i][j].left and not self.cells[i][j].right:
self.cells[i][j].spanning_h = True
return self

133
utils.py 100644
View File

@ -0,0 +1,133 @@
import numpy as np
def translate(x1, x2):
    """Return ``x2`` shifted by the offset ``x1``.

    The sum is computed out of place, so an array (or other mutable
    object) passed as ``x2`` is never modified.  The previous ``x2 += x1``
    form changed the caller's array in place and raised a casting error
    when an integer array was shifted by a float offset.

    Parameters
    ----------
    x1 : number or array-like
        Offset to add.
    x2 : number or array-like
        Value to be shifted.

    Returns
    -------
    The value ``x2 + x1``.
    """
    return x2 + x1
def scale(x, s):
    """Return ``x`` multiplied by the factor ``s``.

    The product is computed out of place, so an array passed as ``x`` is
    never modified.  The previous ``x *= s`` form changed the caller's
    array in place and raised a casting error when an integer array was
    scaled by a float factor.

    Parameters
    ----------
    x : number or array-like
        Value to be scaled.
    s : number
        Scaling factor.

    Returns
    -------
    The value ``x * s``.
    """
    return x * s
def rotate(x1, y1, x2, y2, angle):
    """Rotate the point ``(x2, y2)`` about ``(x1, y1)`` by ``angle``.

    ``angle`` is passed straight to ``np.sin`` / ``np.cos`` and is
    therefore in radians.  The point is moved so that ``(x1, y1)`` becomes
    the origin, rotated with the standard 2-D rotation
    ``(c*x - s*y, s*x + c*y)``, and moved back.

    All arithmetic is out of place, so arrays passed as ``x2`` / ``y2``
    are left untouched (the earlier version shifted them in place through
    the ``translate`` helper).

    Parameters
    ----------
    x1, y1 : number
        Centre of rotation.
    x2, y2 : number or array-like
        Coordinates of the point(s) to rotate.
    angle : number
        Rotation angle in radians.

    Returns
    -------
    tuple
        ``(xnew, ynew)``, the rotated coordinates.
    """
    s = np.sin(angle)
    c = np.cos(angle)
    # Express the point relative to the centre of rotation.
    dx = x2 - x1
    dy = y2 - y1
    # Rotate, then move back into the original coordinate frame.
    xnew = (c * dx - s * dy) + x1
    ynew = (s * dx + c * dy) + y1
    return xnew, ynew
def remove_close_values(ar, mtol):
    """Drop every value that is close to the last value that was kept.

    Values are visited in the order given.  A value is discarded when
    ``np.isclose(last_kept, value, atol=mtol)`` is true; otherwise it is
    appended to the result and becomes the new reference.

    Parameters
    ----------
    ar : iterable of numbers
        Candidate values.
    mtol : number
        Absolute tolerance handed to ``np.isclose``.

    Returns
    -------
    list
        The retained values, in their original order.
    """
    kept = []
    for value in ar:
        if kept and np.isclose(kept[-1], value, atol=mtol):
            # Too close to the previously kept value: skip it.
            continue
        kept.append(value)
    return kept
def merge_close_values(ar, mtol):
    """Collapse runs of close values into a running midpoint.

    Values are visited in the order given.  When a value is close to the
    last entry of the result (``np.isclose(last, value, atol=mtol)``),
    that entry is replaced by the mean of the two; otherwise the value is
    appended as a new entry.

    Parameters
    ----------
    ar : iterable of numbers
        Candidate values.
    mtol : number
        Absolute tolerance handed to ``np.isclose``.

    Returns
    -------
    list
        The merged values.
    """
    merged = []
    for value in ar:
        if merged and np.isclose(merged[-1], value, atol=mtol):
            # Fold the newcomer into the current group representative.
            merged[-1] = (merged[-1] + value) / 2.0
        else:
            merged.append(value)
    return merged
def get_row_idx(t, rows):
    """Return the index of the row whose band contains ``t``'s y-midpoint.

    Parameters
    ----------
    t : object
        Anything exposing numeric ``y0`` and ``y1`` attributes.
    rows : sequence of pairs
        ``rows[r][0]`` is used as the exclusive upper bound and
        ``rows[r][1]`` as the exclusive lower bound of row ``r``.

    Returns
    -------
    int or None
        Index of the first matching row, or ``None`` when the midpoint
        lies in no row (including exactly on a boundary).
    """
    for idx, band in enumerate(rows):
        if band[1] < (t.y0 + t.y1) / 2.0 < band[0]:
            return idx
    return None
def get_column_idx(t, columns):
    """Return the index of the column whose band contains ``t``'s x-midpoint.

    Parameters
    ----------
    t : object
        Anything exposing numeric ``x0`` and ``x1`` attributes.
    columns : sequence of pairs
        ``columns[c][0]`` is used as the exclusive lower bound and
        ``columns[c][1]`` as the exclusive upper bound of column ``c``.

    Returns
    -------
    int or None
        Index of the first matching column, or ``None`` when the midpoint
        lies in no column (including exactly on a boundary).
    """
    for idx, band in enumerate(columns):
        if band[0] < (t.x0 + t.x1) / 2.0 < band[1]:
            return idx
    return None
def reduce_index(t, rotated, r_idx, c_idx):
if not rotated:
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].left:
c_idx -= 1
if t.cells[r_idx][c_idx].spanning_v:
while not t.cells[r_idx][c_idx].top:
r_idx -= 1
elif rotated == 'left':
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].left:
c_idx -= 1
if t.cells[r_idx][c_idx].spanning_v:
while not t.cells[r_idx][c_idx].bottom:
r_idx += 1
elif rotated == 'right':
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].right:
c_idx += 1
if t.cells[r_idx][c_idx].spanning_v:
while not t.cells[r_idx][c_idx].top:
r_idx -= 1
return r_idx, c_idx
def outline(t):
    """Mark the outer edges of every border cell of ``t`` as present.

    Sets ``left`` on the first cell and ``right`` on the last cell of each
    row, then ``top`` on the cells of the first row and ``bottom`` on the
    cells of the last row.  The number of columns visited for top/bottom
    is taken from the first row.

    Parameters
    ----------
    t : object
        Table-like object with a non-empty ``cells`` grid indexed
        ``[row][col]``.

    Returns
    -------
    The same object ``t``, modified in place.
    """
    grid = t.cells
    for row in grid:
        row[0].left = True
        row[len(row) - 1].right = True

    first_row = grid[0]
    last_row = grid[len(grid) - 1]
    for col in range(len(first_row)):
        first_row[col].top = True
        last_row[col].bottom = True
    return t
def fill(t, f):
    """Copy neighbouring text into empty spanning cells of ``t``.

    Cells are visited row by row, left to right.  A cell whose stripped
    text is empty receives:

    * the text of the cell to its left, when ``f`` enables horizontal
      filling and the cell is flagged ``spanning_h``;
    * otherwise the text of the cell above it, when ``f`` enables
      vertical filling and the cell is flagged ``spanning_v``.

    In ``"hv"`` mode the horizontal rule takes precedence, as before.

    A cell in the first column has no left neighbour and a cell in the
    first row has no upper neighbour; such cells are left empty.  (The
    earlier code indexed ``j - 1`` / ``i - 1`` unconditionally, which
    wrapped around to the last column / last row.)

    Parameters
    ----------
    t : object
        Table-like object with a ``cells`` grid indexed ``[row][col]``;
        cells expose ``get_text()``, ``add_text(text)``, ``spanning_h``
        and ``spanning_v``.
    f : str
        ``"h"``, ``"v"`` or ``"hv"``.  Any other value is a no-op.

    Returns
    -------
    The same object ``t``, modified in place.
    """
    if f not in ("h", "v", "hv"):
        return t
    horizontal = "h" in f
    vertical = "v" in f

    for i, row in enumerate(t.cells):
        for j, cell in enumerate(row):
            if cell.get_text().strip() != '':
                continue
            if horizontal and cell.spanning_h:
                # j == 0 has no left neighbour; row[-1] would be the far end.
                if j > 0:
                    cell.add_text(row[j - 1].get_text())
            elif vertical and cell.spanning_v:
                # i == 0 has no upper neighbour; cells[-1] would be the last row.
                if i > 0:
                    cell.add_text(t.cells[i - 1][j].get_text())
    return t
def remove_empty(d):
    """Return ``d`` without its empty rows and empty columns.

    A row is dropped when every one of its entries equals ``''``.  After
    that the data is transposed, every column with no truthy entry is
    dropped, and the data is transposed back.

    Differences from the earlier implementation:

    * rows are filtered into a new list instead of being popped while the
      list is enumerated, which skipped the row following each removal
      (so only one of two consecutive empty rows was removed);
    * rows may be tuples as well as lists (a tuple never compared equal
      to the ``['', ...]`` list used before);
    * the caller's list is not modified;
    * a real list is returned on every Python version.

    Parameters
    ----------
    d : sequence of sequences
        Rectangular table data, one inner sequence per row.

    Returns
    -------
    list of tuple
        The remaining rows.
    """
    kept_rows = [row for row in d if not all(cell == '' for cell in row)]
    # Transpose to work column-wise, keep columns holding any content.
    kept_columns = [list(col) for col in zip(*kept_rows) if any(col)]
    # Transpose back to row-major order.
    return list(zip(*kept_columns))