Make code PEP8 compliant

pull/2/head
Vinayak Mehta 2016-07-11 15:19:38 +05:30
parent f6869a9af4
commit b87d2350dc
9 changed files with 765 additions and 489 deletions

View File

@ -1,30 +1,70 @@
Camelot Camelot
------- -------
usage: python2 camelot.py [options] pdf_file Description: Parse tables from pdfs!
Parse yo pdf! Dependencies
Install
Usage: python2 camelot.py [options] file
positional arguments: positional arguments:
file file
optional arguments: optional arguments:
-h, --help show this help message and exit
-p PAGES [PAGES ...] Specify the page numbers and/or page ranges to be -h, --help
parsed. Example: -p="1 3-5 9", -p="all" (default:
-p="1")
-f FORMAT Output format (csv/xlsx). Example: -f="xlsx" (default: show this help message and exit
-f="csv")
-spreadsheet Extract data stored in pdfs with ruling lines. -p, --pages PAGES [PAGES ...]
(default: False)
-F ORIENTATION Fill the values in empty cells. Example: -F="h", Specify the page numbers and/or page ranges to be
-F="v", -F="hv" (default: None) parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-s [SCALE] Scaling factor. Large scaling factor leads to smaller -f, --format FORMAT
Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-m, --spreadsheet
Extract tables with ruling lines. (default: False)
-F, --fill FILL
Fill the values in empty cells horizontally(h) and/or
vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-s, --scale [SCALE]
Scaling factor. Large scaling factor leads to smaller
lines being detected. (default: 15) lines being detected. (default: 15)
Under construction... -j, --jtol [JTOL]
Tolerance to account for when comparing joint and line
coordinates. (default: 2)
-M, --mtol [MTOL]
Tolerance to account for when merging lines which are
very close. (default: 2)
-i, --invert
Make sure lines are in foreground. (default: False)
-d, --debug DEBUG
Debug by visualizing contours, lines, joints, tables.
Example: --debug="contours"
-o, --output OUTPUT
Specify output directory.
Development: Code, Contributing, Tests
License

View File

@ -4,6 +4,7 @@ import numpy as np
from pdf import get_pdf_info from pdf import get_pdf_info
def overlap(l): def overlap(l):
merged = [] merged = []
for higher in l: for higher in l:
@ -11,7 +12,7 @@ def overlap(l):
merged.append(higher) merged.append(higher)
else: else:
lower = merged[-1] lower = merged[-1]
if higher[0] >= lower[0] and higher[1] <= lower[1]: if higher[0] <= lower[1]:
upper_bound = max(lower[1], higher[1]) upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0]) lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound) merged[-1] = (lower_bound, upper_bound)
@ -19,40 +20,60 @@ def overlap(l):
merged.append(higher) merged.append(higher)
return merged return merged
def get_row_idx(t, rows): def get_row_idx(t, rows):
for r in range(len(rows)): for r in range(len(rows)):
if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]: if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
return r return r
def get_column_idx(t, columns): def get_column_idx(t, columns):
for c in range(len(columns)): for c in range(len(columns)):
if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]: if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
return c return c
def basic(pdf_dir, filename): def basic(pdf_dir, filename):
print "working on", filename print "working on", filename
text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic') text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic')
rows, columns = [], [] text.sort(key=lambda x: (-x.y0, x.x0))
y_last = 0
data = []
temp = []
elements = []
for t in text: for t in text:
rows.append((t.y1, t.y0)) # is checking for upright necessary?
columns.append((t.x0, t.x1)) # if t.get_text().strip() and all([obj.upright for obj in t._objs if
rows = list(set(rows)) # type(obj) is LTChar]):
rows = sorted(rows, reverse=True) if t.get_text().strip():
columns = list(set(columns)) if not np.isclose(y_last, t.y0, atol=2):
columns = sorted(columns) y_last = t.y0
columns = overlap(columns) elements.append(len(temp))
table = [['' for c in columns] for r in rows] data.append(temp)
for t in text: temp = []
r_idx = get_row_idx(t, rows) temp.append(t)
c_idx = get_column_idx(t, columns) # a table can't have just 1 column, can it?
if None in [r_idx, c_idx]: elements = filter(lambda x: x != 1, elements)
print t # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count)
mode = max(set(elements), key=elements.count)
columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
columns = overlap(sorted(columns))
columns = [(c[0] + c[1]) / 2.0 for c in columns]
output = [['' for c in columns] for d in data]
for row, d in enumerate(data):
for t in d:
cog = (t.x0 + t.x1) / 2.0
diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
idx = min(diff, key=lambda x: x[1])
if output[row][idx[0]]:
output[row][idx[0]] += ' ' + t.get_text().strip()
else: else:
table[r_idx][c_idx] = t.get_text().strip('\n') output[row][idx[0]] = t.get_text().strip()
csvname = filename.split('.')[0] + '.csv' csvname = filename.split('.')[0] + '.csv'
csvpath = os.path.join(pdf_dir, csvname) csvpath = os.path.join(pdf_dir, csvname)
with open(csvpath, 'w') as outfile: with open(csvpath, 'w') as outfile:
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
for cell in table: for row in output:
writer.writerow([ce for ce in cell]) writer.writerow([cell.encode('utf-8') for cell in row])

View File

@ -12,10 +12,12 @@ from spreadsheet import spreadsheet
pno = re.compile(r'\d+') pno = re.compile(r'\d+')
def mkdir(directory): def mkdir(directory):
if not os.path.isdir(directory): if not os.path.isdir(directory):
os.makedirs(directory) os.makedirs(directory)
def filesort(filename): def filesort(filename):
filename = filename.split('/')[-1] filename = filename.split('/')[-1]
num = pno.findall(filename) num = pno.findall(filename)
@ -28,12 +30,28 @@ start_time = time.time()
CAMELOT_DIR = '.camelot/' CAMELOT_DIR = '.camelot/'
mkdir(CAMELOT_DIR) mkdir(CAMELOT_DIR)
parser = argparse.ArgumentParser(description='Parse yo pdf!', usage='python2 camelot.py [options] pdf_file') parser = argparse.ArgumentParser(
parser.add_argument('-p', nargs='+', action='store', dest='pages', help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: -p="1")') description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
parser.add_argument('-f', nargs=1, action='store', dest='format', help='Output format (csv/xlsx). Example: -f="xlsx" (default: -f="csv")', default=["csv"]) parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
parser.add_argument('-spreadsheet', action='store_true', dest='spreadsheet', help='Extract data stored in pdfs with ruling lines. (default: False)') help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
parser.add_argument('-F', action='store', dest='orientation', help='Fill the values in empty cells. Example: -F="h", -F="v", -F="hv" (default: None)', default=None) parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
parser.add_argument('-s', nargs='?', action='store', dest='scale', help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
parser.add_argument('-m', '--spreadsheet', action='store_true', dest='spreadsheet',
help='Extract tables with ruling lines. (default: False)')
parser.add_argument('-F', '--fill', action='store', dest='fill',
help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
parser.add_argument('-s', '--scale', nargs='?', action='store', dest='scale',
help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
parser.add_argument('-j', '--jtol', nargs='?', action='store',
dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
parser.add_argument('-M', '--mtol', nargs='?', action='store',
dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
parser.add_argument('-i', '--invert', action='store_true', dest='invert',
help='Make sure lines are in foreground. (default: False)')
parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
help='Specify output directory.')
parser.add_argument('file', nargs=1) parser.add_argument('file', nargs=1)
result = parser.parse_args() result = parser.parse_args()
@ -54,28 +72,36 @@ else:
p = ['1'] p = ['1']
p = sorted(set(p)) p = sorted(set(p))
s = result.spreadsheet
pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
mkdir(pdf_dir)
filename = result.file[0].split('/')[-1] filename = result.file[0].split('/')[-1]
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[0] + '.log'), filemode='w', level=logging.DEBUG) # pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
mkdir(pdf_dir)
logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
0] + '.log'), filemode='w', level=logging.DEBUG)
shutil.copy(result.file[0], os.path.join(pdf_dir, filename)) shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
print "separating pdf into pages" print "separating pdf into pages"
print print
if p == ['all']: if p == ['all']:
subprocess.call(['pdfseparate', os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')]) subprocess.call(['pdfseparate', os.path.join(
pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
else: else:
for page in p: for page in p:
subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
if s: if result.spreadsheet:
print "using the spreadsheet method" print "using the spreadsheet method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
print "converting", g.split('/')[-1], "to image" print "converting", g.split('/')[-1], "to image"
os.system(' '.join(['convert', '-density', '300', g, '-depth', '8', g[:-4] + '.png'])) os.system(' '.join(['convert', '-density', '300',
spreadsheet(pdf_dir, g.split('/')[-1], result.orientation, result.scale) g, '-depth', '8', g[:-4] + '.png']))
try:
spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
result.jtol, result.mtol, result.invert, result.debug)
except:
logging.error("Couldn't parse " + g.split('/')[-1])
print "Couldn't parse", g.split('/')[-1]
else: else:
print "using the basic method" print "using the basic method"
for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
@ -90,7 +116,8 @@ if result.format == ['xlsx']:
print "adding", c.split('/')[-1], "to excel file" print "adding", c.split('/')[-1], "to excel file"
with open(c, 'r') as csvfile: with open(c, 'r') as csvfile:
reader = csv.reader(csvfile) reader = csv.reader(csvfile)
data.update({c.split('/')[-1].split('.')[0]: [row for row in reader]}) data.update({c.split('/')[-1].split('.')
[0]: [row for row in reader]})
xlsxname = filename.split('.')[0] + '.xlsx' xlsxname = filename.split('.')[0] + '.xlsx'
xlsxpath = os.path.join(pdf_dir, xlsxname) xlsxpath = os.path.join(pdf_dir, xlsxname)
save_data(xlsxpath, data) save_data(xlsxpath, data)
@ -98,4 +125,5 @@ if result.format == ['xlsx']:
print "saved as", xlsxname print "saved as", xlsxname
print "finished in", time.time() - start_time, "seconds" print "finished in", time.time() - start_time, "seconds"
logging.info("Time taken for " + filename + ": " + str(time.time() - start_time) + " seconds") logging.info("Time taken for " + filename + ": " +
str(time.time() - start_time) + " seconds")

View File

@ -1,4 +1,5 @@
class Cell: class Cell:
def __init__(self, x1, y1, x2, y2): def __init__(self, x1, y1, x2, y2):
self.lb = (x1, y1) self.lb = (x1, y1)
self.lt = (x1, y2) self.lt = (x1, y2)

View File

@ -1,20 +1,20 @@
import cv2 import cv2
import numpy as np import numpy as np
def transform(x, y, img_x, img_y, pdf_x, pdf_y):
x *= pdf_x / float(img_x)
y = abs(y - img_y)
y *= pdf_y / float(img_y)
return x, y
def morph_transform(imagename, s, invert):
# http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
def morph(imagename, p_x, p_y, s):
img = cv2.imread(imagename) img = cv2.imread(imagename)
img_x, img_y = img.shape[1], img.shape[0] img_x, img_y = img.shape[1], img.shape[0]
pdf_x, pdf_y = p_x, p_y
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# empirical result taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf # empirical result taken from
threshold = cv2.adaptiveThreshold(np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
if invert:
threshold = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
else:
threshold = cv2.adaptiveThreshold(np.invert(
gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
vertical = threshold vertical = threshold
horizontal = threshold horizontal = threshold
@ -33,41 +33,43 @@ def morph(imagename, p_x, p_y, s):
mask = vertical + horizontal mask = vertical + horizontal
joints = np.bitwise_and(vertical, horizontal) joints = np.bitwise_and(vertical, horizontal)
_, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) _, contours, _ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
tables = {} tables = {}
for c in contours: for c in contours:
c_poly = cv2.approxPolyDP(c, 3, True) c_poly = cv2.approxPolyDP(c, 3, True)
x, y, w, h = cv2.boundingRect(c_poly) x, y, w, h = cv2.boundingRect(c_poly)
# find number of non-zero values in joints using what boundingRect returns # find number of non-zero values in joints using what boundingRect
# returns
roi = joints[y:y + h, x:x + w] roi = joints[y:y + h, x:x + w]
_, jc, _ = cv2.findContours(roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) _, jc, _ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4: # remove contours with 4 or fewer joints if len(jc) <= 4: # remove contours with 4 or fewer joints
continue continue
joint_coords = [] joint_coords = []
for j in jc: for j in jc:
jx, jy, jw, jh = cv2.boundingRect(j) jx, jy, jw, jh = cv2.boundingRect(j)
c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2 c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
c1, c2 = transform(c1, c2, img_x, img_y, pdf_x, pdf_y)
joint_coords.append((c1, c2)) joint_coords.append((c1, c2))
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) tables[(x, y + h, x + w, y)] = joint_coords
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y)
tables[(x1, y2, x2, y1)] = joint_coords
v_segments, h_segments = [], [] v_segments, h_segments = [], []
_, vcontours, _ = cv2.findContours(vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) _, vcontours, _ = cv2.findContours(
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for vc in vcontours: for vc in vcontours:
x, y, w, h = cv2.boundingRect(vc) x, y, w, h = cv2.boundingRect(vc)
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) x1, x2 = x, x + w
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) y1, y2 = y, y + h
v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
_, hcontours, _ = cv2.findContours(horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) _, hcontours, _ = cv2.findContours(
horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for hc in hcontours: for hc in hcontours:
x, y, w, h = cv2.boundingRect(hc) x, y, w, h = cv2.boundingRect(hc)
x1, y1 = transform(x, y, img_x, img_y, pdf_x, pdf_y) x1, x2 = x, x + w
x2, y2 = transform(x + w, y + h, img_x, img_y, pdf_x, pdf_y) y1, y2 = y, y + h
h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
return tables, v_segments, h_segments return tables, v_segments, h_segments

32
pdf.py
View File

@ -8,30 +8,36 @@ from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal
text = []
def parse_text_basic(layout): def parse_text_basic(layout, t=None):
global text if t is None:
t = []
try: try:
for obj in layout._objs: for obj in layout._objs:
if type(obj) is LTTextLineHorizontal: if type(obj) is LTTextLineHorizontal:
text.append(obj) t.append(obj)
parse_text_basic(obj) else:
t += parse_text_basic(obj)
except AttributeError: except AttributeError:
pass pass
return t
def parse_text_spreadsheet(layout):
global text def parse_text_spreadsheet(layout, t=None):
if t is None:
t = []
try: try:
for obj in layout._objs: for obj in layout._objs:
if type(obj) is LTChar: if type(obj) is LTChar:
text.append(obj) t.append(obj)
parse_text_spreadsheet(obj) else:
t += parse_text_spreadsheet(obj)
except AttributeError: except AttributeError:
pass pass
return t
def get_pdf_info(pdfname, method): def get_pdf_info(pdfname, method):
global text
with open(pdfname, 'r') as f: with open(pdfname, 'r') as f:
parser = PDFParser(f) parser = PDFParser(f)
document = PDFDocument(parser) document = PDFDocument(parser)
@ -44,11 +50,9 @@ def get_pdf_info(pdfname, method):
for page in PDFPage.create_pages(document): for page in PDFPage.create_pages(document):
interpreter.process_page(page) interpreter.process_page(page)
layout = device.get_result() layout = device.get_result()
text = []
if method == 'basic': if method == 'basic':
parse_text_basic(layout) text = parse_text_basic(layout)
elif method == 'spreadsheet': elif method == 'spreadsheet':
parse_text_spreadsheet(layout) text = parse_text_spreadsheet(layout)
pdf_x, pdf_y = layout.bbox[2], layout.bbox[3] pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
text.sort(key=lambda x: (-x.y0, x.x0))
return text, pdf_x, pdf_y return text, pdf_x, pdf_y

View File

@ -1,110 +1,145 @@
import os import os
import csv import csv
import cv2
import glob import glob
import numpy as np import numpy as np
import matplotlib.pyplot as plt
from table import Table from table import Table
from pdf import get_pdf_info from pdf import get_pdf_info
from morph_transform import morph from morph_transform import morph_transform
from utils import (translate, scale, merge_close_values, get_row_idx,
get_column_idx, reduce_index, outline, fill, remove_empty)
def remove_close_values(ar):
ret = []
for a in ar:
if not ret:
ret.append(a)
else:
temp = ret[-1]
if np.isclose(temp, a, atol=2):
pass
else:
ret.append(a)
return ret
def merge_close_values(ar): def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug):
ret = [] if debug:
for a in ar: import matplotlib.pyplot as plt
if not ret: import matplotlib.patches as patches
ret.append(a)
else:
temp = ret[-1]
if np.isclose(temp, a, atol=2):
temp = (temp + a) / 2.0
ret[-1] = temp
else:
ret.append(a)
return ret
def get_row_idx(t, rows):
for r in range(len(rows)):
if abs(t.y0 + t.y1) / 2.0 < rows[r][0] and abs(t.y0 + t.y1) / 2.0 > rows[r][1]:
return r
def get_column_idx(t, columns):
for c in range(len(columns)):
if abs(t.x0 + t.x1) / 2.0 > columns[c][0] and abs(t.x0 + t.x1) / 2.0 < columns[c][1]:
return c
def reduce_index(t, r_idx, c_idx):
if t.cells[r_idx][c_idx].spanning_h:
while not t.cells[r_idx][c_idx].left:
c_idx -= 1
if t.cells[r_idx][c_idx].spanning_v:
while not t.cells[r_idx][c_idx].top:
r_idx -= 1
return r_idx, c_idx
def fill(t, orientation):
if orientation == "h":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_h:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif orientation == "v":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
elif orientation == "hv":
for i in range(len(t.cells)):
for j in range(len(t.cells[i])):
if t.cells[i][j].get_text().strip() == '':
if t.cells[i][j].spanning_h:
t.cells[i][j].add_text(t.cells[i][j - 1].get_text())
elif t.cells[i][j].spanning_v:
t.cells[i][j].add_text(t.cells[i - 1][j].get_text())
return t
def spreadsheet(pdf_dir, filename, orientation, scale):
print "working on", filename print "working on", filename
imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet') img = cv2.imread(imagename)
tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale) img_x, img_y = img.shape[1], img.shape[0]
text, pdf_x, pdf_y = get_pdf_info(
os.path.join(pdf_dir, filename), 'spreadsheet')
scaling_factor_x = pdf_x / float(img_x)
scaling_factor_y = pdf_y / float(img_y)
tables, v_segments, h_segments = morph_transform(imagename, s, invert)
if debug == ["contours"]:
for t in tables.keys():
cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
plt.imshow(img)
if debug == ["joints"]:
x_coord = []
y_coord = []
for k in tables.keys():
for coord in tables[k]:
x_coord.append(coord[0])
y_coord.append(coord[1])
max_x, max_y = max(x_coord), max(y_coord)
plt.plot(x_coord, y_coord, 'ro')
plt.axis([0, max_x + 100, max_y + 100, 0])
plt.imshow(img)
# detect if vertical
num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
num_h = [t for t in text if t.upright and t.get_text().strip()]
vger = len(num_v) / float(len(num_v) + len(num_h))
rotated = ''
if vger > 0.8:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in text)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in text)
rotated = 'left' if clockwise < anticlockwise else 'right'
tables_new = {}
for k in tables.keys():
x1, y1, x2, y2 = k
x1 = scale(x1, scaling_factor_x)
y1 = scale(abs(translate(-img_y, y1)), scaling_factor_y)
x2 = scale(x2, scaling_factor_x)
y2 = scale(abs(translate(-img_y, y2)), scaling_factor_y)
j_x, j_y = zip(*tables[k])
j_x = [scale(j, scaling_factor_x) for j in j_x]
j_y = [scale(abs(translate(-img_y, j)), scaling_factor_y) for j in j_y]
joints = zip(j_x, j_y)
tables_new[(x1, y1, x2, y2)] = joints
v_segments_new = []
for v in v_segments:
x1, x2 = scale(v[0], scaling_factor_x), scale(v[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, v[1])), scaling_factor_y), scale(
abs(translate(-img_y, v[3])), scaling_factor_y)
v_segments_new.append((x1, y1, x2, y2))
h_segments_new = []
for h in h_segments:
x1, x2 = scale(h[0], scaling_factor_x), scale(h[2], scaling_factor_x)
y1, y2 = scale(abs(translate(-img_y, h[1])), scaling_factor_y), scale(
abs(translate(-img_y, h[3])), scaling_factor_y)
h_segments_new.append((x1, y1, x2, y2))
num_tables = 0 num_tables = 0
for k in sorted(tables.keys(), key=lambda x: x[1], reverse=True): # sort tables based on y-coord # sort tables based on y-coord
for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
# find rows and columns that lie in table # find rows and columns that lie in table
lb = (k[0], k[1]) lb = (k[0], k[1])
rt = (k[2], k[3]) rt = (k[2], k[3])
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2] v_s = [v for v in v_segments_new if v[1] > lb[1] - 2 and v[3]
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
columns, rows = zip(*tables[k]) h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
< rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
if debug == ["lines"]:
for v in v_s:
plt.plot([v[0], v[2]], [v[1], v[3]])
for h in h_s:
plt.plot([h[0], h[2]], [h[1], h[3]])
columns, rows = zip(*tables_new[k])
columns, rows = list(columns), list(rows)
columns.extend([lb[0], rt[0]])
rows.extend([lb[1], rt[1]])
# sort horizontal and vertical segments # sort horizontal and vertical segments
columns = merge_close_values(sorted(list(columns))) columns = merge_close_values(sorted(columns), mtol)
rows = merge_close_values(sorted(list(rows), reverse=True)) rows = merge_close_values(sorted(rows, reverse=True), mtol)
# make grid using x and y coord of shortlisted rows and columns # make grid using x and y coord of shortlisted rows and columns
columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)] columns = [(columns[i], columns[i + 1])
for i in range(0, len(columns) - 1)]
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)] rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
table = Table(columns, rows) table = Table(columns, rows)
# pass row and column line segments to table method and light up cell edges # light up cell edges
table = table.set_edges(v_s, h_s) table = table.set_edges(v_s, h_s, jtol)
# table set span method # table set span method
table = table.set_spanning() table = table.set_spanning()
# TODO
table = outline(table)
if debug == ["tables"]:
for i in range(len(table.cells)):
for j in range(len(table.cells[i])):
if table.cells[i][j].left:
plt.plot([table.cells[i][j].lb[0], table.cells[i][j].lt[0]],
[table.cells[i][j].lb[1], table.cells[i][j].lt[1]])
if table.cells[i][j].right:
plt.plot([table.cells[i][j].rb[0], table.cells[i][j].rt[0]],
[table.cells[i][j].rb[1], table.cells[i][j].rt[1]])
if table.cells[i][j].top:
plt.plot([table.cells[i][j].lt[0], table.cells[i][j].rt[0]],
[table.cells[i][j].lt[1], table.cells[i][j].rt[1]])
if table.cells[i][j].bottom:
plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
[table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
if debug:
plt.show()
# fill text after sorting it # fill text after sorting it
if not rotated:
text.sort(key=lambda x: (-x.y0, x.x0)) text.sort(key=lambda x: (-x.y0, x.x0))
elif rotated == 'left':
text.sort(key=lambda x: (x.x0, x.y0))
elif rotated == 'right':
text.sort(key=lambda x: (-x.x0, -x.y0))
for t in text: for t in text:
r_idx = get_row_idx(t, rows) r_idx = get_row_idx(t, rows)
@ -112,18 +147,29 @@ def spreadsheet(pdf_dir, filename, orientation, scale):
if None in [r_idx, c_idx]: if None in [r_idx, c_idx]:
pass pass
else: else:
r_idx, c_idx = reduce_index(table, r_idx, c_idx) r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
if orientation: if fill:
table = fill(table, orientation) table = fill(table, fill)
csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv' data = []
for i in range(len(table.cells)):
data.append([table.cells[i][j].get_text().strip().encode('utf-8')
for j in range(len(table.cells[i]))])
if rotated == 'left':
data = zip(*data[::-1])
elif rotated == 'right':
data = zip(*data[::1])
data.reverse()
data = remove_empty(data)
csvname = filename.split(
'.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
csvpath = os.path.join(pdf_dir, csvname) csvpath = os.path.join(pdf_dir, csvname)
with open(csvpath, 'w') as outfile: with open(csvpath, 'w') as outfile:
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
for i in range(len(table.cells)): for d in data:
writer.writerow([table.cells[i][j].get_text().strip().encode('utf-8') for j in range(len(table.cells[i]))]) writer.writerow(d)
print "saved as", csvname print "saved as", csvname
print print
num_tables += 1 num_tables += 1

View File

@ -1,63 +1,62 @@
import numpy as np import numpy as np
from cell import Cell from cell import Cell
class Table: class Table:
def __init__(self, columns, rows): def __init__(self, columns, rows):
self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in columns] for r in rows] self.cells = [[Cell(c[0], r[1], c[1], r[0])
for c in columns] for r in rows]
self.columns = columns self.columns = columns
self.rows = rows self.rows = rows
def set_edges(self, vertical, horizontal): def set_edges(self, vertical, horizontal, jtol):
for v in vertical: for v in vertical:
# find closest x coord # find closest x coord
# iterate over y coords and find closest points # iterate over y coords and find closest points
i = [i for i, t in enumerate(self.columns) if np.isclose(v[0], t[0], atol=2)] i = [i for i, t in enumerate(self.columns)
j = [j for j, t in enumerate(self.rows) if np.isclose(v[3], t[0], atol=2)] if np.isclose(v[0], t[0], atol=jtol)]
k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=2)] j = [j for j, t in enumerate(self.rows)
if np.isclose(v[3], t[0], atol=jtol)]
k = [k for k, t in enumerate(self.rows)
if np.isclose(v[1], t[0], atol=jtol)]
if not j: if not j:
continue continue
if i == [0]: # only left edge
if k:
I = i[0]
J = j[0] J = j[0]
if i == [0]: # only left edge
I = i[0]
if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][I].left = True
J += 1 J += 1
else: else:
I = i[0]
J = j[0]
K = len(self.rows) K = len(self.rows)
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][I].left = True
J += 1 J += 1
elif i == []: # only right edge elif i == []: # only right edge
if k:
I = len(self.columns) - 1 I = len(self.columns) - 1
J = j[0] if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[J][I].right = True self.cells[J][I].right = True
J += 1 J += 1
else: else:
I = len(self.columns) - 1
J = j[0]
K = len(self.rows) K = len(self.rows)
while J < K: while J < K:
self.cells[J][I].right = True self.cells[J][I].right = True
J += 1 J += 1
else: # both left and right edges else: # both left and right edges
if k:
I = i[0] I = i[0]
J = j[0] if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][I].left = True
self.cells[J][I - 1].right = True self.cells[J][I - 1].right = True
J += 1 J += 1
else: else:
I = i[0]
J = j[0]
K = len(self.rows) K = len(self.rows)
while J < K: while J < K:
self.cells[J][I].left = True self.cells[J][I].left = True
@ -67,53 +66,48 @@ class Table:
for h in horizontal: for h in horizontal:
# find closest y coord # find closest y coord
# iterate over x coords and find closest points # iterate over x coords and find closest points
i = [i for i, t in enumerate(self.rows) if np.isclose(h[1], t[0], atol=2)] i = [i for i, t in enumerate(self.rows)
j = [j for j, t in enumerate(self.columns) if np.isclose(h[0], t[0], atol=2)] if np.isclose(h[1], t[0], atol=jtol)]
k = [k for k, t in enumerate(self.columns) if np.isclose(h[2], t[0], atol=2)] j = [j for j, t in enumerate(self.columns)
if np.isclose(h[0], t[0], atol=jtol)]
k = [k for k, t in enumerate(self.columns)
if np.isclose(h[2], t[0], atol=jtol)]
if not j: if not j:
continue continue
if i == [0]: # only top edge
if k:
I = i[0]
J = j[0] J = j[0]
if i == [0]: # only top edge
I = i[0]
if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[I][J].top = True self.cells[I][J].top = True
J += 1 J += 1
else: else:
I = i[0]
J = j[0]
K = len(self.columns) K = len(self.columns)
while J < K: while J < K:
self.cells[I][J].top = True self.cells[I][J].top = True
J += 1 J += 1
elif i == []: # only bottom edge elif i == []: # only bottom edge
if k:
I = len(self.rows) - 1 I = len(self.rows) - 1
J = j[0] if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[I][J].bottom = True self.cells[I][J].bottom = True
J += 1 J += 1
else: else:
I = len(self.rows) - 1
J = j[0]
K = len(self.columns) K = len(self.columns)
while J < K: while J < K:
self.cells[I][J].bottom = True self.cells[I][J].bottom = True
J += 1 J += 1
else: # both top and bottom edges else: # both top and bottom edges
if k:
I = i[0] I = i[0]
J = j[0] if k:
K = k[0] K = k[0]
while J < K: while J < K:
self.cells[I][J].top = True self.cells[I][J].top = True
self.cells[I - 1][J].bottom = True self.cells[I - 1][J].bottom = True
J += 1 J += 1
else: else:
I = i[0]
J = j[0]
K = len(self.columns) K = len(self.columns)
while J < K: while J < K:
self.cells[I][J].top = True self.cells[I][J].top = True
@ -128,24 +122,31 @@ class Table:
bound = self.cells[i][j].get_bounded_edges() bound = self.cells[i][j].get_bounded_edges()
if bound == 4: if bound == 4:
continue continue
elif bound == 3: elif bound == 3:
if not self.cells[i][j].left: if not self.cells[i][j].left:
if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom: if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
self.cells[i][j].spanning_h = True self.cells[i][j].spanning_h = True
elif not self.cells[i][j].right: elif not self.cells[i][j].right:
if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom: if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
self.cells[i][j].spanning_h = True self.cells[i][j].spanning_h = True
elif not self.cells[i][j].top: elif not self.cells[i][j].top:
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom: if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
self.cells[i][j].spanning_v = True self.cells[i][j].spanning_v = True
elif not self.cells[i][j].bottom: elif not self.cells[i][j].bottom:
if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top: if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
self.cells[i][j].spanning_v = True self.cells[i][j].spanning_v = True
elif bound == 2: elif bound == 2:
if self.cells[i][j].left and self.cells[i][j].right: if self.cells[i][j].left and self.cells[i][j].right:
if not self.cells[i][j].top and not self.cells[i][j].bottom: if not self.cells[i][j].top and not self.cells[i][j].bottom:
self.cells[i][j].spanning_v = True self.cells[i][j].spanning_v = True
elif self.cells[i][j].top and self.cells[i][j].bottom: elif self.cells[i][j].top and self.cells[i][j].bottom:
if not self.cells[i][j].left and not self.cells[i][j].right: if not self.cells[i][j].left and not self.cells[i][j].right:
self.cells[i][j].spanning_h = True self.cells[i][j].spanning_h = True
return self return self

133
utils.py 100644
View File

@ -0,0 +1,133 @@
import numpy as np
def translate(x1, x2):
    """Return ``x2`` shifted by the offset ``x1``.

    The sum is computed out of place, so a mutable ``x2`` supplied by the
    caller (for example a numpy array) is never modified; only the returned
    value carries the translation.
    """
    return x2 + x1
def scale(x, s):
    """Return ``x`` multiplied by the scaling factor ``s``.

    The product is computed out of place, so a mutable ``x`` supplied by the
    caller (for example a numpy array) is never modified; only the returned
    value is scaled.
    """
    return x * s
def rotate(x1, y1, x2, y2, angle):
    """Rotate the point (x2, y2) about the pivot (x1, y1).

    ``angle`` is handed directly to ``np.sin`` / ``np.cos`` (radians).
    Returns the rotated coordinates as an ``(xnew, ynew)`` tuple.
    """
    sin_a = np.sin(angle)
    cos_a = np.cos(angle)
    # Express the point relative to the pivot.
    dx = translate(-x1, x2)
    dy = translate(-y1, y2)
    # Apply the 2-D rotation matrix.
    rot_x = cos_a * dx - sin_a * dy
    rot_y = sin_a * dx + cos_a * dy
    # Shift back into the original coordinate frame.
    return translate(x1, rot_x), translate(y1, rot_y)
def remove_close_values(ar, mtol):
    """Drop values that lie close to the previously kept value.

    Walks ``ar`` in order and keeps a value only when ``np.isclose`` (with
    absolute tolerance ``mtol``) reports it as different from the last value
    that was kept. Returns the kept values as a new list.
    """
    kept = []
    for value in ar:
        if kept and np.isclose(kept[-1], value, atol=mtol):
            # Too close to the last kept value: discard it.
            continue
        kept.append(value)
    return kept
def merge_close_values(ar, mtol):
    """Average together consecutive values that lie close to each other.

    Walks ``ar`` in order. When a value is within ``np.isclose`` tolerance
    (absolute tolerance ``mtol``) of the last entry of the result, that entry
    is replaced by the mean of the two; otherwise the value is appended.
    Returns the merged values as a new list.
    """
    merged = []
    for value in ar:
        if merged and np.isclose(merged[-1], value, atol=mtol):
            # Fold the value into the running entry.
            merged[-1] = (merged[-1] + value) / 2.0
            continue
        merged.append(value)
    return merged
def get_row_idx(t, rows):
    """Return the index of the row that contains the vertical middle of ``t``.

    ``t`` must expose ``y0`` and ``y1``; each entry of ``rows`` is indexed as
    ``row[0]`` / ``row[1]``. A row matches when the midpoint of ``y0`` and
    ``y1`` is strictly below ``row[0]`` and strictly above ``row[1]``.
    Returns ``None`` when no row matches.
    """
    for idx, row in enumerate(rows):
        mid_y = (t.y0 + t.y1) / 2.0
        if row[0] > mid_y > row[1]:
            return idx
    return None
def get_column_idx(t, columns):
    """Return the index of the column containing the horizontal middle of ``t``.

    ``t`` must expose ``x0`` and ``x1``; each entry of ``columns`` is indexed
    as ``col[0]`` / ``col[1]``. A column matches when the midpoint of ``x0``
    and ``x1`` is strictly greater than ``col[0]`` and strictly less than
    ``col[1]``. Returns ``None`` when no column matches.
    """
    for idx, col in enumerate(columns):
        mid_x = (t.x0 + t.x1) / 2.0
        if col[0] < mid_x < col[1]:
            return idx
    return None
def reduce_index(t, rotated, r_idx, c_idx):
    """Walk from a spanning cell to the cell that anchors its span.

    ``t.cells[r][c]`` must expose ``spanning_h``, ``spanning_v`` and the edge
    flags ``left``, ``right``, ``top``, ``bottom``. Depending on ``rotated``
    (falsy, ``'left'`` or ``'right'``) the column index is stepped until the
    relevant horizontal edge flag is set, then the row index is stepped until
    the relevant vertical edge flag is set. Any other ``rotated`` value
    leaves the indices unchanged.

    Returns the resulting ``(r_idx, c_idx)`` tuple.
    """
    # Pick, per orientation, which edge ends the walk and in which direction
    # the index moves while that edge is missing.
    if not rotated:
        h_edge, h_step, v_edge, v_step = 'left', -1, 'top', -1
    elif rotated == 'left':
        h_edge, h_step, v_edge, v_step = 'left', -1, 'bottom', 1
    elif rotated == 'right':
        h_edge, h_step, v_edge, v_step = 'right', 1, 'top', -1
    else:
        return r_idx, c_idx

    if t.cells[r_idx][c_idx].spanning_h:
        while not getattr(t.cells[r_idx][c_idx], h_edge):
            c_idx += h_step
    # The vertical check deliberately uses the already adjusted column.
    if t.cells[r_idx][c_idx].spanning_v:
        while not getattr(t.cells[r_idx][c_idx], v_edge):
            r_idx += v_step
    return r_idx, c_idx
def outline(t):
    """Mark the outer border of the table ``t`` as present.

    Sets ``left`` on the first cell and ``right`` on the last cell of every
    row, then ``top`` on the cells of the first row and ``bottom`` on the
    cells of the last row. ``t.cells`` is modified in place and ``t`` is
    returned.
    """
    for row in t.cells:
        row[0].left = True
        row[-1].right = True
    first_row = t.cells[0]
    last_row = t.cells[-1]
    # The column count is taken from the first row; the last row is indexed
    # with the same positions.
    for idx, top_cell in enumerate(first_row):
        top_cell.top = True
        last_row[idx].bottom = True
    return t
def fill(t, f):
    """Copy neighbouring text into empty spanning cells of the table ``t``.

    ``f`` selects the fill direction:

    * ``"h"``  - an empty cell flagged ``spanning_h`` receives the text of
      the cell to its left;
    * ``"v"``  - an empty cell flagged ``spanning_v`` receives the text of
      the cell above it;
    * ``"hv"`` - horizontal fill is tried first, vertical fill only when the
      cell is not ``spanning_h``.

    Any other value of ``f`` leaves the table untouched. A cell counts as
    empty when ``get_text().strip()`` is ``''``. Cells in the first column
    (for horizontal fill) or first row (for vertical fill) have no
    predecessor and are left as they are, instead of wrapping around to the
    last column / row through a negative index.

    The cells are modified in place through ``add_text`` and ``t`` is
    returned.
    """
    horizontal = f in ("h", "hv")
    vertical = f in ("v", "hv")
    if not (horizontal or vertical):
        return t

    for i, row in enumerate(t.cells):
        for j, cell in enumerate(row):
            if cell.get_text().strip() != '':
                continue
            if horizontal and cell.spanning_h:
                # j == 0 has no left neighbour; row[-1] would be the far
                # right cell of the row.
                if j > 0:
                    cell.add_text(row[j - 1].get_text())
            elif vertical and cell.spanning_v:
                # i == 0 has no neighbour above; cells[-1] would be the
                # bottom row.
                if i > 0:
                    cell.add_text(t.cells[i - 1][j].get_text())
    return t
def remove_empty(d):
    """Return the table ``d`` without its empty rows and empty columns.

    ``d`` is a list of rows, each row a list of cell strings. A row is
    dropped when it equals a list of empty strings of the same length; a
    column is dropped when none of its remaining cells is truthy.

    The result is a list of row tuples. The input list is not modified.
    """
    # Filter into a new list: popping from ``d`` while enumerating it skips
    # the row that follows every removed row, so runs of empty rows survived.
    rows = [row for row in d if row != [''] * len(row)]
    # Transpose, drop all-empty columns, transpose back.
    columns = [list(col) for col in zip(*rows) if any(col)]
    # Materialise the result so callers get a list on Python 3 as well,
    # where zip() is lazy.
    return list(zip(*columns))