camelot-py/spreadsheet.py

103 lines
3.2 KiB
Python

import os
import csv
import glob
import numpy as np
import matplotlib.pyplot as plt
from table import Table
from pdf import get_pdf_info
from morph_transform import morph
def remove_close_values(ar, atol=1):
    """Collapse runs of nearly-equal values, keeping the first value of each run.

    Every element is compared against the most recently *kept* value (not
    against its direct predecessor in ``ar``), so the input should already be
    sorted, ascending or descending, for the result to be meaningful.

    Parameters
    ----------
    ar : iterable of numbers
        Values to de-duplicate.
    atol : number, optional
        Absolute tolerance passed to ``np.isclose`` (its default relative
        tolerance still applies on top). Defaults to 1, the value that was
        previously hard-coded.

    Returns
    -------
    list
        The kept values, in their original order. Empty input gives ``[]``.
    """
    ret = []
    for a in ar:
        # keep `a` only if it is not within tolerance of the last kept value
        if not ret or not np.isclose(ret[-1], a, atol=atol):
            ret.append(a)
    return ret
def get_row_idx(t, rows):
    """Return the index of the first row band holding the vertical midpoint of `t`.

    `t` must expose `y0` and `y1`; `rows` is a sequence of pairs whose first
    element is the larger y bound and whose second is the smaller one. The
    midpoint has to lie strictly between the two bounds. Returns None when no
    band matches.
    """
    y_mid = abs(t.y0 + t.y1) / 2.0
    for idx, band in enumerate(rows):
        if band[1] < y_mid < band[0]:
            return idx
    return None
def get_column_idx(t, columns):
    """Return the index of the first column band holding the horizontal midpoint of `t`.

    `t` must expose `x0` and `x1`; `columns` is a sequence of pairs whose
    first element is the smaller x bound and whose second is the larger one.
    The midpoint has to lie strictly between the two bounds. Returns None
    when no band matches.
    """
    x_mid = abs(t.x0 + t.x1) / 2.0
    for idx, band in enumerate(columns):
        if band[0] < x_mid < band[1]:
            return idx
    return None
def reduce_index(t, r_idx, c_idx):
    """Move a (row, column) index to the anchor cell of the span it belongs to.

    If the cell at ``t.cells[r_idx][c_idx]`` is flagged ``spanning_h``, the
    column index is walked left until a cell whose ``left`` attribute is
    truthy is reached. The cell at the resulting position is then checked for
    ``spanning_v`` and, if set, the row index is walked up until a cell whose
    ``top`` attribute is truthy is reached.

    Both walks stop at index 0. Without that bound, a row or column in which
    no cell has the expected edge set would push the index negative, where
    Python's negative indexing silently wraps to the last column/row and
    eventually raises ``IndexError``.

    Parameters
    ----------
    t : object exposing ``cells`` as a 2-D list of cell objects
    r_idx, c_idx : int
        Starting row and column index.

    Returns
    -------
    tuple of (int, int)
        The adjusted ``(r_idx, c_idx)``.
    """
    if t.cells[r_idx][c_idx].spanning_h:
        # walk left, but never past the first column
        while c_idx > 0 and not t.cells[r_idx][c_idx].left:
            c_idx -= 1
    if t.cells[r_idx][c_idx].spanning_v:
        # walk up, but never past the first row
        while r_idx > 0 and not t.cells[r_idx][c_idx].top:
            r_idx -= 1
    return r_idx, c_idx
def fill(t):
    """Fill empty spanning cells with the text of a neighbouring cell.

    Cells are visited row by row, left to right. A cell whose stripped text
    is empty receives:

    * the text of the cell to its left, if it is flagged ``spanning_h``;
    * otherwise the text of the cell above it, if it is flagged ``spanning_v``.

    Because cells are visited in order, a value copied into one cell can be
    copied onward into the next empty spanning cell.

    A cell in the first column (or first row) has no left (or upper)
    neighbour, so it is left untouched. Indexing with ``j - 1`` / ``i - 1``
    there would wrap around to the last cell of the row / column and copy
    unrelated text.

    Parameters
    ----------
    t : object exposing ``cells`` as a 2-D list of cell objects
        Each cell must provide ``get_text()``, ``add_text(text)``,
        ``spanning_h`` and ``spanning_v``.

    Returns
    -------
    The same ``t`` object, modified in place.
    """
    for i, row in enumerate(t.cells):
        for j, cell in enumerate(row):
            if cell.get_text().strip() != '':
                continue
            if cell.spanning_h:
                # no left neighbour in the first column
                if j > 0:
                    cell.add_text(row[j - 1].get_text())
            elif cell.spanning_v:
                # no upper neighbour in the first row
                if i > 0:
                    cell.add_text(t.cells[i - 1][j].get_text())
    return t
def spreadsheet(pdf_dir, filename, guess, scale):
    """Extract each detected table from a PDF and write it to its own CSV file.

    The text objects and page dimensions come from ``get_pdf_info``; table
    bounding boxes and line segments come from ``morph``, which is run on a
    PNG expected at ``<pdf_dir>/<name>.png`` where ``<name>`` is ``filename``
    up to its first dot. For every table a grid is built from the segments
    that fall inside its bounding box, each text object is placed in the cell
    containing its midpoint, and the grid is written to
    ``<pdf_dir>/<name>_table_<n>.csv`` (``n`` starting at 1).

    Parameters
    ----------
    pdf_dir : str
        Directory holding the PDF and its PNG; CSV files are written here.
    filename : str
        Name of the PDF file inside ``pdf_dir``.
    guess : bool
        When truthy, empty spanning cells are filled with ``fill`` before the
        table is written.
    scale :
        Forwarded unchanged to ``morph``.

    Returns
    -------
    None
    """
    # single-argument form: same output under Python 2, also valid Python 3
    print("working on %s" % filename)
    # NOTE(review): split('.')[0] truncates names containing extra dots;
    # kept as is because the PNG is presumably produced elsewhere using the
    # same convention -- confirm before switching to os.path.splitext.
    basename = filename.split('.')[0]
    imagename = os.path.join(pdf_dir, basename + '.png')
    text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
    tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)
    # sort text once: the ordering does not depend on the table being processed
    text.sort(key=lambda x: (-x.y0, x.x0))
    for table_no, lb in enumerate(sorted(tables.keys(), reverse=True), 1):
        # find rows and columns that lie in table (2 unit tolerance on each side)
        rt = tables[lb]
        v_s = [v for v in v_segments
               if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
        h_s = [h for h in h_segments
               if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
        # sort segment coordinates and merge near-duplicates
        col_xs = remove_close_values(sorted(v[0] for v in v_s))
        row_ys = remove_close_values(sorted((h[1] for h in h_s), reverse=True))
        # make grid using x and y coord of shortlisted rows and columns
        columns = list(zip(col_xs[:-1], col_xs[1:]))
        rows = list(zip(row_ys[:-1], row_ys[1:]))
        table = Table(columns, rows)
        # pass row and column line segments to table method and light up cell edges
        table = table.set_edges(v_s, h_s)
        # table set span method
        table = table.set_spanning()
        # put every text object that falls inside the grid into its cell
        for t in text:
            r_idx = get_row_idx(t, rows)
            c_idx = get_column_idx(t, columns)
            if r_idx is None or c_idx is None:
                # text lies outside this table
                continue
            r_idx, c_idx = reduce_index(table, r_idx, c_idx)
            table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
        if guess:
            table = fill(table)
        csvname = basename + ('_table_%d' % table_no) + '.csv'
        csvpath = os.path.join(pdf_dir, csvname)
        # cells are encoded to UTF-8 byte strings, as the Python 2 csv module expects
        with open(csvpath, 'w') as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
            for row in table.cells:
                writer.writerow([cell.get_text().strip().encode('utf-8') for cell in row])
        print("saved as %s" % csvname)
        print("")