103 lines
3.2 KiB
Python
103 lines
3.2 KiB
Python
import os
|
|
import csv
|
|
import glob
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
|
|
from table import Table
|
|
from pdf import get_pdf_info
|
|
from morph_transform import morph
|
|
|
|
def remove_close_values(ar):
    """Collapse runs of nearly-equal values, keeping the first of each run.

    Each value is compared against the most recently *kept* value (not the
    previous input value) with ``np.isclose(..., atol=1)``; values judged
    close are dropped.  The input is expected to be ordered already so that
    near-duplicates are adjacent.

    :param ar: iterable of numbers.
    :returns: a new list with the surviving values in their original order.
    """
    kept = []
    for value in ar:
        # Skip anything within tolerance of the last value we kept.
        if kept and np.isclose(kept[-1], value, atol=1):
            continue
        kept.append(value)
    return kept
|
|
|
|
def get_row_idx(t, rows):
    """Find which row band contains the vertical midpoint of ``t``.

    :param t: object exposing numeric ``y0`` and ``y1`` attributes.
    :param rows: sequence of ``(upper, lower)`` pairs with ``upper > lower``.
    :returns: index of the first band whose open interval ``(lower, upper)``
        contains ``abs(t.y0 + t.y1) / 2.0``, or ``None`` if no band matches.
    """
    mid_y = abs(t.y0 + t.y1) / 2.0
    for idx, band in enumerate(rows):
        # Strict on both sides: a midpoint lying exactly on a ruling
        # belongs to neither neighbouring row.
        if band[0] > mid_y > band[1]:
            return idx
    return None
|
|
|
|
def get_column_idx(t, columns):
    """Find which column band contains the horizontal midpoint of ``t``.

    :param t: object exposing numeric ``x0`` and ``x1`` attributes.
    :param columns: sequence of ``(left, right)`` pairs with ``left < right``.
    :returns: index of the first band whose open interval ``(left, right)``
        contains ``abs(t.x0 + t.x1) / 2.0``, or ``None`` if no band matches.
    """
    mid_x = abs(t.x0 + t.x1) / 2.0
    for idx, band in enumerate(columns):
        # Strict on both sides: a midpoint lying exactly on a ruling
        # belongs to neither neighbouring column.
        if band[0] < mid_x < band[1]:
            return idx
    return None
|
|
|
|
def reduce_index(t, r_idx, c_idx):
    """Move a cell index to the anchor cell of the span it belongs to.

    If the cell at ``(r_idx, c_idx)`` is flagged ``spanning_h``, the column
    index is walked left until a cell whose ``left`` flag is set is reached.
    Then, if the cell at the resulting position is flagged ``spanning_v``,
    the row index is walked up until a cell whose ``top`` flag is set.
    (Presumably ``left``/``top`` mean "has a drawn edge on that side" --
    confirm against the Table/Cell implementation.)

    Both walks stop at index 0.  Without that bound a row or column in which
    no cell carries the expected flag would push the index negative, wrap
    around to the far end of the grid via negative indexing, and possibly
    never terminate.

    :param t: table-like object with a 2-D ``cells`` grid.
    :param r_idx: starting row index.
    :param c_idx: starting column index.
    :returns: ``(r_idx, c_idx)`` of the anchor cell.
    """
    if t.cells[r_idx][c_idx].spanning_h:
        while c_idx > 0 and not t.cells[r_idx][c_idx].left:
            c_idx -= 1
    # Checked on the (possibly shifted) column, so order matters here.
    if t.cells[r_idx][c_idx].spanning_v:
        while r_idx > 0 and not t.cells[r_idx][c_idx].top:
            r_idx -= 1
    return r_idx, c_idx
|
|
|
|
def fill(t):
    """Copy neighbouring text into empty cells that belong to a span.

    Cells are visited row by row, left to right.  For every cell whose
    stripped text is empty:

    * if it is flagged ``spanning_h``, the text of the cell immediately to
      its left is added to it;
    * otherwise, if it is flagged ``spanning_v``, the text of the cell
      immediately above is added to it.

    Because earlier cells are filled first, text propagates across a whole
    span.  A cell in the first column (or first row) has no such neighbour
    and is left untouched; indexing ``j - 1`` / ``i - 1`` there would wrap
    around and pull text from the opposite edge of the table.

    :param t: table-like object with a 2-D ``cells`` grid whose cells
        provide ``get_text()``, ``add_text()``, ``spanning_h`` and
        ``spanning_v``.
    :returns: the same ``t``, modified in place.
    """
    for i, row in enumerate(t.cells):
        for j, cell in enumerate(row):
            if cell.get_text().strip() != '':
                continue
            if cell.spanning_h:
                # Horizontal spans keep priority over vertical ones.
                if j > 0:
                    cell.add_text(row[j - 1].get_text())
            elif cell.spanning_v:
                if i > 0:
                    cell.add_text(t.cells[i - 1][j].get_text())
    return t
|
|
|
|
def spreadsheet(pdf_dir, filename, guess, scale):
|
|
print "working on", filename
|
|
imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
|
|
text, pdf_x, pdf_y = get_pdf_info(os.path.join(pdf_dir, filename), 'spreadsheet')
|
|
tables, v_segments, h_segments = morph(imagename, pdf_x, pdf_y, scale)
|
|
|
|
num_tables = 0
|
|
for k in sorted(tables.keys(), reverse=True):
|
|
# find rows and columns that lie in table
|
|
lb = k
|
|
rt = tables[k]
|
|
v_s = [v for v in v_segments if v[1] > lb[1] - 2 and v[3] < rt[1] + 2 and lb[0] - 2 <= v[0] <= rt[0] + 2]
|
|
h_s = [h for h in h_segments if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]
|
|
columns = [v[0] for v in v_s]
|
|
rows = [h[1] for h in h_s]
|
|
# sort horizontal and vertical segments
|
|
columns = remove_close_values(sorted(columns))
|
|
rows = remove_close_values(sorted(rows, reverse=True))
|
|
# make grid using x and y coord of shortlisted rows and columns
|
|
columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)]
|
|
rows = [(rows[i], rows[i + 1]) for i in range(0, len(rows) - 1)]
|
|
|
|
table = Table(columns, rows)
|
|
# pass row and column line segments to table method and light up cell edges
|
|
table = table.set_edges(v_s, h_s)
|
|
# table set span method
|
|
table = table.set_spanning()
|
|
# fill text after sorting it
|
|
text.sort(key=lambda x: (-x.y0, x.x0))
|
|
|
|
for t in text:
|
|
r_idx = get_row_idx(t, rows)
|
|
c_idx = get_column_idx(t, columns)
|
|
if None in [r_idx, c_idx]:
|
|
pass
|
|
else:
|
|
r_idx, c_idx = reduce_index(table, r_idx, c_idx)
|
|
table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))
|
|
|
|
if guess:
|
|
table = fill(table)
|
|
|
|
csvname = filename.split('.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
|
|
csvpath = os.path.join(pdf_dir, csvname)
|
|
with open(csvpath, 'w') as outfile:
|
|
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
|
|
for i in range(len(table.cells)):
|
|
writer.writerow([table.cells[i][j].get_text().strip().encode('utf-8') for j in range(len(table.cells[i]))])
|
|
print "saved as", csvname
|
|
print
|
|
num_tables += 1 |