81 lines
2.6 KiB
Python
81 lines
2.6 KiB
Python
import os
|
|
import csv
|
|
import numpy as np
|
|
|
|
from pdf import get_pdf_info
|
|
|
|
|
|
def overlap(l):
    """Merge overlapping or touching intervals.

    Args:
        l: Iterable of ``(start, end)`` pairs. Any order is accepted; the
            pairs are ordered by ``start`` internally before merging.

    Returns:
        A list of intervals ordered by start. Intervals that overlap or
        touch (the next start is ``<=`` the current end) are collapsed into
        a single ``(start, end)`` tuple. An interval that merged with
        nothing is returned as the object that was passed in.
    """
    merged = []
    # Stable sort on the start only: input that is already ordered by start
    # is processed in exactly the order it was given.
    for interval in sorted(l, key=lambda pair: pair[0]):
        if merged and interval[0] <= merged[-1][1]:
            previous = merged[-1]
            # Starts are non-decreasing here, so only the end can grow.
            merged[-1] = (previous[0], max(previous[1], interval[1]))
        else:
            merged.append(interval)
    return merged
|
|
|
|
|
|
def get_row_idx(t, rows):
    """Return the index of the first row whose span contains ``t`` vertically.

    Args:
        t: Object exposing ``y0`` and ``y1`` attributes.
        rows: Iterable of pairs. ``t`` fits a row when ``t.y1 <= row[0]``
            and ``t.y0 >= row[1]``.

    Returns:
        The zero-based index of the first matching row, or ``None`` when no
        row contains ``t``.
    """
    for idx, row in enumerate(rows):
        if t.y1 <= row[0] and t.y0 >= row[1]:
            return idx
    return None
|
|
|
|
|
|
def get_column_idx(t, columns):
    """Return the index of the first column whose span contains ``t``.

    Args:
        t: Object exposing ``x0`` and ``x1`` attributes.
        columns: Iterable of pairs. ``t`` fits a column when
            ``t.x0 >= column[0]`` and ``t.x1 <= column[1]``.

    Returns:
        The zero-based index of the first matching column, or ``None`` when
        no column contains ``t``.
    """
    for idx, column in enumerate(columns):
        if t.x0 >= column[0] and t.x1 <= column[1]:
            return idx
    return None
|
|
|
|
|
|
def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
    """Extract a table from a PDF by text position and save it as CSV.

    Text objects are grouped into rows by their ``y0`` coordinate. Column
    positions are derived from the rows that have the most common number of
    text objects, and every text object is placed in the column whose centre
    is nearest to its own horizontal centre. Several objects landing in the
    same cell are joined with a single space. The result is written into
    ``pdf_dir`` as ``<filename up to its first dot>.csv``.

    Args:
        pdf_dir: Directory holding the PDF; the CSV is written here as well.
        filename: Name of the PDF file inside ``pdf_dir``.
        char_margin: Passed through to ``get_pdf_info``.
        line_margin: Passed through to ``get_pdf_info``.
        word_margin: Passed through to ``get_pdf_info``.
    """
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("working on %s" % filename)
    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
                              char_margin, line_margin, word_margin)
    # Top of the page first, then left to right.
    text.sort(key=lambda obj: (-obj.y0, obj.x0))
    rows = _group_rows(text)

    # A table is unlikely to have just one column, so single-element rows
    # are ignored when picking the typical row width. If every row has one
    # element, fall back to all rows rather than failing.
    counts = [len(row) for row in rows]
    candidates = [count for count in counts if count != 1] or counts

    columns = []
    if candidates:
        mode = max(set(candidates), key=candidates.count)
        spans = sorted((t.x0, t.x1)
                       for row in rows if len(row) == mode
                       for t in row)
        # Each column is represented by the centre of its merged x-span.
        columns = [(span[0] + span[1]) / 2.0 for span in overlap(spans)]

    output = []
    for row in rows:
        cells = [''] * len(columns)
        for t in row:
            centre = (t.x0 + t.x1) / 2.0
            # Ties go to the left-most column (lowest index).
            nearest = min(range(len(columns)),
                          key=lambda i: abs(centre - columns[i]))
            content = t.get_text().strip()
            if cells[nearest]:
                cells[nearest] += ' ' + content
            else:
                cells[nearest] = content
        output.append(cells)

    # NOTE(review): everything after the first dot is discarded, so
    # "a.b.pdf" and "a.c.pdf" both map to "a.csv" -- confirm with callers
    # before switching to os.path.splitext.
    csvname = filename.split('.')[0] + '.csv'
    csvpath = os.path.join(pdf_dir, csvname)
    _write_csv(csvpath, output)


def _group_rows(text):
    """Group non-blank text objects into rows of similar ``y0``.

    ``text`` must already be ordered top to bottom. A new row starts when an
    object's ``y0`` is not within 2 units of the ``y0`` that opened the
    current row. Objects whose stripped text is empty are skipped.
    """
    # TODO(review): should objects containing non-upright characters be
    # skipped here as well?
    rows = []
    current = []
    y_last = None
    for t in text:
        if not t.get_text().strip():
            continue
        if y_last is None or not np.isclose(y_last, t.y0, atol=2):
            y_last = t.y0
            if current:
                rows.append(current)
            current = []
        current.append(t)
    # Flush the final row; without this the last line of text is lost.
    if current:
        rows.append(current)
    return rows


def _write_csv(csvpath, rows):
    """Write ``rows`` of text cells to ``csvpath`` as fully quoted UTF-8 CSV."""
    if str is bytes:
        # Python 2: csv works on byte strings and expects a binary file.
        with open(csvpath, 'wb') as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
            for row in rows:
                writer.writerow([cell.encode('utf-8') for cell in row])
    else:
        # Python 3: csv works on text; newline='' leaves line endings to csv.
        with open(csvpath, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
            writer.writerows(rows)
|