camelot-py/basic.py

81 lines
2.6 KiB
Python

import os
import csv
import numpy as np
from pdf import get_pdf_info
def overlap(l):
    """Merge overlapping or touching intervals into disjoint intervals.

    Args:
        l: Iterable of ``(start, end)`` pairs. The pairs may be given in
            any order; they are sorted here before merging.

    Returns:
        A new list of intervals sorted by start. Two intervals are merged
        when the start of the later one is less than or equal to the end
        of the earlier one. Merged intervals are tuples; an interval that
        was not merged with anything is returned as the original object.
    """
    merged = []
    # The sweep below only compares against the most recently emitted
    # interval, which is correct only when starts are non-decreasing.
    # Sorting here keeps the function correct for unsorted input too.
    for current in sorted(l):
        if merged and current[0] <= merged[-1][1]:
            previous = merged[-1]
            merged[-1] = (min(previous[0], current[0]),
                          max(previous[1], current[1]))
        else:
            merged.append(current)
    return merged
def get_row_idx(t, rows):
    """Return the index of the first row whose vertical span contains ``t``.

    Args:
        t: Object exposing numeric ``y0`` and ``y1`` attributes.
        rows: Sequence of 2-item sequences. ``t`` matches an entry when
            ``t.y1 <= entry[0]`` and ``t.y0 >= entry[1]``, i.e. entry[0] is
            the larger y bound and entry[1] the smaller one.
            NOTE(review): presumably PDF coordinates with y growing
            upwards, so entries are (top, bottom) -- confirm with callers.

    Returns:
        The integer index of the first matching row, or ``None`` when no
        row contains ``t``.
    """
    for idx, (upper, lower) in enumerate(rows):
        if t.y1 <= upper and t.y0 >= lower:
            return idx
    return None
def get_column_idx(t, columns):
    """Return the index of the first column whose span contains ``t``.

    Args:
        t: Object exposing numeric ``x0`` and ``x1`` attributes.
        columns: Sequence of 2-item sequences ``(left, right)``. ``t``
            matches an entry when ``t.x0 >= left`` and ``t.x1 <= right``.

    Returns:
        The integer index of the first matching column, or ``None`` when
        no column contains ``t``.
    """
    for idx, (left, right) in enumerate(columns):
        if t.x0 >= left and t.x1 <= right:
            return idx
    return None
def _group_rows(text):
    """Group text objects into rows of (approximately) equal ``y0``.

    ``text`` must already be ordered top-to-bottom. Objects whose text is
    empty after stripping are skipped. A new row starts whenever ``y0``
    differs from the row's first ``y0`` by more than the ``np.isclose``
    tolerance (``atol=2``).
    """
    rows = []
    current = []
    y_last = None
    for t in text:
        # NOTE: it is an open question whether non-upright characters
        # should be filtered out here as well.
        if not t.get_text().strip():
            continue
        if y_last is None or not np.isclose(y_last, t.y0, atol=2):
            # Only emit non-empty groups so no blank leading row is made.
            if current:
                rows.append(current)
            current = []
            y_last = t.y0
        current.append(t)
    # Flush the final group; otherwise the last line of text is lost.
    if current:
        rows.append(current)
    return rows


def _column_centers(rows):
    """Return the x centre of each column inferred from the typical rows.

    The typical row is the one whose element count is the most frequent
    among rows that do not have exactly one element (a table is assumed to
    have more than one column). The x spans of the elements of those rows
    are merged with ``overlap`` and the midpoint of each merged span is
    returned.

    Raises:
        ValueError: if no row has more than one element, so no column
            layout can be inferred.
    """
    counts = [len(row) for row in rows if len(row) != 1]
    if not counts:
        raise ValueError(
            'cannot infer columns: no row with more than one text element')
    mode = max(set(counts), key=counts.count)
    spans = sorted((t.x0, t.x1) for row in rows if len(row) == mode
                   for t in row)
    return [(left + right) / 2.0 for left, right in overlap(spans)]


def _write_csv(path, rows):
    """Write ``rows`` (lists of text cells) to ``path`` as fully quoted CSV."""
    if str is bytes:
        # Python 2: the csv module writes byte strings and needs a file
        # opened in binary mode, so cells are encoded to UTF-8 first.
        with open(path, 'wb') as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
            for row in rows:
                writer.writerow([cell.encode('utf-8') for cell in row])
    else:
        # Python 3: the csv module writes text; newline='' lets it control
        # line endings itself.
        with open(path, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
            writer.writerows(rows)


def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
    """Extract a table from a PDF by text position and save it as CSV.

    The text objects returned by ``get_pdf_info`` are grouped into rows by
    their ``y0`` coordinate, column centres are inferred from the rows with
    the most common element count, and every text object is placed in the
    column whose centre is nearest to its own horizontal centre. Several
    objects landing in the same cell are joined with a single space.

    The result is written next to the PDF, using the PDF's name with its
    extension replaced by ``.csv``.

    Args:
        pdf_dir: Directory that contains the PDF and receives the CSV.
        filename: Name of the PDF file inside ``pdf_dir``.
        char_margin: Passed through to ``get_pdf_info``.
        line_margin: Passed through to ``get_pdf_info``.
        word_margin: Passed through to ``get_pdf_info``.
            NOTE(review): the margins are presumably pdfminer layout
            parameters -- confirm against ``get_pdf_info``.

    Raises:
        ValueError: if no row with more than one text element is found.
    """
    print("working on %s" % (filename,))
    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
                              char_margin, line_margin, word_margin)
    # Top-to-bottom (descending y0), then left-to-right.
    text.sort(key=lambda x: (-x.y0, x.x0))

    rows = _group_rows(text)
    columns = _column_centers(rows)

    output = [[''] * len(columns) for _ in rows]
    for row_idx, row in enumerate(rows):
        for t in row:
            cog = (t.x0 + t.x1) / 2.0
            # Nearest column centre; ties go to the leftmost column.
            col_idx = min(range(len(columns)),
                          key=lambda i: abs(cog - columns[i]))
            cell_text = t.get_text().strip()
            if output[row_idx][col_idx]:
                output[row_idx][col_idx] += ' ' + cell_text
            else:
                output[row_idx][col_idx] = cell_text

    # splitext keeps dots inside the base name ("a.b.pdf" -> "a.b.csv").
    csvname = os.path.splitext(filename)[0] + '.csv'
    _write_csv(os.path.join(pdf_dir, csvname), output)