143 lines
4.3 KiB
Python
143 lines
4.3 KiB
Python
import os
|
|
import numpy as np
|
|
|
|
from pdf import get_pdf_info
|
|
|
|
|
|
def overlap(l):
    """Groups overlapping columns and returns list with updated
    columns boundaries.

    Two columns are merged when the left edge of one is less than or
    equal to the right edge of the other, so columns that merely touch
    are merged as well. The input is sorted before merging, so callers
    do not have to pre-sort it.

    Parameters
    ----------
    l : list
        List of (x0, x1) column boundaries.

    Returns
    -------
    merged : list
        List of merged (x0, x1) column boundaries, in ascending order.
        Columns that were not merged with anything are returned as the
        same objects that were passed in; merged ones are new tuples.
    """
    merged = []
    # Comparing only against the last kept column is valid solely when
    # columns are visited in ascending order of their left edge.
    for higher in sorted(l):
        if merged and higher[0] <= merged[-1][1]:
            lower = merged[-1]
            merged[-1] = (min(lower[0], higher[0]),
                          max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged
|
|
|
|
|
|
def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
           line_margin=0.5, word_margin=0.1, debug=False):
    """Stream algorithm

    Groups the text objects returned by get_pdf_info into rows (objects
    whose y0 lie within about 2 units of the first object of the row)
    and, unless told otherwise, uses the mode of the number of elements
    per row to guess the number of columns. Every text object is then
    assigned to the column whose centre is nearest to its own
    horizontal centre.

    Parameters
    ----------
    filepath : string
        Path of the file handed to get_pdf_info.

    ncolumns : int, default: 0, optional
        Number of columns. When 0, it is guessed from the rows.
        Ignored when `columns` is given.

    columns : string, default: None, optional
        Comma-separated list of column x-coordinates. A sequence of
        numbers (or numeric strings) is accepted as well. Each pair of
        consecutive coordinates delimits one column.

    char_margin : float, default: 2.0, optional
        Char margin. Chars closer than cmargin are grouped together
        to form a word. Forwarded to get_pdf_info.

    line_margin : float, default: 0.5, optional
        Line margin. Lines closer than lmargin are grouped together
        to form a textbox. Forwarded to get_pdf_info.

    word_margin : float, default: 0.1, optional
        Word margin. Insert blank spaces between chars if distance
        between words is greater than word margin. Forwarded to
        get_pdf_info.

    debug : bool, default: False, optional
        Debug by visualizing textboxes. Nothing is extracted in this
        mode.

    Returns
    -------
    output : list
        One list per row of text, each holding one string per column
        ('' for empty cells). None is returned in debug mode or when
        no columns could be determined.
    """
    filename = os.path.basename(filepath)
    # Single-argument call form prints the same on Python 2 and 3.
    print("working on " + filename)
    text, __, __ = get_pdf_info(filepath, method='stream',
                                char_margin=char_margin,
                                line_margin=line_margin,
                                word_margin=word_margin)
    # Top of the page first, then left to right.
    text.sort(key=lambda x: (-x.y0, x.x0))

    # Group text objects into rows by their y0 coordinate.
    data = []
    temp = []
    y_last = None
    for t in text:
        # TODO: decide whether non-upright text should be skipped here.
        if not t.get_text().strip():
            continue
        if temp and not np.isclose(y_last, t.y0, atol=2):
            data.append(temp)
            temp = []
        if not temp:
            # The first object of a row is the reference for that row.
            y_last = t.y0
        temp.append(t)
    if temp:
        # Flush the final row; it has no following row to trigger it.
        data.append(temp)
    elements = [len(d) for d in data]

    if debug:
        import matplotlib.pyplot as plt
        import matplotlib.patches as patches

        fig = plt.figure()
        ax = fig.add_subplot(111, aspect='equal')
        xs, ys = [], []
        for d in data:
            for t in d:
                xs.extend([t.x0, t.x1])
                ys.extend([t.y0, t.y1])
                ax.add_patch(
                    patches.Rectangle(
                        (t.x0, t.y0),
                        t.x1 - t.x0,
                        t.y1 - t.y0
                    )
                )
        if xs:
            # min()/max() raise on an empty page, so only frame the
            # plot when there is something to show.
            ax.set_xlim(min(xs) - 10, max(xs) + 10)
            ax.set_ylim(min(ys) - 10, max(ys) + 10)
        plt.show()
        return None

    if columns:
        if hasattr(columns, 'split'):
            # Documented form: "x0,x1,x2,..."; ignore empty pieces such
            # as the one produced by a trailing comma.
            columns = [c for c in columns.split(',') if c.strip()]
        bounds = [float(c) for c in columns]
        # Column centre = midpoint of two consecutive boundaries.
        cols = [(bounds[i] + bounds[i + 1]) / 2.0
                for i in range(len(bounds) - 1)]
    else:
        # a table can't have just 1 column, can it?
        counts = [n for n in elements if n != 1]
        if ncolumns:
            mode = ncolumns
        elif counts:
            mode = max(set(counts), key=counts.count)
        else:
            # Empty page, or every row holds a single element.
            print("couldn't find a table on this page")
            return None
        # Only rows with the expected number of elements vote for the
        # column extents.
        cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
        cols = overlap(sorted(cols))
        cols = [(c[0] + c[1]) / 2.0 for c in cols]

    if data and not cols:
        # There is text but no column to put it in.
        print("couldn't find a table on this page")
        return None

    output = [['' for c in cols] for d in data]
    for row, d in enumerate(data):
        for t in d:
            cog = (t.x0 + t.x1) / 2.0
            # Nearest column centre; the leftmost one wins a tie.
            idx = min(range(len(cols)), key=lambda i: abs(cog - cols[i]))
            cell = t.get_text().strip()
            if output[row][idx]:
                output[row][idx] += ' ' + cell
            else:
                output[row][idx] = cell

    return output