import os

import numpy as np

from pdf import get_pdf_info


def overlap(l):
    """Groups overlapping columns and returns list with updated columns
    boundaries.

    Each span is only compared with the most recently merged span, so `l`
    must already be sorted by left edge (`stream` passes `sorted(cols)`).

    Parameters
    ----------
    l : list
        List of column x-coordinates, as (left, right) pairs.

    Returns
    -------
    merged : list
        List of merged column x-coordinates.
    """
    merged = []
    for higher in l:
        if merged and higher[0] <= merged[-1][1]:
            # Starts inside the previous span: widen that span in place.
            lower = merged[-1]
            merged[-1] = (min(lower[0], higher[0]),
                          max(lower[1], higher[1]))
        else:
            merged.append(higher)
    return merged


def _group_rows(text, atol=2):
    """Group non-blank text objects into rows of similar y0.

    `text` is expected to be sorted top-to-bottom already. A new row starts
    whenever an object's y0 differs from the y0 of the row's first object by
    more than `atol`. Every returned row is non-empty and sorted by x0.
    """
    rows = []
    current = []
    row_y = None  # y0 of the first object in the current row
    for t in text:
        # TODO: is checking for upright necessary? e.g.
        # all([obj.upright for obj in t._objs if type(obj) is LTChar])
        if not t.get_text().strip():
            continue
        if row_y is None or not np.isclose(row_y, t.y0, atol=atol):
            if current:
                rows.append(sorted(current, key=lambda o: o.x0))
            current = []
            row_y = t.y0
        current.append(t)
    # Flush the trailing row; without this the last row of the page is lost.
    if current:
        rows.append(sorted(current, key=lambda o: o.x0))
    return rows


def _show_textboxes(rows):
    """Draw a rectangle for every text object in `rows` using matplotlib."""
    # Imported lazily so matplotlib is only required when debugging.
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches

    fig = plt.figure()
    ax = fig.add_subplot(111, aspect='equal')
    xs, ys = [], []
    for row in rows:
        for t in row:
            xs.extend([t.x0, t.x1])
            ys.extend([t.y0, t.y1])
            ax.add_patch(
                patches.Rectangle(
                    (t.x0, t.y0),
                    t.x1 - t.x0,
                    t.y1 - t.y0
                )
            )
    # min()/max() raise on an empty sequence, so skip limits on a blank page.
    if xs:
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
    plt.show()


def _column_centers(rows, ncolumns, columns):
    """Return the x-coordinate of the centre of every column.

    Uses the user-supplied `columns` boundaries when given; otherwise guesses
    the columns from the rows whose element count equals `ncolumns` (or,
    when `ncolumns` is falsy, the most common element count).
    """
    if columns:
        if hasattr(columns, 'split'):
            # Documented input is a comma-separated string; indexing it
            # directly would iterate over single characters.
            columns = [c for c in columns.split(',') if c.strip()]
        bounds = [float(c) for c in columns]
        return [(lo + hi) / 2.0 for lo, hi in zip(bounds, bounds[1:])]

    # a table can't have just 1 column, can it?
    elements = [len(row) for row in rows if len(row) != 1]
    if ncolumns:
        mode = ncolumns
    elif elements:
        mode = max(set(elements), key=elements.count)
    else:
        # Nothing to take a mode of; no row can match, so no columns.
        mode = 0
    spans = [(t.x0, t.x1) for row in rows if len(row) == mode for t in row]
    return [(c[0] + c[1]) / 2.0 for c in overlap(sorted(spans))]


def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
           line_margin=0.5, word_margin=0.1, debug=False):
    """Stream algorithm

    Groups data returned by PDFMiner into rows and finds mode of the
    number of elements in each row to guess number of columns.

    Parameters
    ----------
    filepath : string

    ncolumns : int, default: 0, optional
        Number of columns.

    columns : string, default: None, optional
        Comma-separated list of column x-coordinates. A sequence of
        numbers is accepted as well.

    char_margin : float, default: 2.0, optional
        Char margin. Chars closer than cmargin are grouped together
        to form a word.

    line_margin : float, default: 0.5, optional
        Line margin. Lines closer than lmargin are grouped together
        to form a textbox.

    word_margin : float, default: 0.1, optional
        Word margin. Insert blank spaces between chars if distance
        between words is greater than word margin.

    debug : bool, default: False, optional
        Debug by visualizing textboxes.

    Returns
    -------
    output : list
        One list of strings per text row, one string per column. None is
        returned in debug mode, or when no columns could be determined
        for a page that contains text.
    """
    filename = os.path.basename(filepath)
    print("working on " + filename)
    text, __, __ = get_pdf_info(filepath, method='stream',
                                char_margin=char_margin,
                                line_margin=line_margin,
                                word_margin=word_margin)
    # Top-to-bottom, then left-to-right.
    text.sort(key=lambda x: (-x.y0, x.x0))
    rows = _group_rows(text)

    if debug:
        _show_textboxes(rows)
        return None

    cols = _column_centers(rows, ncolumns, columns)
    if rows and not cols:
        print("couldn't find a table on this page")
        return None

    output = [['' for _ in cols] for _ in rows]
    centers = np.asarray(cols, dtype=float)
    for r, row in enumerate(rows):
        for t in row:
            # Assign each textbox to the column whose centre is nearest to
            # the textbox's own horizontal centre (first one wins on ties).
            cog = (t.x0 + t.x1) / 2.0
            idx = int(np.argmin(np.abs(centers - cog)))
            cell = t.get_text().strip()
            if output[r][idx]:
                output[r][idx] += ' ' + cell
            else:
                output[r][idx] = cell
    return output