diff --git a/camelot/lattice.py b/camelot/lattice.py index 041b296..c68520e 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -1,18 +1,31 @@ -from __future__ import print_function +from __future__ import division import os +import types +import copy_reg +import logging import cv2 import numpy as np +from wand.image import Image + from .table import Table from .utils import (transform, elements_bbox, detect_vertical, merge_close_values, - get_row_index, get_column_index, reduce_index, outline, - fill_spanning, remove_empty, encode_list) + get_row_index, get_column_index, get_score, reduce_index, + outline, fill_spanning, count_empty, encode_list, pdf_to_text) __all__ = ['Lattice'] +def _reduce_method(m): + if m.im_self is None: + return getattr, (m.im_class, m.im_func.func_name) + else: + return getattr, (m.im_self, m.im_func.func_name) +copy_reg.pickle(types.MethodType, _reduce_method) + + def _morph_transform(imagename, scale=15, invert=False): """Morphological Transformation @@ -65,8 +78,8 @@ def _morph_transform(imagename, scale=15, invert=False): vertical = threshold horizontal = threshold - verticalsize = vertical.shape[0] / scale - horizontalsize = horizontal.shape[1] / scale + verticalsize = vertical.shape[0] // scale + horizontalsize = horizontal.shape[1] // scale ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) @@ -79,8 +92,12 @@ def _morph_transform(imagename, scale=15, invert=False): mask = vertical + horizontal joints = np.bitwise_and(vertical, horizontal) - __, contours, __ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + try: + __, contours, __ = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + contours, __ = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] tables = {} @@ -88,8 +105,12 @@ def 
_morph_transform(imagename, scale=15, invert=False): c_poly = cv2.approxPolyDP(c, 3, True) x, y, w, h = cv2.boundingRect(c_poly) roi = joints[y : y + h, x : x + w] - __, jc, __ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + try: + __, jc, __ = cv2.findContours( + roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + jc, __ = cv2.findContours( + roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) if len(jc) <= 4: # remove contours with less than <=4 joints continue joint_coords = [] @@ -100,16 +121,24 @@ def _morph_transform(imagename, scale=15, invert=False): tables[(x, y + h, x + w, y)] = joint_coords v_segments, h_segments = [], [] - _, vcontours, _ = cv2.findContours( - vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + try: + _, vcontours, _ = cv2.findContours( + vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + vcontours, _ = cv2.findContours( + vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) for vc in vcontours: x, y, w, h = cv2.boundingRect(vc) x1, x2 = x, x + w y1, y2 = y, y + h v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) - _, hcontours, _ = cv2.findContours( - horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + try: + _, hcontours, _ = cv2.findContours( + horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + hcontours, _ = cv2.findContours( + horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) for hc in hcontours: x, y, w, h = cv2.boundingRect(hc) x1, x2 = x, x + w @@ -160,24 +189,19 @@ class Lattice: page as value. 
""" - def __init__(self, pdfobject, fill=None, scale=15, jtol=2, mtol=2, - invert=False, debug=None, verbose=False): + def __init__(self, fill=None, scale=15, jtol=2, mtol=2, + invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None): - self.pdfobject = pdfobject + self.method = 'lattice' self.fill = fill self.scale = scale self.jtol = jtol self.mtol = mtol self.invert = invert + self.char_margin, self.line_margin, self.word_margin = pdf_margin self.debug = debug - self.verbose = verbose - self.tables = {} - if self.debug is not None: - self.debug_images = {} - self.debug_segments = {} - self.debug_tables = {} - def get_tables(self): + def get_tables(self, pdfname): """Returns all tables found in given pdf. Returns @@ -186,169 +210,124 @@ class Lattice: Dictionary with page number as key and list of tables on that page as value. """ - vprint = print if self.verbose else lambda *a, **k: None - self.pdfobject.split() - self.pdfobject.convert() - for page in self.pdfobject.extract(): - p, text, __, width, height = page - pkey = 'pg-{0}'.format(p) - imagename = os.path.join( - self.pdfobject.temp, '{}.png'.format(pkey)) - pdf_x = width - pdf_y = height - img, table_bbox, v_segments, h_segments = _morph_transform( - imagename, scale=self.scale, invert=self.invert) - img_x = img.shape[1] - img_y = img.shape[0] - scaling_factor_x = pdf_x / float(img_x) - scaling_factor_y = pdf_y / float(img_y) + text, __, width, height = pdf_to_text(pdfname, self.char_margin, + self.line_margin, self.word_margin) + bname, __ = os.path.splitext(pdfname) + if not text: + logging.warning("{0}: PDF has no text. 
It may be an image.".format( + os.path.basename(bname))) + return None + imagename = ''.join([bname, '.png']) + with Image(filename=pdfname, depth=8, resolution=300) as png: + png.save(filename=imagename) + pdf_x = width + pdf_y = height + img, table_bbox, v_segments, h_segments = _morph_transform( + imagename, scale=self.scale, invert=self.invert) + img_x = img.shape[1] + img_y = img.shape[0] + scaling_factor_x = pdf_x / float(img_x) + scaling_factor_y = pdf_y / float(img_y) - if self.debug is not None: - self.debug_images[pkey] = (img, table_bbox) + if self.debug: + self.debug_images = (img, table_bbox) - factors = (scaling_factor_x, scaling_factor_y, img_y) - table_bbox, v_segments, h_segments = transform(table_bbox, v_segments, - h_segments, factors) + factors = (scaling_factor_x, scaling_factor_y, img_y) + table_bbox, v_segments, h_segments = transform(table_bbox, v_segments, + h_segments, factors) - if self.debug is not None: - self.debug_segments[pkey] = (v_segments, h_segments) + if self.debug: + self.debug_segments = (v_segments, h_segments) + self.debug_tables = [] - if self.debug is not None: - debug_page_tables = [] - page_tables = [] - # sort tables based on y-coord - for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): - # select edges which lie within table_bbox - text_bbox, v_s, h_s = elements_bbox(k, text, v_segments, - h_segments) - rotated = detect_vertical(text_bbox) - cols, rows = zip(*table_bbox[k]) - cols, rows = list(cols), list(rows) - cols.extend([k[0], k[2]]) - rows.extend([k[1], k[3]]) - # sort horizontal and vertical segments - cols = merge_close_values(sorted(cols), mtol=self.mtol) - rows = merge_close_values( - sorted(rows, reverse=True), mtol=self.mtol) - # make grid using x and y coord of shortlisted rows and cols - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] - table = Table(cols, rows) - # set table edges to True using 
ver+hor lines - table = table.set_edges(v_s, h_s, jtol=self.jtol) - # set spanning cells to True - table = table.set_spanning() - # set table border edges to True - table = outline(table) + pdf_page = {} + page_tables = {} + table_no = 1 + # sort tables based on y-coord + for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True): + # select edges which lie within table_bbox + table_info = {} + text_bbox, v_s, h_s = elements_bbox(k, text, v_segments, + h_segments) + table_info['text_p'] = 100 * (1 - (len(text_bbox) / len(text))) + rotated = detect_vertical(text_bbox) + cols, rows = zip(*table_bbox[k]) + cols, rows = list(cols), list(rows) + cols.extend([k[0], k[2]]) + rows.extend([k[1], k[3]]) + # sort horizontal and vertical segments + cols = merge_close_values(sorted(cols), mtol=self.mtol) + rows = merge_close_values( + sorted(rows, reverse=True), mtol=self.mtol) + # make grid using x and y coord of shortlisted rows and cols + cols = [(cols[i], cols[i + 1]) + for i in range(0, len(cols) - 1)] + rows = [(rows[i], rows[i + 1]) + for i in range(0, len(rows) - 1)] + table = Table(cols, rows) + # set table edges to True using ver+hor lines + table = table.set_edges(v_s, h_s, jtol=self.jtol) + nouse = table.nocont_ / (len(v_s) + len(h_s)) + table_info['line_p'] = 100 * (1 - nouse) + # set spanning cells to True + table = table.set_spanning() + # set table border edges to True + table = outline(table) - if self.debug is not None: - debug_page_tables.append(table) + if self.debug: + self.debug_tables.append(table) - # fill text after sorting it - if rotated == '': - text_bbox.sort(key=lambda x: (-x.y0, x.x0)) - elif rotated == 'left': - text_bbox.sort(key=lambda x: (x.x0, x.y0)) - elif rotated == 'right': - text_bbox.sort(key=lambda x: (-x.x0, -x.y0)) - for t in text_bbox: - r_idx = get_row_index(t, rows) - c_idx = get_column_index(t, cols) - if None in [r_idx, c_idx]: - # couldn't assign LTChar to any cell - pass - else: - r_idx, c_idx = reduce_index( - table, 
rotated, r_idx, c_idx) - table.cells[r_idx][c_idx].add_text( - t.get_text().strip('\n')) + # fill text after sorting it + if rotated == '': + text_bbox.sort(key=lambda x: (-x.y0, x.x0)) + elif rotated == 'left': + text_bbox.sort(key=lambda x: (x.x0, x.y0)) + elif rotated == 'right': + text_bbox.sort(key=lambda x: (-x.x0, -x.y0)) - if self.fill is not None: - table = fill_spanning(table, fill=self.fill) - ar = table.get_list() - if rotated == 'left': - ar = zip(*ar[::-1]) - elif rotated == 'right': - ar = zip(*ar[::1]) - ar.reverse() - ar = remove_empty(ar) - ar = [list(o) for o in ar] - page_tables.append(encode_list(ar)) - vprint(pkey) - self.tables[pkey] = page_tables + rerror = [] + cerror = [] + for t in text_bbox: + try: + r_idx, rass_error = get_row_index(t, rows) + except TypeError: + # couldn't assign LTChar to any cell + continue + try: + c_idx, cass_error = get_column_index(t, cols) + except TypeError: + # couldn't assign LTChar to any cell + continue + rerror.append(rass_error) + cerror.append(cass_error) + r_idx, c_idx = reduce_index( + table, rotated, r_idx, c_idx) + table.cells[r_idx][c_idx].add_text( + t.get_text().strip('\n')) + score = get_score([[50, rerror], [50, cerror]]) + table_info['score'] = score - if self.debug is not None: - self.debug_tables[pkey] = debug_page_tables + if self.fill is not None: + table = fill_spanning(table, fill=self.fill) + ar = table.get_list() + if rotated == 'left': + ar = zip(*ar[::-1]) + elif rotated == 'right': + ar = zip(*ar[::1]) + ar.reverse() + ar = encode_list(ar) + table_info['data'] = ar + empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) + table_info['empty_p'] = empty_p + table_info['r_nempty_cells'] = r_nempty_cells + table_info['c_nempty_cells'] = c_nempty_cells + table_info['nrows'] = len(ar) + table_info['ncols'] = len(ar[0]) + page_tables['table_{0}'.format(table_no)] = table_info + table_no += 1 + pdf_page[os.path.basename(bname)] = page_tables - if self.pdfobject.clean: - 
self.pdfobject.remove_tempdir() - - if self.debug is not None: + if self.debug: return None - return self.tables - - def plot_geometry(self, geometry): - """Plots various pdf geometries that are detected so user can choose - tweak scale, jtol, mtol parameters. - """ - import matplotlib.pyplot as plt - - if geometry == 'contour': - for pkey in self.debug_images.keys(): - img, table_bbox = self.debug_images[pkey] - for t in table_bbox.keys(): - cv2.rectangle(img, (t[0], t[1]), - (t[2], t[3]), (255, 0, 0), 3) - plt.imshow(img) - plt.show() - elif geometry == 'joint': - x_coord = [] - y_coord = [] - for pkey in self.debug_images.keys(): - img, table_bbox = self.debug_images[pkey] - for k in table_bbox.keys(): - for coord in table_bbox[k]: - x_coord.append(coord[0]) - y_coord.append(coord[1]) - max_x, max_y = max(x_coord), max(y_coord) - plt.plot(x_coord, y_coord, 'ro') - plt.axis([0, max_x + 100, max_y + 100, 0]) - plt.imshow(img) - plt.show() - elif geometry == 'line': - for pkey in self.debug_segments.keys(): - v_s, h_s = self.debug_segments[pkey] - for v in v_s: - plt.plot([v[0], v[2]], [v[1], v[3]]) - for h in h_s: - plt.plot([h[0], h[2]], [h[1], h[3]]) - plt.show() - elif geometry == 'table': - for pkey in self.debug_tables.keys(): - for table in self.debug_tables[pkey]: - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - if table.cells[i][j].left: - plt.plot([table.cells[i][j].lb[0], - table.cells[i][j].lt[0]], - [table.cells[i][j].lb[1], - table.cells[i][j].lt[1]]) - if table.cells[i][j].right: - plt.plot([table.cells[i][j].rb[0], - table.cells[i][j].rt[0]], - [table.cells[i][j].rb[1], - table.cells[i][j].rt[1]]) - if table.cells[i][j].top: - plt.plot([table.cells[i][j].lt[0], - table.cells[i][j].rt[0]], - [table.cells[i][j].lt[1], - table.cells[i][j].rt[1]]) - if table.cells[i][j].bottom: - plt.plot([table.cells[i][j].lb[0], - table.cells[i][j].rb[0]], - [table.cells[i][j].lb[1], - table.cells[i][j].rb[1]]) - plt.show() \ No newline at 
end of file + return pdf_page \ No newline at end of file diff --git a/camelot/pdf.py b/camelot/pdf.py index ce8783c..e552e98 100644 --- a/camelot/pdf.py +++ b/camelot/pdf.py @@ -1,18 +1,11 @@ import os import shutil import tempfile +import itertools +import multiprocessing as mp +import cv2 from PyPDF2 import PdfFileReader, PdfFileWriter -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfpage import PDFTextExtractionNotAllowed -from pdfminer.pdfinterp import PDFResourceManager -from pdfminer.pdfinterp import PDFPageInterpreter -from pdfminer.pdfdevice import PDFDevice -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal -from wand.image import Image __all__ = ['Pdf'] @@ -38,38 +31,6 @@ def _parse_page_numbers(pagenos): return page_numbers -def _extract_text_objects(layout, LTObject, t=None): - """Recursively parses pdf layout to get a list of - text objects. - - Parameters - ---------- - layout : object - Layout object. - - LTObject : object - Text object, either LTChar or LTTextLineHorizontal. - - t : list (optional, default: None) - - Returns - ------- - t : list - List of text objects. - """ - if t is None: - t = [] - try: - for obj in layout._objs: - if isinstance(obj, LTObject): - t.append(obj) - else: - t += _extract_text_objects(obj, LTObject) - except AttributeError: - pass - return t - - class Pdf: """Handles all pdf operations which include: @@ -99,66 +60,163 @@ class Pdf: is greater than word_margin. 
(optional, default: 0.1) """ - def __init__(self, pdfname, pagenos=[{'start': 1, 'end': 1}], - char_margin=2.0, line_margin=0.5, word_margin=0.1, - clean=False): + def __init__(self, extractor, pdfname, pagenos=[{'start': 1, 'end': 1}], + parallel=False, clean=False): + self.extractor = extractor self.pdfname = pdfname + if not self.pdfname.endswith('.pdf'): + raise TypeError("Only PDF format is supported right now.") self.pagenos = _parse_page_numbers(pagenos) - self.char_margin = char_margin - self.line_margin = line_margin - self.word_margin = word_margin + self.parallel = parallel + self.cpu_count = mp.cpu_count() + self.pool = mp.Pool(processes=self.cpu_count) self.clean = clean self.temp = tempfile.mkdtemp() def split(self): """Splits pdf into single page pdfs. """ - if not self.pdfname.endswith('.pdf'): - raise TypeError("Only PDF format is supported.") infile = PdfFileReader(open(self.pdfname, 'rb'), strict=False) for p in self.pagenos: page = infile.getPage(p - 1) outfile = PdfFileWriter() outfile.addPage(page) - with open(os.path.join(self.temp, 'pg-{0}.pdf'.format(p)), 'wb') as f: + with open(os.path.join(self.temp, 'page-{0}.pdf'.format(p)), 'wb') as f: outfile.write(f) + def remove_tempdir(self): + shutil.rmtree(self.temp) + def extract(self): """Extracts text objects, width, height from a pdf. 
""" - for p in self.pagenos: - pkey = 'pg-{0}'.format(p) - pname = os.path.join(self.temp, '{}.pdf'.format(pkey)) - with open(pname, 'r') as f: - parser = PDFParser(f) - document = PDFDocument(parser) - if not document.is_extractable: - raise PDFTextExtractionNotAllowed - laparams = LAParams(char_margin=self.char_margin, - line_margin=self.line_margin, - word_margin=self.word_margin) - rsrcmgr = PDFResourceManager() - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - for page in PDFPage.create_pages(document): - interpreter.process_page(page) - layout = device.get_result() - lattice_objects = _extract_text_objects(layout, LTChar) - stream_objects = _extract_text_objects( - layout, LTTextLineHorizontal) - width = layout.bbox[2] - height = layout.bbox[3] - yield p, lattice_objects, stream_objects, width, height + self.split() + pages = [os.path.join(self.temp, 'page-{0}.pdf'.format(p)) + for p in self.pagenos] + if self.parallel: + tables = self.pool.map(self.extractor.get_tables, pages) + tables = {k: v for d in tables if d is not None for k, v in d.items()} + else: + tables = {} + if self.extractor.debug: + if self.extractor.method == 'stream': + self.debug = self.extractor.debug + self.debug_text = [] + elif self.extractor.method == 'lattice': + self.debug = self.extractor.debug + self.debug_images = [] + self.debug_segments = [] + self.debug_tables = [] + for p in pages: + table = self.extractor.get_tables(p) + if table is not None: + tables.update(table) + if self.extractor.debug: + if self.extractor.method == 'stream': + self.debug_text.append(self.extractor.debug_text) + elif self.extractor.method == 'lattice': + self.debug_images.append(self.extractor.debug_images) + self.debug_segments.append(self.extractor.debug_segments) + self.debug_tables.append(self.extractor.debug_tables) + if self.clean: + self.remove_tempdir() + return tables - def convert(self): - """Converts single page pdfs to images. 
+ def debug_plot(self): + """Plots all text objects and various pdf geometries so that + user can choose number of columns, columns x-coordinates for + Stream or tweak Lattice parameters (scale, jtol, mtol). """ - for p in self.pagenos: - pdfname = os.path.join(self.temp, 'pg-{0}.pdf'.format(p)) - imagename = os.path.join(self.temp, 'pg-{0}.png'.format(p)) - with Image(filename=pdfname, depth=8, resolution=300) as png: - png.save(filename=imagename) + import matplotlib.pyplot as plt + import matplotlib.patches as patches - def remove_tempdir(self): - shutil.rmtree(self.temp) + if self.debug is True: + try: + for text in self.debug_text: + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + xs, ys = [], [] + for t in text: + xs.extend([t[0], t[1]]) + ys.extend([t[2], t[3]]) + ax.add_patch( + patches.Rectangle( + (t[0], t[1]), + t[2] - t[0], + t[3] - t[1] + ) + ) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + plt.show() + except AttributeError: + raise ValueError("This option only be used with Stream.") + elif self.debug == 'contour': + try: + for img, table_bbox in self.debug_images: + for t in table_bbox.keys(): + cv2.rectangle(img, (t[0], t[1]), + (t[2], t[3]), (255, 0, 0), 3) + plt.imshow(img) + plt.show() + except AttributeError: + raise ValueError("This option only be used with Lattice.") + elif self.debug == 'joint': + try: + for img, table_bbox in self.debug_images: + x_coord = [] + y_coord = [] + for k in table_bbox.keys(): + for coord in table_bbox[k]: + x_coord.append(coord[0]) + y_coord.append(coord[1]) + max_x, max_y = max(x_coord), max(y_coord) + plt.plot(x_coord, y_coord, 'ro') + plt.axis([0, max_x + 100, max_y + 100, 0]) + plt.imshow(img) + plt.show() + except AttributeError: + raise ValueError("This option only be used with Lattice.") + elif self.debug == 'line': + try: + for v_s, h_s in self.debug_segments: + for v in v_s: + plt.plot([v[0], v[2]], [v[1], v[3]]) + for h in h_s: + plt.plot([h[0], 
h[2]], [h[1], h[3]]) + plt.show() + except AttributeError: + raise ValueError("This option only be used with Lattice.") + elif self.debug == 'table': + try: + for tables in self.debug_tables: + for table in tables: + for i in range(len(table.cells)): + for j in range(len(table.cells[i])): + if table.cells[i][j].left: + plt.plot([table.cells[i][j].lb[0], + table.cells[i][j].lt[0]], + [table.cells[i][j].lb[1], + table.cells[i][j].lt[1]]) + if table.cells[i][j].right: + plt.plot([table.cells[i][j].rb[0], + table.cells[i][j].rt[0]], + [table.cells[i][j].rb[1], + table.cells[i][j].rt[1]]) + if table.cells[i][j].top: + plt.plot([table.cells[i][j].lt[0], + table.cells[i][j].rt[0]], + [table.cells[i][j].lt[1], + table.cells[i][j].rt[1]]) + if table.cells[i][j].bottom: + plt.plot([table.cells[i][j].lb[0], + table.cells[i][j].rb[0]], + [table.cells[i][j].lb[1], + table.cells[i][j].rb[1]]) + plt.show() + except AttributeError: + raise ValueError("This option only be used with Lattice.") + else: + raise UserWarning("This method can only be called after" + " debug has been specified.") \ No newline at end of file diff --git a/camelot/stream.py b/camelot/stream.py index 7bb09ae..790dfb2 100644 --- a/camelot/stream.py +++ b/camelot/stream.py @@ -1,14 +1,26 @@ -from __future__ import print_function +from __future__ import division import os +import types +import copy_reg +import logging import numpy as np -from .utils import get_column_index, encode_list +from .table import Table +from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text __all__ = ['Stream'] +def _reduce_method(m): + if m.im_self is None: + return getattr, (m.im_class, m.im_func.func_name) + else: + return getattr, (m.im_self, m.im_func.func_name) +copy_reg.pickle(types.MethodType, _reduce_method) + + def _group_rows(text, ytol=2): """Groups text objects into rows using ytol. 
@@ -35,14 +47,16 @@ def _group_rows(text, ytol=2): # type(obj) is LTChar]): if t.get_text().strip(): if not np.isclose(row_y, t.y0, atol=ytol): - row_y = t.y0 - rows.append(temp) + rows.append(sorted(temp, key=lambda t: t.x0)) temp = [] + row_y = t.y0 temp.append(t) + rows.append(sorted(temp, key=lambda t: t.x0)) + __ = rows.pop(0) # hacky return rows -def _merge_columns(l): +def _merge_columns(l, mtol=2): """Merges overlapping columns and returns list with updated columns boundaries. @@ -62,7 +76,8 @@ def _merge_columns(l): merged.append(higher) else: lower = merged[-1] - if higher[0] <= lower[1]: + if (higher[0] <= lower[1] or + np.isclose(higher[0], lower[1], atol=mtol)): upper_bound = max(lower[1], higher[1]) lower_bound = min(lower[0], higher[0]) merged[-1] = (lower_bound, upper_bound) @@ -71,6 +86,62 @@ def _merge_columns(l): return merged +def _get_column_index(t, columns): + """Gets index of the column in which the given object falls by + comparing their co-ordinates. + + Parameters + ---------- + t : object + + columns : list + + Returns + ------- + c : int + """ + offset1, offset2 = 0, 0 + lt_col_overlap = [] + for c in columns: + if c[0] <= t.x1 and c[1] >= t.x0: + left = t.x0 if c[0] <= t.x0 else c[0] + right = t.x1 if c[1] >= t.x1 else c[1] + lt_col_overlap.append(abs(left - right) / abs(c[0] - c[1])) + else: + lt_col_overlap.append(-1) + if len(filter(lambda x: x != -1, lt_col_overlap)) == 0: + logging.warning("Text doesn't fit any column.") + c_idx = lt_col_overlap.index(max(lt_col_overlap)) + if t.x0 < columns[c_idx][0]: + offset1 = abs(t.x0 - columns[c_idx][0]) + if t.x1 > columns[c_idx][1]: + offset2 = abs(t.x1 - columns[c_idx][1]) + Y = abs(t.y0 - t.y1) + charea = abs(t.x0 - t.x1) * abs(t.y0 - t.y1) + error = (Y * (offset1 + offset2)) / charea + return c_idx, error + + +def _add_columns(cols, text, ytolerance): + if text: + text = _group_rows(text, ytol=ytolerance) + elements = [len(r) for r in text] + new_cols = [(t.x0, t.x1) + for r in text if 
len(r) == max(elements) for t in r] + cols.extend(_merge_columns(sorted(new_cols))) + return cols + + +def _join_columns(cols, width): + cols = sorted(cols) + cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))] + cols.insert(0, 0) + cols.append(width) # or some tolerance + cols = [(cols[i], cols[i + 1]) + for i in range(0, len(cols) - 1)] + return cols + + class Stream: """Stream algorithm @@ -105,20 +176,18 @@ class Stream: page as value. """ - def __init__(self, pdfobject, ncolumns=0, columns=None, ytol=2, - debug=False, verbose=False): + def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2, + pdf_margin=(2.0, 0.5, 0.1), debug=False): - self.pdfobject = pdfobject + self.method = 'stream' self.ncolumns = ncolumns self.columns = columns self.ytol = ytol + self.mtol = mtol + self.char_margin, self.line_margin, self.word_margin = pdf_margin self.debug = debug - self.verbose = verbose - self.tables = {} - if self.debug: - self.debug_text = {} - def get_tables(self): + def get_tables(self, pdfname): """Returns all tables found in given pdf. Returns @@ -127,86 +196,112 @@ class Stream: Dictionary with page number as key and list of tables on that page as value. """ - vprint = print if self.verbose else lambda *a, **k: None - self.pdfobject.split() - for page in self.pdfobject.extract(): - p, __, text, __, __ = page - pkey = 'pg-{0}'.format(p) - text.sort(key=lambda x: (-x.y0, x.x0)) - - if self.debug: - self.debug_text[pkey] = text - - rows = _group_rows(text, ytol=self.ytol) - elements = [len(r) for r in rows] - # a table can't have just 1 column, can it? 
- elements = filter(lambda x: x != 1, elements) - - guess = False - if self.columns: - cols = self.columns.split(',') - cols = [(float(cols[i]), float(cols[i + 1])) - for i in range(0, len(cols) - 1)] - else: - guess = True - ncols = self.ncolumns if self.ncolumns else max( - set(elements), key=elements.count) - if ncols == 0: - # no tables detected - continue - cols = [(t.x0, t.x1) - for r in rows for t in r if len(r) == ncols] - cols = _merge_columns(sorted(cols)) - cols = [(c[0] + c[1]) / 2.0 for c in cols] - - ar = [['' for c in cols] for r in rows] - for r_idx, r in enumerate(rows): - for t in r: - if guess: - cog = (t.x0 + t.x1) / 2.0 - diff = [abs(cog - c) for c in cols] - c_idx = diff.index(min(diff)) - else: - c_idx = get_column_index(t, cols) - if None in [r_idx, c_idx]: # couldn't assign LTTextLH to any cell - continue - if ar[r_idx][c_idx]: - ar[r_idx][c_idx] = ' '.join( - [ar[r_idx][c_idx], t.get_text().strip()]) - else: - ar[r_idx][c_idx] = t.get_text().strip() - vprint(pkey) - self.tables[pkey] = [encode_list(ar)] - - if self.pdfobject.clean: - self.pdfobject.remove_tempdir() + __, text, width, height = pdf_to_text(pdfname, self.char_margin, + self.line_margin, self.word_margin) + bname, __ = os.path.splitext(pdfname) + if not text: + logging.warning("{0}: PDF has no text. 
It may be an image.".format( + os.path.basename(bname))) + return None + text.sort(key=lambda x: (-x.y0, x.x0)) if self.debug: + self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text] return None - return self.tables + rows_grouped = _group_rows(text, ytol=self.ytol) + elements = [len(r) for r in rows_grouped] + row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r) + if len(r) > 0 else 0 for r in rows_grouped] + rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))] + rows.insert(0, height) # or some tolerance + rows.append(0) + rows = [(rows[i], rows[i + 1]) + for i in range(0, len(rows) - 1)] - def plot_text(self): - """Plots all text objects so user can choose number of columns - or columns x-coordinates using the matplotlib interface. - """ - import matplotlib.pyplot as plt - import matplotlib.patches as patches + guess = False + if self.columns: + # user has to input boundary columns too + # take (0, width) by default + # similar to else condition + # len can't be 1 + cols = self.columns.split(',') + cols = [(float(cols[i]), float(cols[i + 1])) + for i in range(0, len(cols) - 1)] + else: + if self.ncolumns: + ncols = self.ncolumns + cols = [(t.x0, t.x1) + for r in rows_grouped if len(r) == ncols for t in r] + cols = _merge_columns(sorted(cols), mtol=self.mtol) + if len(cols) != self.ncolumns: + logging.warning("{}: The number of columns after merge" + " isn't the same as what you specified." + " Change the value of mtol.".format( + os.path.basename(bname))) + cols = _join_columns(cols, width) + else: + guess = True + ncols = max(set(elements), key=elements.count) + len_non_mode = len(filter(lambda x: x != ncols, elements)) + if ncols == 1 and not self.debug: + # no tables detected + logging.warning("{}: Only one column was detected, the PDF" + " may have no tables. 
Specify ncols if" + " the PDF has tables.".format( + os.path.basename(bname))) + cols = [(t.x0, t.x1) + for r in rows_grouped if len(r) == ncols for t in r] + cols = _merge_columns(sorted(cols), mtol=self.mtol) + inner_text = [] + for i in range(1, len(cols)): + left = cols[i - 1][1] + right = cols[i][0] + inner_text.extend([t for t in text if t.x0 > left and t.x1 < right]) + outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] + inner_text.extend(outer_text) + cols = _add_columns(cols, inner_text, self.ytol) + cols = _join_columns(cols, width) - for pkey in sorted(self.debug_text.keys()): - fig = plt.figure() - ax = fig.add_subplot(111, aspect='equal') - xs, ys = [], [] - for t in self.debug_text[pkey]: - xs.extend([t.x0, t.x1]) - ys.extend([t.y0, t.y1]) - ax.add_patch( - patches.Rectangle( - (t.x0, t.y0), - t.x1 - t.x0, - t.y1 - t.y0 - ) - ) - ax.set_xlim(min(xs) - 10, max(xs) + 10) - ax.set_ylim(min(ys) - 10, max(ys) + 10) - plt.show() + pdf_page = {} + page_tables = {} + table_info = {} + table = Table(cols, rows) + rerror = [] + cerror = [] + for row in rows_grouped: + for t in row: + try: + r_idx, rass_error = get_row_index(t, rows) + except ValueError as e: + # couldn't assign LTTextLH to any cell + vprint(e.message) + continue + try: + c_idx, cass_error = _get_column_index(t, cols) + except ValueError as e: + # couldn't assign LTTextLH to any cell + vprint(e.message) + continue + rerror.append(rass_error) + cerror.append(cass_error) + table.cells[r_idx][c_idx].add_text( + t.get_text().strip('\n')) + if guess: + score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]]) + else: + score = get_score([[50, rerror], [50, cerror]]) + table_info['score'] = score + ar = table.get_list() + ar = encode_list(ar) + table_info['data'] = ar + empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar) + table_info['empty_p'] = empty_p + table_info['r_nempty_cells'] = r_nempty_cells + table_info['c_nempty_cells'] = 
c_nempty_cells + table_info['nrows'] = len(ar) + table_info['ncols'] = len(ar[0]) + page_tables['table_1'] = table_info + pdf_page[os.path.basename(bname)] = page_tables + + return pdf_page \ No newline at end of file diff --git a/camelot/table.py b/camelot/table.py index 2b7a126..5956f37 100644 --- a/camelot/table.py +++ b/camelot/table.py @@ -26,6 +26,7 @@ class Table: self.rows = rows self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in cols] for r in rows] + self.nocont_ = 0 def set_edges(self, vertical, horizontal, jtol=2): """Sets cell edges to True if corresponding line segments @@ -53,6 +54,7 @@ class Table: k = [k for k, t in enumerate(self.rows) if np.isclose(v[1], t[0], atol=jtol)] if not j: + self.nocont_ += 1 continue J = j[0] if i == [0]: # only left edge @@ -104,6 +106,7 @@ class Table: k = [k for k, t in enumerate(self.cols) if np.isclose(h[2], t[0], atol=jtol)] if not j: + self.nocont_ += 1 continue J = j[0] if i == [0]: # only top edge diff --git a/camelot/utils.py b/camelot/utils.py index 8ef0db0..29d82b4 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,5 +1,18 @@ +from __future__ import division +import os + import numpy as np +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfpage import PDFTextExtractionNotAllowed +from pdfminer.pdfinterp import PDFResourceManager +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.pdfdevice import PDFDevice +from pdfminer.converter import PDFPageAggregator +from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal + def translate(x1, x2): """Translates x2 by x1. 
@@ -243,15 +256,24 @@ def get_row_index(t, rows): ---------- t : object - rows : list + rows : list, sorted in decreasing order Returns ------- r : int """ + offset1, offset2 = 0, 0 for r in range(len(rows)): if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]: - return r + if t.y0 > rows[r][0]: + offset1 = abs(t.y0 - rows[r][0]) + if t.y1 < rows[r][1]: + offset2 = abs(t.y1 - rows[r][1]) + X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) + Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) + charea = X * Y + error = (X * (offset1 + offset2)) / charea + return r, error def get_column_index(t, columns): @@ -268,9 +290,45 @@ def get_column_index(t, columns): ------- c : int """ + offset1, offset2 = 0, 0 for c in range(len(columns)): if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]: - return c + if t.x0 < columns[c][0]: + offset1 = abs(t.x0 - columns[c][0]) + if t.x1 > columns[c][1]: + offset2 = abs(t.x1 - columns[c][1]) + X = 1.0 if abs(t.x0 - t.x1) == 0.0 else abs(t.x0 - t.x1) + Y = 1.0 if abs(t.y0 - t.y1) == 0.0 else abs(t.y0 - t.y1) + charea = X * Y + error = (Y * (offset1 + offset2)) / charea + return c, error + + +def get_score(error_weights): + """Calculates score based on weights assigned to various parameters, + and their error percentages. + + Parameters + ---------- + error_weights : list + List of [weight, errors] pairs: a weightage and the list of error + percentages it applies to. Sum of all weights should be equal + to 100. 
+ + Returns + ------- + score : float + """ + SCORE_VAL = 100 + score = 0 + if sum([ew[0] for ew in error_weights]) != SCORE_VAL: + raise ValueError("Please assign a valid weightage to each parameter" + " such that their sum is equal to 100") + for ew in error_weights: + weight = ew[0] / len(ew[1]) + for error_percentage in ew[1]: + score += weight * (1 - error_percentage) + return score def reduce_index(t, rotated, r_idx, c_idx): @@ -394,6 +452,110 @@ def remove_empty(d): return d +def count_empty(d): + """Counts empty and non-empty cells in a list of lists. + + Parameters + ---------- + d : list + + Returns + ------- + empty_p : percentage of empty cells + r_nempty_cells : number of non-empty cells in each row + c_nempty_cells : number of non-empty cells in each column + """ + empty_p = 0 + r_nempty_cells, c_nempty_cells = [], [] + for i in d: + for j in i: + if j.strip() == '': + empty_p += 1 + empty_p = 100 * (empty_p / float(len(d) * len(d[0]))) + for row in d: + r_nempty_c = 0 + for r in row: + if r.strip() != '': + r_nempty_c += 1 + r_nempty_cells.append(r_nempty_c) + d = zip(*d) + d = [list(col) for col in d] + for col in d: + c_nempty_c = 0 + for c in col: + if c.strip() != '': + c_nempty_c += 1 + c_nempty_cells.append(c_nempty_c) + return empty_p, r_nempty_cells, c_nempty_cells + + def encode_list(ar): + """Encodes list of text. + + Parameters + ---------- + ar : list + + Returns + ------- + ar : list + """ ar = [[r.encode('utf-8') for r in row] for row in ar] return ar + + +def extract_text_objects(layout, LTObject, t=None): + """Recursively parses pdf layout to get a list of + text objects. + + Parameters + ---------- + layout : object + Layout object. + + LTObject : object + Text object, either LTChar or LTTextLineHorizontal. + + t : list (optional, default: None) + + Returns + ------- + t : list + List of text objects. 
+ """ + if t is None: + t = [] + try: + for obj in layout._objs: + if isinstance(obj, LTObject): + t.append(obj) + else: + t += extract_text_objects(obj, LTObject) + except AttributeError: + pass + return t + + +def pdf_to_text(pname, char_margin, line_margin, word_margin): + # pkey = 'page-{0}'.format(p) + # pname = os.path.join(self.temp, '{}.pdf'.format(pkey)) + with open(pname, 'r') as f: + parser = PDFParser(f) + document = PDFDocument(parser) + if not document.is_extractable: + raise PDFTextExtractionNotAllowed + laparams = LAParams(char_margin=char_margin, + line_margin=line_margin, + word_margin=word_margin) + rsrcmgr = PDFResourceManager() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + for page in PDFPage.create_pages(document): + interpreter.process_page(page) + layout = device.get_result() + lattice_objects = extract_text_objects(layout, LTChar) + stream_objects = extract_text_objects( + layout, LTTextLineHorizontal) + width = layout.bbox[2] + height = layout.bbox[3] + return lattice_objects, stream_objects, width, height \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index d22bba9..3e246a7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -39,7 +39,7 @@ Usage >>> extractor = Lattice(Pdf('us-030.pdf')) >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. csv-table:: :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" diff --git a/docs/lattice.rst b/docs/lattice.rst index b4a0fda..fe170f7 100644 --- a/docs/lattice.rst +++ b/docs/lattice.rst @@ -65,7 +65,7 @@ Finally, the characters found on the page are assigned to cells based on their x >>> extractor = Lattice(Pdf('us-030.pdf')) >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. 
csv-table:: :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","","" @@ -114,7 +114,7 @@ In the PDF used above, you can see that some cells spanned a lot of rows, `fill` >>> extractor = Lattice(Pdf('row_span_1.pdf'), fill='v', scale=40) >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. csv-table:: :header: "Plan Type","County","Plan Name","Totals" @@ -173,7 +173,7 @@ To find line segments, Lattice needs the lines of the PDF to be in foreground. S >>> extractor = Lattice(Pdf('lines_in_background_1.pdf'), invert=True) >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. csv-table:: :header: "State","Date","Halt stations","Halt days","Persons directly reached(in lakh)","Persons trained","Persons counseled","Persons testedfor HIV" diff --git a/docs/stream.rst b/docs/stream.rst index 31adde2..6592775 100644 --- a/docs/stream.rst +++ b/docs/stream.rst @@ -17,7 +17,7 @@ Let's run it on this PDF. >>> extractor = Stream(Pdf('eu-027.pdf')) >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. .. _this: insert link for eu-027.pdf @@ -68,7 +68,7 @@ But sometimes its guess could be incorrect, like in this case. >>> extractor = Stream(Pdf('missing_values.pdf')) >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. .. _this: insert link for missing_values.pdf @@ -127,7 +127,7 @@ It guessed that the PDF has 3 columns, because there wasn't any data in the last >>> extractor = Stream(Pdf('missing_values.pdf'), ncolumns=5) >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. csv-table:: @@ -200,7 +200,7 @@ After getting the x-coordinates, we just need to pass them to Stream, like this. 
>>> extractor = Stream(Pdf('mexican_towns.pdf'), columns='28,67,180,230,425,475,700') >>> tables = extractor.get_tables() - >>> print tables['pg-1'] + >>> print tables['page-1'][0] .. csv-table:: diff --git a/tests/test_lattice.py b/tests/test_lattice.py index 244c937..55e3086 100644 --- a/tests/test_lattice.py +++ b/tests/test_lattice.py @@ -26,7 +26,7 @@ def test_lattice_basic(): extractor = Lattice(Pdf(pdfname, pagenos=[{'start': 2, 'end': 2}], clean=True)) tables = extractor.get_tables() - assert_equal(tables['pg-2'][0], data) + assert_equal(tables['page-2'][0], data) def test_lattice_fill(): @@ -76,7 +76,7 @@ def test_lattice_fill(): pdfname = os.path.join(testdir, 'row_span_1.pdf') extractor = Lattice(Pdf(pdfname, clean=True), fill='v', scale=40) tables = extractor.get_tables() - assert_equal(tables['pg-1'][0], data) + assert_equal(tables['page-1'][0], data) def test_lattice_invert(): @@ -94,4 +94,4 @@ def test_lattice_invert(): pdfname = os.path.join(testdir, 'lines_in_background_1.pdf') extractor = Lattice(Pdf(pdfname, clean=True), invert=True) tables = extractor.get_tables() - assert_equal(tables['pg-1'][1], data) \ No newline at end of file + assert_equal(tables['page-1'][1], data) \ No newline at end of file diff --git a/tests/test_stream.py b/tests/test_stream.py index 34d45bf..2a3d05e 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -13,57 +13,62 @@ testdir = os.path.dirname(os.path.abspath(__file__)) def test_stream_basic(): data = [ - ["","","","",""], - ["C Appendix C: Summary Statistics","","","",""], - ["","Table C1: Summary Statistics","","",""], - ["","This table contains summary statistics for 2,012 respondents in SAVE 2009.","","",""], - ["Variable","Mean","Std. Dev. 
Min","","Max"], - ["Age","50.8","15.9","21","90"], - ["Men","0.47","0.50","0","1"], - ["East","0.28","0.45","0","1"], - ["Rural","0.15","0.36","0","1"], - ["Married","0.57","0.50","0","1"], - ["Single","0.21","0.40","0","1"], - ["Divorced","0.13","0.33","0","1"], - ["Widowed","0.08","0.26","0","1"], - ["Separated","0.03","0.16","0","1"], - ["Partner","0.65","0.48","0","1"], - ["Employed","0.55","0.50","0","1"], - ["Fulltime","0.34","0.47","0","1"], - ["Parttime","0.20","0.40","0","1"], - ["Unemployed","0.08","0.28","0","1"], - ["Homemaker","0.19","0.40","0","1"], - ["Retired","0.28","0.45","0","1"], - ["Household size","2.43","1.22","1","9"], - ["Households with children","0.37","0.48","0","1"], - ["Number of children","1.67","1.38","0","8"], - ["Lower secondary education","0.08","0.27","0","1"], - ["Upper secondary education","0.60","0.49","0","1"], - ["Post secondary, non tert. education","0.12","0.33","0","1"], - ["First stage tertiary education","0.17","0.38","0","1"], - ["Other education","0.03","0.17","0","1"], - ["Household income (Euro/month)","2,127","1,389","22","22,500"], - ["Gross wealth - end of 2007 (Euro)","187,281","384,198","0","7,720,000"], - ["Gross financial wealth - end of 2007 (Euro)","38,855","114,128","0","2,870,000"], - ["","Source: SAVE 2008 and 2009, data is weighted and imputed.","","",""], - ["","","","","ECB"], - ["","","","","Working Paper Series No 1299"], - ["","","","","Febuary 2011"] + ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], + ["Entidad","","Municipio","","Localidad",""], + ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], + ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], + ["01","Aguascalientes","001","Aguascalientes","0100","Rancho Alegre"], + ["01","Aguascalientes","001","Aguascalientes","0102","Los Arbolitos [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0104","Ardillas de Abajo (Las Ardillas)"], + 
["01","Aguascalientes","001","Aguascalientes","0106","Arellano"], + ["01","Aguascalientes","001","Aguascalientes","0112","Bajío los Vázquez"], + ["01","Aguascalientes","001","Aguascalientes","0113","Bajío de Montoro"], + ["01","Aguascalientes","001","Aguascalientes","0114","Residencial San Nicolás [Baños la Cantera]"], + ["01","Aguascalientes","001","Aguascalientes","0120","Buenavista de Peñuelas"], + ["01","Aguascalientes","001","Aguascalientes","0121","Cabecita 3 Marías (Rancho Nuevo)"], + ["01","Aguascalientes","001","Aguascalientes","0125","Cañada Grande de Cotorina"], + ["01","Aguascalientes","001","Aguascalientes","0126","Cañada Honda [Estación]"], + ["01","Aguascalientes","001","Aguascalientes","0127","Los Caños"], + ["01","Aguascalientes","001","Aguascalientes","0128","El Cariñán"], + ["01","Aguascalientes","001","Aguascalientes","0129","El Carmen [Granja]"], + ["01","Aguascalientes","001","Aguascalientes","0135","El Cedazo (Cedazo de San Antonio)"], + ["01","Aguascalientes","001","Aguascalientes","0138","Centro de Arriba (El Taray)"], + ["01","Aguascalientes","001","Aguascalientes","0139","Cieneguilla (La Lumbrera)"], + ["01","Aguascalientes","001","Aguascalientes","0141","Cobos"], + ["01","Aguascalientes","001","Aguascalientes","0144","El Colorado (El Soyatal)"], + ["01","Aguascalientes","001","Aguascalientes","0146","El Conejal"], + ["01","Aguascalientes","001","Aguascalientes","0157","Cotorina de Abajo"], + ["01","Aguascalientes","001","Aguascalientes","0162","Coyotes"], + ["01","Aguascalientes","001","Aguascalientes","0166","La Huerta (La Cruz)"], + ["01","Aguascalientes","001","Aguascalientes","0170","Cuauhtémoc (Las Palomas)"], + ["01","Aguascalientes","001","Aguascalientes","0171","Los Cuervos (Los Ojos de Agua)"], + ["01","Aguascalientes","001","Aguascalientes","0172","San José [Granja]"], + ["01","Aguascalientes","001","Aguascalientes","0176","La Chiripa"], + ["01","Aguascalientes","001","Aguascalientes","0182","Dolores"], + 
["01","Aguascalientes","001","Aguascalientes","0183","Los Dolores"], + ["01","Aguascalientes","001","Aguascalientes","0190","El Duraznillo"], + ["01","Aguascalientes","001","Aguascalientes","0191","Los Durón"], + ["01","Aguascalientes","001","Aguascalientes","0197","La Escondida"], + ["01","Aguascalientes","001","Aguascalientes","0201","Brande Vin [Bodegas]"], + ["01","Aguascalientes","001","Aguascalientes","0207","Valle Redondo"], + ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], + ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], + ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], + ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], + ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], ] - pdfname = os.path.join(testdir, - "tabula_test_pdfs/icdar2013-dataset/competition-dataset-eu/eu-027.pdf") - extractor = Stream(Pdf(pdfname, pagenos=[{'start': 3, 'end': 3}], + pdfname = os.path.join(testdir, 'mexican_towns.pdf') + extractor = Stream(Pdf(pdfname, pagenos=[{'start': 1, 'end': 1}], clean=True)) tables = extractor.get_tables() - assert_equal(tables['pg-3'][0], data) + assert_equal(tables['page-1'][0], data) def test_stream_ncolumns(): data = [ - ["","","","",""], - ["","Bhandara - Key Indicators","","",""], + ["Bhandara - Key Indicators","","","",""], ["","DLHS-4 (2012-13)","","DLHS-3 (2007-08)",""], ["Indicators","TOTAL","RURAL","TOTAL","RURAL"], ["Reported Prevalence of Morbidity","","","",""], @@ -105,21 +110,20 @@ def test_stream_ncolumns(): ["Moderately High (Systolic >160 mm of Hg & Diastolic >100 mm of Hg ) .....................................","8.2","7.1","",""], ["Very High (Systolic >180 mm of Hg & Diastolic >110 mm of Hg ) ...............................................","3.7","3.1","",""], ["14 Any anaemia below 11g/dl, severe anaemia below 7g/dl. 
15 Excluding age group 19 years","","","",""], - ["","Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","",""] + ["Chronic Illness :Any person with symptoms persisting for longer than one month is defined as suffering from chronic illness","","","",""], + ["","4","","",""] ] pdfname = os.path.join(testdir, 'missing_values.pdf') extractor = Stream(Pdf(pdfname, char_margin=1.0, clean=True), ncolumns=5) tables = extractor.get_tables() - assert_equal(tables['pg-1'][0], data) + assert_equal(tables['page-1'][0], data) def test_stream_columns(): data = [ - ["","","","","",""], - ["Clave","","Clave","","Clave",""], - ["","Nombre Entidad","","Nombre Municipio","","Nombre Localidad"], + ["Clave","Nombre Entidad","Clave","Nombre Municipio","Clave","Nombre Localidad"], ["Entidad","","Municipio","","Localidad",""], ["01","Aguascalientes","001","Aguascalientes","0094","Granja Adelita"], ["01","Aguascalientes","001","Aguascalientes","0096","Agua Azul"], @@ -160,10 +164,11 @@ def test_stream_columns(): ["01","Aguascalientes","001","Aguascalientes","0209","La Fortuna"], ["01","Aguascalientes","001","Aguascalientes","0212","Lomas del Gachupín"], ["01","Aguascalientes","001","Aguascalientes","0213","El Carmen (Gallinas Güeras) [Rancho]"], - ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"] + ["01","Aguascalientes","001","Aguascalientes","0216","La Gloria"], + ["01","Aguascalientes","001","Aguascalientes","0226","Hacienda Nueva"], ] pdfname = os.path.join(testdir, 'mexican_towns.pdf') extractor = Stream(Pdf(pdfname, clean=True), columns='28,67,180,230,425,475,700') tables = extractor.get_tables() - assert_equal(tables['pg-1'][0], data) \ No newline at end of file + assert_equal(tables['page-1'][0], data) \ No newline at end of file diff --git a/tools/camelot b/tools/camelot index ffed316..dd1c87b 100755 --- a/tools/camelot +++ b/tools/camelot @@ -4,8 +4,12 @@ import os import sys import time 
import logging +import warnings +import numpy as np from docopt import docopt +from collections import Counter +import matplotlib.pyplot as plt from PyPDF2 import PdfFileReader from camelot.pdf import Pdf @@ -22,12 +26,23 @@ usage: options: -h, --help Show this screen. -v, --version Show version. + -V, --verbose Verbose. -p, --pages Comma-separated list of page numbers. Example: -p 1,3-6,10 [default: 1] + -P, --parallel Parallelize the parsing process. -f, --format Output format. (csv,tsv,html,json,xlsx) [default: csv] - -l, --log Print log to file. - -V, --verbose Verbose. + -l, --log Log to file. -o, --output Output directory. + -M, --cmargin Char margin. Chars closer than cmargin are + grouped together to form a word. [default: 2.0] + -L, --lmargin Line margin. Lines closer than lmargin are + grouped together to form a textbox. [default: 0.5] + -W, --wmargin Word margin. Insert blank spaces between chars + if distance between words is greater than word + margin. [default: 0.1] + -S, --save-info Save parsing info for each page to a file. + -X, --plot Plot distributions. (page,all,rc) + -Z, --summary Summarize metrics. camelot methods: lattice Looks for lines between data. @@ -47,12 +62,12 @@ options: cells. Example: -F h, -F v, -F hv -s, --scale Scaling factor. Large scaling factor leads to smaller lines being detected. [default: 15] + -i, --invert Invert pdf image to make sure that lines are + in foreground. -j, --jtol Tolerance to account for when comparing joint and line coordinates. [default: 2] -m, --mtol Tolerance to account for when merging lines which are very close. [default: 2] - -i, --invert Invert pdf image to make sure that lines are - in foreground. -d, --debug Debug by visualizing pdf geometry. (contour,line,joint,table) Example: -d table """ @@ -69,17 +84,159 @@ options: Example: -c 10.1,20.2,30.3 -y, --ytol Tolerance to account for when grouping rows together. [default: 2] - -M, --cmargin Char margin. 
Chars closer than cmargin are - grouped together to form a word. [default: 2.0] - -L, --lmargin Line margin. Lines closer than lmargin are - grouped together to form a textbox. [default: 0.5] - -W, --wmargin Word margin. Insert blank spaces between chars - if distance between words is greater than word - margin. [default: 0.1] + -m, --mtol Tolerance to account for when merging columns + together. [default: 2] -d, --debug Debug by visualizing textboxes. """ +def plot_table_barchart(r, c, p, pno, tno): + row_idx = [i + 1 for i, row in enumerate(r)] + col_idx = [i + 1 for i, col in enumerate(c)] + r_index = np.arange(len(r)) + c_index = np.arange(len(c)) + width = 0.7 + + plt.figure(figsize=(8, 6)) + plt.subplot(2, 1, 1) + plt.title('Percentage of empty cells in table: {0:.2f}'.format(p)) + plt.xlabel('row index') + plt.ylabel('number of non-empty cells in row') + plt.bar(r_index, r) + plt.xticks(r_index + width * 0.5, row_idx) + plt.ylim(0, len(c)) + + plt.subplot(2, 1, 2) + plt.xlabel('column index') + plt.ylabel('number of non-empty cells in column') + plt.bar(c_index, c) + plt.xticks(c_index + width * 0.5, col_idx) + plt.ylim(0, len(r)) + plt.savefig(''.join([pno, '_', tno, '.png']), dpi=300) + + +def plot_all_barchart(data, output): + r_empty_cells = [] + for page_number in data.keys(): + page = data[page_number] + for table_number in page.keys(): + table = page[table_number] + r_empty_cells.extend([r / float(table['ncols']) for r in table['r_nempty_cells']]) + c = Counter(r_empty_cells) + if 0.0 not in c: + c.update({0.0: 0}) + if 1.0 not in c: + c.update({1.0: 0}) + + plt.figure(figsize=(8, 6)) + plt.xlabel('percentage of non-empty cells in a row') + plt.ylabel('percentage of rows processed') + row_p = [count / float(sum(c.values())) for count in c.values()] + plt.bar(c.keys(), row_p, align='center', width=0.05) + plt.ylim(0, 1.0) + plt.savefig(''.join([output, '_all.png']), dpi=300) + + +def plot_rc_piechart(data, output): + from matplotlib import cm + + 
tables = 0 + rows, cols = [], [] + for page_number in data.keys(): + page = data[page_number] + for table_number in page.keys(): + table = page[table_number] + tables += 1 + rows.append(table['nrows']) + cols.append(table['ncols']) + + r = Counter(rows) + c = Counter(cols) + + plt.figure(figsize=(8, 6)) + cs1 = cm.Set1(np.arange(len(r)) / float(len(r))) + ax1 = plt.subplot(211, aspect='equal') + ax1.pie(r.values(), colors=cs1, labels=r.keys(), startangle=90) + ax1.set_title('row distribution across tables') + + cs2 = cm.Set1(np.arange(len(c)) / float(len(c))) + ax2 = plt.subplot(212, aspect='equal') + ax2.pie(c.values(), colors=cs2, labels=c.keys(), startangle=90) + ax2.set_title('column distribution across tables') + plt.savefig(''.join([output, '_rc.png']), dpi=300) + + +def summary(data, p_time): + from operator import itemgetter + from itertools import groupby + + scores = [] + continuous_tables = [] + total_tables = 0 + for page_number in data.keys(): + page = data[page_number] + total_tables += len(page.keys()) + for table_number in page.keys(): + table = page[table_number] + continuous_tables.append((page_number, table_number, table['ncols'])) + scores.append(table['score']) + avg_score = np.mean(scores) + + ct_pages = [] + header_string = "" + if len(continuous_tables) > 1: + tables = sorted(continuous_tables, key=lambda x: (int(x[0][5:]), int(x[1][6:]))) + for k, g in groupby(tables, key=itemgetter(2)): + g = list(g) + tables_same_ncols = set([int(t[0][5:]) for t in g]) + tables_same_ncols = sorted(list(tables_same_ncols)) + for K, G in groupby(enumerate(tables_same_ncols), key=lambda (i, x): i - x): + G = list(G) + ct_pages.append((str(G[0][1]), str(G[-1][1]))) + + result_headers = [] + for ct in ct_pages: + header_idx = {} + possible_headers = [] + ncols = 0 + for page_number in range(int(ct[0]), int(ct[1]) + 1): + page = data['page-{0}'.format(page_number)] + for table_number in page.keys(): + table = page[table_number] + ncols = table['ncols'] + for i, 
row in enumerate(table['data']): + try: + header_idx[tuple(row)].append(i) + except KeyError: + header_idx[tuple(row)] = [i] + possible_headers = sorted(header_idx, key=lambda k: len(header_idx[k]), reverse=True)[:10] + possible_headers = filter(lambda z: len(z) == ncols, + [filter(lambda x: x != '', p_h) for p_h in possible_headers]) + modes = [] + for p_h in possible_headers: + try: + modes.append((p_h, max(set(header_idx[p_h]), key=header_idx[p_h].count))) + except KeyError: + pass + header = modes[modes.index(min(modes, key=lambda x: x[1]))][0] + result_headers.append(header) + + header_string = "Multi-page table headers*:\n" + header_string = ''.join([header_string, '\n'.join(['pages {0} -> {1}{2}{3}'.format( + '-'.join([cr[0][0], cr[0][1]]), '"', '","'.join(cr[1]), '"') for cr in zip( + ct_pages, result_headers)])]) + + avg_time = "Time taken per page: {0:.2f} seconds\n".format( + p_time / float(len(data))) if len(data) != 1 else "" + equal_ncols = "\nMulti-page tables on*: {0}\n".format( + ', '.join(['-'.join(ct) for ct in ct_pages])) if len(data) != 1 else "" + stats = [len(data), p_time, avg_time, total_tables, avg_score, equal_ncols] + stat_string = ("Pages processed: {0}\nTime taken: {1:.2f} seconds\n" + "{2}Tables found: {3}\nAverage score: {4:.2f}{5}".format(*stats)) + + print(''.join([stat_string, header_string])) + + def convert_to_html(table): html = '' html = ''.join([html, '\n']) @@ -99,23 +256,23 @@ def write_to_disk(data, f='csv', output=None, filename=None): if f in ['csv', 'tsv']: import csv delimiter = ',' if f == 'csv' else '\t' - for page in sorted(data): - for table in range(len(data[page])): - dsvname = '{0}_table_{1}.{2}'.format(page, table + 1, f) + for page_number in sorted(data.keys()): + for table_number in sorted(data[page_number].keys()): + dsvname = '{0}.{1}'.format(''.join([page_number, '_', table_number]), f) with open(os.path.join(output, dsvname), 'w') as outfile: writer = csv.writer( outfile, delimiter=delimiter, 
quoting=csv.QUOTE_ALL) - for row in data[page][table]: + for row in data[page_number][table_number]['data']: writer.writerow(row) elif f == 'html': - htmlname = '{}.html'.format(froot) - for page in sorted(data): - for table in range(len(data[page])): + htmlname = '{0}.html'.format(froot) + for page_number in sorted(data.keys()): + for table_number in sorted(data[page_number].keys()): with open(os.path.join(output, htmlname), 'a') as htmlfile: - htmlfile.write(convert_to_html(data[page][table])) + htmlfile.write(convert_to_html(data[page_number][table_number]['data'])) elif f == 'json': import json - with open(os.path.join(output, '{}.json'.format(froot)), 'w') \ + with open(os.path.join(output, '{0}.json'.format(froot)), 'w') \ as jsonfile: json.dump(data, jsonfile) elif f == 'xlsx': @@ -123,12 +280,12 @@ def write_to_disk(data, f='csv', output=None, filename=None): from pyexcel_xlsx import save_data from collections import OrderedDict xlsx_data = OrderedDict() - for page in sorted(data): - for table in range(len(data[page])): - sheet_name = '{0}_table_{1}'.format(page, table + 1) + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + for table_number in sorted(data[page_number].keys(), key=lambda x: int(x[6:])): + sheet_name = ''.join([page_number, '_', table_number]) xlsx_data.update({sheet_name: - [row for row in data[page][table]]}) - save_data(os.path.join(output, '{}.xlsx'.format(froot)), xlsx_data) + [row for row in data[page_number][table_number]['data']]}) + save_data(os.path.join(output, '{0}.xlsx'.format(froot)), xlsx_data) except ImportError: print("link to install docs") @@ -147,16 +304,17 @@ if __name__ == '__main__': filename = args[''] filedir = os.path.dirname(args['']) logname, __ = os.path.splitext(filename) - logname += '.log' + logname = ''.join([logname, '.log']) + scorename, __ = os.path.splitext(filename) + scorename = ''.join([scorename, '_info.csv']) + pngname, __ = os.path.splitext(filename) if args['--log']: + FORMAT = 
'%(asctime)s - %(levelname)s - %(message)s' if args['--output']: logname = os.path.join(args['--output'], os.path.basename(logname)) - logging.basicConfig( - filename=logname, filemode='w', level=logging.DEBUG) - else: - logging.basicConfig( - filename=logname, filemode='w', level=logging.DEBUG) + logging.basicConfig( + filename=logname, filemode='w', format=FORMAT, level=logging.DEBUG) p = [] if args['--pages'] == '1': @@ -173,47 +331,142 @@ if __name__ == '__main__': else: p.append({'start': int(r), 'end': int(r)}) + margin_tuple = (float(args['--cmargin']), float(args['--lmargin']), + float(args['--wmargin'])) if args[''] == 'lattice': try: - extractor = Lattice(Pdf(filename, pagenos=p, clean=True), - fill=args['--fill'], - scale=int(args['--scale']), - jtol=int(args['--jtol']), - mtol=int(args['--mtol']), - invert=args['--invert'], - debug=args['--debug'], - verbose=args['--verbose']) - data = extractor.get_tables() + manager = Pdf(Lattice( + fill=args['--fill'], + scale=int(args['--scale']), + invert=args['--invert'], + jtol=int(args['--jtol']), + mtol=int(args['--mtol']), + pdf_margin=margin_tuple, + debug=args['--debug']), + filename, + pagenos=p, + parallel=args['--parallel'], + clean=True) + data = manager.extract() + + processing_time = time.time() - start_time + vprint("Finished processing in", processing_time, "seconds") + logging.info("Finished processing in " + str(processing_time) + " seconds") + + if args['--plot']: + if args['--output']: + pngname = os.path.join(args['--output'], os.path.basename(pngname)) + plot_type = args['--plot'].split(',') + if 'page' in plot_type: + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + plot_table_barchart(table['r_nempty_cells'], + table['c_nempty_cells'], + table['empty_p'], + page_number, + table_number) + + if 'all' in plot_type: + plot_all_barchart(data, pngname) + + 
if 'rc' in plot_type: + plot_rc_piechart(data, pngname) + + if args['--summary']: + summary(data, processing_time) + + if args['--save-info']: + if args['--output']: + scorename = os.path.join(args['--output'], os.path.basename(scorename)) + with open(scorename, 'w') as score_file: + score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n') + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( + ''.join([page_number, '_', table_number]), + table['nrows'], + table['ncols'], + table['empty_p'], + table['line_p'], + table['text_p'], + table['score'])) if args['--debug']: - extractor.plot_geometry(args['--debug']) + manager.debug_plot() except Exception as e: logging.exception(e.message, exc_info=True) sys.exit() elif args[''] == 'stream': try: - extractor = Stream(Pdf(filename, pagenos=p, - char_margin=float(args['--cmargin']), - line_margin=float(args['--lmargin']), - word_margin=float(args['--wmargin']), - clean=True), - ncolumns=int(args['--ncols']), - columns=args['--columns'], - ytol=int(args['--ytol']), - debug=args['--debug'], - verbose=args['--verbose']) - data = extractor.get_tables() + manager = Pdf(Stream( + ncolumns=int(args['--ncols']), + columns=args['--columns'], + ytol=int(args['--ytol']), + mtol=int(args['--mtol']), + pdf_margin=margin_tuple, + debug=args['--debug']), + filename, + pagenos=p, + parallel=args['--parallel'], + clean=True) + data = manager.extract() + + processing_time = time.time() - start_time + vprint("Finished processing in", processing_time, "seconds") + logging.info("Finished processing in " + str(processing_time) + " seconds") + + if args['--plot']: + if args['--output']: + pngname = os.path.join(args['--output'], os.path.basename(pngname)) + plot_type = args['--plot'].split(',') + if 'page' in plot_type: + for page_number in 
sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + plot_table_barchart(table['r_nempty_cells'], + table['c_nempty_cells'], + table['empty_p'], + page_number, + table_number) + + if 'all' in plot_type: + plot_all_barchart(data, pngname) + + if 'rc' in plot_type: + plot_rc_piechart(data, pngname) + + if args['--summary']: + summary(data, processing_time) + + if args['--save-info']: + if args['--output']: + scorename = os.path.join(args['--output'], os.path.basename(scorename)) + with open(scorename, 'w') as score_file: + score_file.write('table,nrows,ncols,empty_p,,score\n') + for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): + page = data[page_number] + for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): + table = page[table_number] + score_file.write('{0},{1},{2},{3},{4}\n'.format( + ''.join([page_number, '_', table_number]), + table['nrows'], + table['ncols'], + table['empty_p'], + table['score'])) + if args['--debug']: - extractor.plot_text() + manager.debug_plot() except Exception as e: logging.exception(e.message, exc_info=True) sys.exit() - if data is None: + if args['--debug']: print("See 'camelot -h' for various parameters you can tweak.") else: output = filedir if args['--output'] is None else args['--output'] write_to_disk(data, f=args['--format'], output=output, filename=filename) - - vprint("finished in", time.time() - start_time, "seconds") - logging.info("Time taken: " + str(time.time() - start_time) + " seconds")