diff --git a/.gitignore b/.gitignore index fefd514..e5bdc6b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ __pycache__/ *.py[cod] -.camelot/ +*.so + +.camelot/ \ No newline at end of file diff --git a/README.md b/README.md index 1a59793..e6712e7 100644 --- a/README.md +++ b/README.md @@ -14,63 +14,24 @@ camelot also uses poppler-utils, more specifically `pdfseparate` to separate a p ## Usage -python2 camelot.py [options] file +
+camelot.py [options] <method> [<args>...]
 
-positional arguments:
+options:
+ -h, --help                Show this screen.
+ -v, --version             Show version.
+ -p, --pages <pageno>      Comma-separated list of page numbers.
+                           Example: -p 1,3-6,10  [default: 1]
+ -f, --format <format>     Output format. (csv,xlsx) [default: csv]
+ -l, --log                 Print log to file.
+ -o, --output <directory>  Output directory.
 
-  file
+camelot methods:
+ lattice  Looks for lines between data.
+ stream   Looks for spaces between data.
 
-optional arguments:
-
-  -h, --help
-
-    show this help message and exit
-
-  -p, --pages PAGES [PAGES ...]
-
-    Specify the page numbers and/or page ranges to be
-    parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-
-  -f, --format FORMAT
-
-    Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-
-  -m, --spreadsheet
-
-    Extract tables with ruling lines. (default: False)
-
-  -F, --fill FILL
-
-    Fill the values in empty cells horizontally(h) and/or
-    vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-
-  -s, --scale [SCALE]
-
-    Scaling factor. Large scaling factor leads to smaller
-    lines being detected. (default: 15)
-
-  -j, --jtol [JTOL]
-
-    Tolerance to account for when comparing joint and line
-    coordinates. (default: 2)
-
-  -M, --mtol [MTOL]
-
-    Tolerance to account for when merging lines which are
-    very close. (default: 2)
-
-  -i, --invert
-
-    Make sure lines are in foreground. (default: False)
-
-  -d, --debug DEBUG
-
-    Debug by visualizing contours, lines, joints, tables.
-    Example: --debug="contours"
-
-  -o, --output OUTPUT
-
-    Specify output directory.
+See 'camelot <method> -h' for more information on a specific method.
+
## Development diff --git a/basic.py b/basic.py deleted file mode 100644 index e2ff777..0000000 --- a/basic.py +++ /dev/null @@ -1,80 +0,0 @@ -import os -import csv -import numpy as np - -from pdf import get_pdf_info - - -def overlap(l): - merged = [] - for higher in l: - if not merged: - merged.append(higher) - else: - lower = merged[-1] - if higher[0] <= lower[1]: - upper_bound = max(lower[1], higher[1]) - lower_bound = min(lower[0], higher[0]) - merged[-1] = (lower_bound, upper_bound) - else: - merged.append(higher) - return merged - - -def get_row_idx(t, rows): - for r in range(len(rows)): - if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]: - return r - - -def get_column_idx(t, columns): - for c in range(len(columns)): - if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]: - return c - - -def basic(pdf_dir, filename, char_margin, line_margin, word_margin): - print "working on", filename - text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic', - char_margin, line_margin, word_margin) - text.sort(key=lambda x: (-x.y0, x.x0)) - y_last = 0 - data = [] - temp = [] - elements = [] - for t in text: - # is checking for upright necessary? - # if t.get_text().strip() and all([obj.upright for obj in t._objs if - # type(obj) is LTChar]): - if t.get_text().strip(): - if not np.isclose(y_last, t.y0, atol=2): - y_last = t.y0 - elements.append(len(temp)) - data.append(temp) - temp = [] - temp.append(t) - # a table can't have just 1 column, can it? 
- elements = filter(lambda x: x != 1, elements) - # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count) - mode = max(set(elements), key=elements.count) - columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode] - columns = overlap(sorted(columns)) - columns = [(c[0] + c[1]) / 2.0 for c in columns] - - output = [['' for c in columns] for d in data] - for row, d in enumerate(data): - for t in d: - cog = (t.x0 + t.x1) / 2.0 - diff = [(i, abs(cog - c)) for i, c in enumerate(columns)] - idx = min(diff, key=lambda x: x[1]) - if output[row][idx[0]]: - output[row][idx[0]] += ' ' + t.get_text().strip() - else: - output[row][idx[0]] = t.get_text().strip() - - csvname = filename.split('.')[0] + '.csv' - csvpath = os.path.join(pdf_dir, csvname) - with open(csvpath, 'w') as outfile: - writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) - for row in output: - writer.writerow([cell.encode('utf-8') for cell in row]) diff --git a/camelot.py b/camelot.py old mode 100644 new mode 100755 index 7efd914..44e9dee --- a/camelot.py +++ b/camelot.py @@ -1,136 +1,258 @@ +#!/usr/bin/env python2 import os import re +import csv +import sys import glob import time import shutil import logging +import zipfile +import tempfile import subprocess -import argparse +from docopt import docopt +from werkzeug.utils import secure_filename -from basic import basic -from spreadsheet import spreadsheet +from lattice import lattice +from stream import stream + + +doc = """ +camelot parses tables from PDFs! + +usage: + camelot.py [options] [...] + +options: + -h, --help Show this screen. + -v, --version Show version. + -p, --pages Comma-separated list of page numbers. + Example: -p 1,3-6,10 [default: 1] + -f, --format Output format. (csv,xlsx) [default: csv] + -l, --log Print log to file. + -o, --output Output directory. + +camelot methods: + lattice Looks for lines between data. + stream Looks for spaces between data. 
+ +See 'camelot -h' for more information on a specific method. +""" + +lattice_doc = """ +Lattice method looks for lines between data to form a table. + +usage: + camelot.py lattice [options] [--] + +options: + -F, --fill Fill data in horizontal and/or vertical spanning + cells. Example: -F h, -F v, -F hv + -s, --scale Scaling factor. Large scaling factor leads to + smaller lines being detected. [default: 15] + -j, --jtol Tolerance to account for when comparing joint + and line coordinates. [default: 2] + -m, --mtol Tolerance to account for when merging lines + which are very close. [default: 2] + -i, --invert Invert pdf image to make sure that lines are + in foreground. + -d, --debug Debug by visualizing pdf geometry. + (contour,line,joint,table) Example: -d table +""" + +stream_doc = """ +Stream method looks for spaces between data to form a table. + +usage: + camelot.py stream [options] [--] + +options: + -n, --ncols Number of columns. [default: 0] + -c, --columns Comma-separated list of column x-coordinates. + Example: -c 10.1,20.2,30.3 + -M, --cmargin Char margin. Chars closer than cmargin are + grouped together to form a word. [default: 2.0] + -L, --lmargin Line margin. Lines closer than lmargin are + grouped together to form a textbox. [default: 0.5] + -W, --wmargin Word margin. Insert blank spaces between chars + if distance between words is greater than word + margin. [default: 0.1] + -d, --debug Debug by visualizing textboxes. 
+""" pno = re.compile(r'\d+') -def mkdir(directory): - if not os.path.isdir(directory): - os.makedirs(directory) - - -def filesort(filename): - filename = filename.split('/')[-1] +def filesort(filepath): + filename = os.path.basename(filepath) num = pno.findall(filename) if len(num) == 2: return (int(num[0]), int(num[1])) else: return (int(num[0]), 0) -start_time = time.time() -CAMELOT_DIR = '.camelot/' -mkdir(CAMELOT_DIR) -parser = argparse.ArgumentParser( - description='Parse tables from pdfs!', usage='python2 camelot.py [options] file') -parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages', - help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)') -parser.add_argument('-f', '--format', nargs=1, action='store', dest='format', - help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"]) -parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet', - help='Extract tables with ruling lines. (default: False)') -parser.add_argument('-i', '--fill', action='store', dest='fill', - help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None) -parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale', - help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int) -parser.add_argument('-j', '--jtol', nargs='?', action='store', - dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int) -parser.add_argument('-t', '--mtol', nargs='?', action='store', - dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int) -parser.add_argument('-n', '--invert', action='store_true', dest='invert', - help='Make sure lines are in foreground. 
(default: False)') -parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug', - help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"') -parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin', - help='(default: 2.0)', default=2.0, type=float) -parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin', - help='(default: 0.5)', default=0.5, type=float) -parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin', - help='(default: 0.1)', default=0.1, type=float) -parser.add_argument('-o', '--output', nargs=1, action='store', dest='output', - help='Specify output directory.') -parser.add_argument('file', nargs=1) +if __name__ == '__main__': + start_time = time.time() + tmpdir = tempfile.mkdtemp() -result = parser.parse_args() + args = docopt(doc, version='0.1', options_first=True) + argv = [args['']] + args[''] + if args[''] == 'lattice': + args.update(docopt(lattice_doc, argv=argv)) + elif args[''] == 'stream': + args.update(docopt(stream_doc, argv=argv)) -if result.pages: - if result.pages == ['all']: - p = result.pages + if args['--pages']: + if args['--pages'] == ['all']: + p = args['--pages'] + else: + p = [] + for r in args['--pages'].split(','): + if '-' in r: + a, b = r.split('-') + a, b = int(a), int(b) + p.extend([str(i) for i in range(a, b + 1)]) + else: + p.extend([str(r)]) else: - p = [] - for r in result.pages[0].split(' '): - if '-' in r: - a, b = r.split('-') - a, b = int(a), int(b) - p.extend([str(i) for i in range(a, b + 1)]) - else: - p.extend([str(r)]) -else: - p = ['1'] -p = sorted(set(p)) + p = ['1'] + p = sorted(set(p)) -filename = result.file[0].split('/')[-1] -# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex')) -pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0]) -mkdir(pdf_dir) -logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[ - 0] + '.log'), 
filemode='w', level=logging.DEBUG) + fname = os.path.basename(args['']) + fname = secure_filename(fname) + fdir = os.path.dirname(args['']) + froot, fext = os.path.splitext(fname) + if fext.lower() != '.pdf': + print "camelot can parse only pdfs right now" + sys.exit() -shutil.copy(result.file[0], os.path.join(pdf_dir, filename)) -print "separating pdf into pages" -print -if p == ['all']: - subprocess.call(['pdfseparate', os.path.join( - pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')]) -else: - for page in p: - subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join( - pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')]) + logfname = os.path.join(tmpdir, froot + '.log') + logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG) -if result.spreadsheet: - print "using the spreadsheet method" - for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): - print "converting", g.split('/')[-1], "to image" - os.system(' '.join(['convert', '-density', '300', - g, '-depth', '8', g[:-4] + '.png'])) - try: - spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale, - result.jtol, result.mtol, result.invert, result.debug, - result.char_margin, result.line_margin, result.word_margin) - except: - logging.error("Couldn't parse " + g.split('/')[-1]) - print "Couldn't parse", g.split('/')[-1] -else: - print "using the basic method" - for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))): - basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin) - -if result.format == ['xlsx']: - import csv - from pyexcel_xlsx import save_data - from collections import OrderedDict - data = OrderedDict() - for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort): - print "adding", c.split('/')[-1], "to excel file" - with open(c, 'r') as csvfile: - reader = csv.reader(csvfile) - data.update({c.split('/')[-1].split('.') - [0]: [row for row in reader]}) - xlsxname = 
filename.split('.')[0] + '.xlsx' - xlsxpath = os.path.join(pdf_dir, xlsxname) - save_data(xlsxpath, data) + shutil.copy(args[''], os.path.join(tmpdir, fname)) + print "separating pdf into pages" print - print "saved as", xlsxname + if p == ['all']: + subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir, + 'pg-%d.pdf')]) + else: + for page in p: + subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname), + os.path.join(tmpdir, 'pg-%s.pdf' % page)]) -print "finished in", time.time() - start_time, "seconds" -logging.info("Time taken for " + filename + ": " + - str(time.time() - start_time) + " seconds") + glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf'))) + if args[''] == 'lattice': + print "using the lattice method" + for g in glob_pdf: + g_fname = os.path.basename(g) + g_froot, __ = os.path.splitext(g) + print "converting %s to image" % g_fname + os.system(' '.join(['convert', '-density', '300', + g, '-depth', '8', g_froot + '.png'])) + try: + data = lattice(g, f=args['--fill'], s=int(args['--scale']), + jtol=int(args['--jtol']), mtol=int(args['--mtol']), + invert=args['--invert'], debug=args['--debug']) + if data is None: + print + print "See 'camelot lattice -h' for various parameters you can tweak." 
+ sys.exit() + for k in sorted(data.keys()): + csvfile = g_froot + '_%s.csv' % k + with open(csvfile, 'w') as outfile: + writer = csv.writer(outfile) + for d in data[k]: + writer.writerow([c.encode('utf-8') for c in d]) + print "saved as", os.path.basename(csvfile) + print + except Exception: + logging.exception("") + print "couldn't parse", g_fname, "see log for more info" + print + elif args[''] == 'stream': + print "using the stream method" + for g in glob_pdf: + g_fname = os.path.basename(g) + g_froot, __ = os.path.splitext(g) + try: + data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'], + char_margin=float(args['--cmargin']), + line_margin=float(args['--lmargin']), + word_margin=float(args['--wmargin']), + debug=args['--debug']) + if data is None: + print + print "See 'camelot stream -h' for various parameters you can tweak." + sys.exit() + csvfile = g_froot + '.csv' + with open(csvfile, 'w') as outfile: + writer = csv.writer(outfile) + for d in data: + writer.writerow([c.encode('utf-8') for c in d]) + print "saved as", os.path.basename(csvfile) + print + except Exception: + logging.exception("") + print "couldn't parse", g_fname, "see log for more info" + print + + glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort) + if args['--format'] == 'csv': + if len(glob_csv) == 1: + if args['--output']: + shutil.copy(glob_csv[0], args['--output']) + if args['--log']: + shutil.copy(logfname, args['--output']) + else: + shutil.copy(glob_csv[0], fdir) + if args['--log']: + shutil.copy(zippath, fdir) + else: + zipname = froot + '.zip' + zippath = os.path.join(tmpdir, zipname) + print "zipping 'em up" + with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip: + for g in glob_csv: + myzip.write(g, os.path.join(froot, os.path.basename(g))) + if args['--output']: + shutil.copy(zippath, args['--output']) + if args['--log']: + shutil.copy(logfname, args['--output']) + else: + shutil.copy(zippath, fdir) + if args['--log']: + 
shutil.copy(zippath, fdir) + print + elif args['--format'] == 'xlsx': + from pyexcel_xlsx import save_data + from collections import OrderedDict + data = OrderedDict() + for c in glob_csv: + c_fname = os.path.basename(c) + c_froot, __ = os.path.splitext(c) + print "adding", c_fname, "to excel file" + with open(c, 'r') as csvfile: + reader = csv.reader(csvfile) + c_froot, __ = os.path.splitext(c_fname) + data.update({c_froot: [row for row in reader]}) + xlsxname = froot + '.xlsx' + xlsxpath = os.path.join(tmpdir, xlsxname) + save_data(xlsxpath, data) + if args['--output']: + shutil.copy(xlsxpath, args['--output']) + if args['--log']: + shutil.copy(logfname, args['--output']) + else: + shutil.copy(xlsxpath, fdir) + if args['--log']: + shutil.copy(zippath, fdir) + print + print "saved as", xlsxname + + print "cleaning up..." + shutil.rmtree(tmpdir) + + print "finished in", time.time() - start_time, "seconds" + logging.info("Time taken for " + fname + ": " + + str(time.time() - start_time) + " seconds") diff --git a/cell.py b/cell.py index e2e91cb..ee993ae 100644 --- a/cell.py +++ b/cell.py @@ -1,6 +1,44 @@ class Cell: + """Cell + Parameters + ---------- + x1 : int + + y1 : int + + x2 : int + + y2 : int + + Attributes + ---------- + lb : tuple + + lt : tuple + + rb : tuple + + rt : tuple + + bbox : tuple + + left : bool + + right : bool + + top : bool + + bottom : bool + + text : string + + spanning_h : bool + + spanning_v : bool + """ def __init__(self, x1, y1, x2, y2): + self.lb = (x1, y1) self.lt = (x1, y2) self.rb = (x2, y1) @@ -15,10 +53,28 @@ class Cell: self.spanning_v = False def add_text(self, text): - self.text += text + """Add text to cell object. + + Parameters + ---------- + text : string + """ + self.text = ''.join([self.text, text]) def get_text(self): + """Get text from cell object. + + Returns + ------- + text : string + """ return self.text def get_bounded_edges(self): + """Get number of edges by which a cell is bounded. 
+ + Returns + ------- + bounded_edges : int + """ return self.top + self.bottom + self.left + self.right diff --git a/spreadsheet.py b/lattice.py similarity index 51% rename from spreadsheet.py rename to lattice.py index 46ea466..2395dd7 100644 --- a/spreadsheet.py +++ b/lattice.py @@ -1,37 +1,173 @@ import os -import csv import cv2 import glob import numpy as np from table import Table from pdf import get_pdf_info -from morph_transform import morph_transform from utils import (translate, scale, merge_close_values, get_row_idx, get_column_idx, reduce_index, outline, fill, remove_empty) -def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, - char_margin, line_margin, word_margin): +def morph_transform(img, s=15, invert=False): + """Morphological Transformation + + Applies a series of morphological operations on the image + to find table contours and line segments. + http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ + + Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2 + taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf + + Parameters + ---------- + img : ndarray + + s : int, default: 15, optional + Scaling factor. Large scaling factor leads to smaller lines + being detected. + + invert : bool, default: False, optional + Invert pdf image to make sure that lines are in foreground. + + Returns + ------- + tables : dict + Dictionary with table bounding box as key and list of + joints found in the table as value. + + v_segments : list + List of vertical line segments found in the image. + + h_segments : list + List of horizontal line segments found in the image. 
+ """ + img_x, img_y = img.shape[1], img.shape[0] + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + if invert: + threshold = cv2.adaptiveThreshold( + gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) + else: + threshold = cv2.adaptiveThreshold(np.invert( + gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) + vertical = threshold + horizontal = threshold + + scale = s + verticalsize = vertical.shape[0] / scale + horizontalsize = horizontal.shape[1] / scale + + ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) + hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) + + vertical = cv2.erode(vertical, ver, (-1, -1)) + vertical = cv2.dilate(vertical, ver, (-1, -1)) + + horizontal = cv2.erode(horizontal, hor, (-1, -1)) + horizontal = cv2.dilate(horizontal, hor, (-1, -1)) + + mask = vertical + horizontal + joints = np.bitwise_and(vertical, horizontal) + __, contours, __ = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10] + + tables = {} + for c in contours: + c_poly = cv2.approxPolyDP(c, 3, True) + x, y, w, h = cv2.boundingRect(c_poly) + # find number of non-zero values in joints using what boundingRect + # returns + roi = joints[y : y + h, x : x + w] + __, jc, __ = cv2.findContours( + roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + if len(jc) <= 4: # remove contours with less than <=4 joints + continue + joint_coords = [] + for j in jc: + jx, jy, jw, jh = cv2.boundingRect(j) + c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2 + joint_coords.append((c1, c2)) + tables[(x, y + h, x + w, y)] = joint_coords + + v_segments, h_segments = [], [] + _, vcontours, _ = cv2.findContours( + vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for vc in vcontours: + x, y, w, h = cv2.boundingRect(vc) + x1, x2 = x, x + w + y1, y2 = y, y + h + v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) + + _, hcontours, 
_ = cv2.findContours( + horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + for hc in hcontours: + x, y, w, h = cv2.boundingRect(hc) + x1, x2 = x, x + w + y1, y2 = y, y + h + h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) + + return tables, v_segments, h_segments + + +def lattice(filepath, f=None, s=15, jtol=2, mtol=2, invert=False, debug=None): + """Lattice algorithm + + Makes table using pdf geometry information returned by + morph_transform and fills data returned by PDFMiner in table cells. + + Parameters + ---------- + filepath : string + + f : string, default: None, optional + Fill data in horizontal and/or vertical spanning + cells. ('h', 'v', 'hv') + + s : int, default: 15, optional + Scaling factor. Large scaling factor leads to smaller lines + being detected. + + jtol : int, default: 2, optional + Tolerance to account for when comparing joint and line + coordinates. + + mtol : int, default: 2, optional + Tolerance to account for when merging lines which are + very close. + + invert : bool, default: False, optional + Invert pdf image to make sure that lines are in foreground. + + debug : string + Debug by visualizing pdf geometry. + ('contour', 'line', 'joint', 'table') + Returns + ------- + output : dict + Dictionary with table number as key and list of data as value. 
+ """ if debug: import matplotlib.pyplot as plt - import matplotlib.patches as patches + filename = os.path.basename(filepath) print "working on", filename - imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png') + fileroot, __ = os.path.splitext(filepath) + imagename = fileroot + '.png' img = cv2.imread(imagename) img_x, img_y = img.shape[1], img.shape[0] - text, pdf_x, pdf_y = get_pdf_info( - os.path.join(pdf_dir, filename), 'spreadsheet', - char_margin, line_margin, word_margin) + text, pdf_x, pdf_y = get_pdf_info(filepath, method='lattice') scaling_factor_x = pdf_x / float(img_x) scaling_factor_y = pdf_y / float(img_y) - tables, v_segments, h_segments = morph_transform(imagename, s, invert) + tables, v_segments, h_segments = morph_transform(img, s=s, invert=invert) - if debug == ["contours"]: + if debug == "contour": for t in tables.keys(): cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3) plt.imshow(img) - if debug == ["joints"]: + plt.show() + return None + if debug == "joint": x_coord = [] y_coord = [] for k in tables.keys(): @@ -42,6 +178,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, plt.plot(x_coord, y_coord, 'ro') plt.axis([0, max_x + 100, max_y + 100, 0]) plt.imshow(img) + plt.show() + return None # detect if vertical num_v = [t for t in text if (not t.upright) and t.get_text().strip()] @@ -80,7 +218,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, abs(translate(-img_y, h[3])), scaling_factor_y) h_segments_new.append((x1, y1, x2, y2)) - num_tables = 0 + num_tables = 1 + output = {} # sort tables based on y-coord for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True): # find rows and columns that lie in table @@ -91,19 +230,21 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2] < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2] - if debug == ["lines"]: + if debug == "line": for v 
in v_s: plt.plot([v[0], v[2]], [v[1], v[3]]) for h in h_s: plt.plot([h[0], h[2]], [h[1], h[3]]) + plt.show() + return None columns, rows = zip(*tables_new[k]) columns, rows = list(columns), list(rows) columns.extend([lb[0], rt[0]]) rows.extend([lb[1], rt[1]]) # sort horizontal and vertical segments - columns = merge_close_values(sorted(columns), mtol) - rows = merge_close_values(sorted(rows, reverse=True), mtol) + columns = merge_close_values(sorted(columns), mtol=mtol) + rows = merge_close_values(sorted(rows, reverse=True), mtol=mtol) # make grid using x and y coord of shortlisted rows and columns columns = [(columns[i], columns[i + 1]) for i in range(0, len(columns) - 1)] @@ -111,13 +252,13 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, table = Table(columns, rows) # light up cell edges - table = table.set_edges(v_s, h_s, jtol) + table = table.set_edges(v_s, h_s, jtol=jtol) # table set span method table = table.set_spanning() - # TODO + # light up table border table = outline(table) - if debug == ["tables"]: + if debug == "table": for i in range(len(table.cells)): for j in range(len(table.cells[i])): if table.cells[i][j].left: @@ -132,8 +273,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, if table.cells[i][j].bottom: plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]], [table.cells[i][j].lb[1], table.cells[i][j].rb[1]]) - if debug: plt.show() + return None # fill text after sorting it if not rotated: @@ -152,26 +293,20 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug, r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx) table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n')) - if fill: - table = fill(table, fill) + if f is not None: + table = fill(table, f=f) data = [] for i in range(len(table.cells)): data.append([table.cells[i][j].get_text().strip().encode('utf-8') - for j in range(len(table.cells[i]))]) + for j in range(len(table.cells[i]))]) if rotated == 'left': 
data = zip(*data[::-1]) elif rotated == 'right': data = zip(*data[::1]) data.reverse() data = remove_empty(data) - csvname = filename.split( - '.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv' - csvpath = os.path.join(pdf_dir, csvname) - with open(csvpath, 'w') as outfile: - writer = csv.writer(outfile, quoting=csv.QUOTE_ALL) - for d in data: - writer.writerow(d) - print "saved as", csvname - print + output['table_%d' % num_tables] = data num_tables += 1 + + return output \ No newline at end of file diff --git a/morph_transform.py b/morph_transform.py deleted file mode 100644 index 09f0c16..0000000 --- a/morph_transform.py +++ /dev/null @@ -1,75 +0,0 @@ -import cv2 -import numpy as np - - -def morph_transform(imagename, s, invert): - # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/ - img = cv2.imread(imagename) - img_x, img_y = img.shape[1], img.shape[0] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # empirical result taken from - # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf - if invert: - threshold = cv2.adaptiveThreshold( - gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) - else: - threshold = cv2.adaptiveThreshold(np.invert( - gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2) - vertical = threshold - horizontal = threshold - - scale = s - verticalsize = vertical.shape[0] / scale - horizontalsize = horizontal.shape[1] / scale - - ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1)) - - vertical = cv2.erode(vertical, ver, (-1, -1)) - vertical = cv2.dilate(vertical, ver, (-1, -1)) - - horizontal = cv2.erode(horizontal, hor, (-1, -1)) - horizontal = cv2.dilate(horizontal, hor, (-1, -1)) - - mask = vertical + horizontal - joints = np.bitwise_and(vertical, horizontal) - _, contours, _ = cv2.findContours( - mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - contours = 
sorted(contours, key=cv2.contourArea, reverse=True)[:10] - - tables = {} - for c in contours: - c_poly = cv2.approxPolyDP(c, 3, True) - x, y, w, h = cv2.boundingRect(c_poly) - # find number of non-zero values in joints using what boundingRect - # returns - roi = joints[y:y + h, x:x + w] - _, jc, _ = cv2.findContours( - roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) - if len(jc) <= 4: # remove contours with less than <=4 joints - continue - joint_coords = [] - for j in jc: - jx, jy, jw, jh = cv2.boundingRect(j) - c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2 - joint_coords.append((c1, c2)) - tables[(x, y + h, x + w, y)] = joint_coords - - v_segments, h_segments = [], [] - _, vcontours, _ = cv2.findContours( - vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for vc in vcontours: - x, y, w, h = cv2.boundingRect(vc) - x1, x2 = x, x + w - y1, y2 = y, y + h - v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1)) - - _, hcontours, _ = cv2.findContours( - horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - for hc in hcontours: - x, y, w, h = cv2.boundingRect(hc) - x1, x2 = x, x + w - y1, y2 = y, y + h - h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2)) - - return tables, v_segments, h_segments diff --git a/pdf.py b/pdf.py index 136904c..b39f185 100644 --- a/pdf.py +++ b/pdf.py @@ -9,35 +9,86 @@ from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal -def parse_text_basic(layout, t=None): +def parse_text_stream(layout, t=None): + """Recursively parse pdf layout to get a list of + LTTextHorizontal objects. 
+ + Parameters + ---------- + layout : object + + t : list + + Returns + ------- + t : list + """ if t is None: t = [] try: for obj in layout._objs: - if type(obj) is LTTextLineHorizontal: + if isinstance(obj, LTTextLineHorizontal): t.append(obj) else: - t += parse_text_basic(obj) + t += parse_text_stream(obj) except AttributeError: pass return t -def parse_text_spreadsheet(layout, t=None): +def parse_text_lattice(layout, t=None): + """Recursively parse pdf layout to get a list of + LTChar objects. + + Parameters + ---------- + layout : object + + t : list + + Returns + ------- + t : list + """ if t is None: t = [] try: for obj in layout._objs: - if type(obj) is LTChar: + if isinstance(obj, LTChar): t.append(obj) else: - t += parse_text_spreadsheet(obj) + t += parse_text_lattice(obj) except AttributeError: pass return t -def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin): +def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5, + word_margin=0.1): + """Get list of text objects along with pdf width and height. 
+ + Parameters + ---------- + pdfname : string + + method : string + + char_margin : float + + line_margin : float + + word_margin : float + + Returns + ------- + text : list + + pdf_x : int + + pdf_y : int + """ + if not method: + return None with open(pdfname, 'r') as f: parser = PDFParser(f) document = PDFDocument(parser) @@ -52,9 +103,9 @@ def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin): for page in PDFPage.create_pages(document): interpreter.process_page(page) layout = device.get_result() - if method == 'basic': - text = parse_text_basic(layout) - elif method == 'spreadsheet': - text = parse_text_spreadsheet(layout) + if method == 'stream': + text = parse_text_stream(layout) + elif method == 'lattice': + text = parse_text_lattice(layout) pdf_x, pdf_y = layout.bbox[2], layout.bbox[3] return text, pdf_x, pdf_y diff --git a/stream.py b/stream.py new file mode 100644 index 0000000..13cb8d8 --- /dev/null +++ b/stream.py @@ -0,0 +1,143 @@ +import os +import numpy as np + +from pdf import get_pdf_info + + +def overlap(l): + """Groups overlapping columns and returns list with updated + columns boundaries. + + Parameters + ---------- + l : list + List of column x-coordinates. + + Returns + ------- + merged : list + List of merged column x-coordinates. + """ + merged = [] + for higher in l: + if not merged: + merged.append(higher) + else: + lower = merged[-1] + if higher[0] <= lower[1]: + upper_bound = max(lower[1], higher[1]) + lower_bound = min(lower[0], higher[0]) + merged[-1] = (lower_bound, upper_bound) + else: + merged.append(higher) + return merged + + +def stream(filepath, ncolumns=0, columns=None, char_margin=2.0, + line_margin=0.5, word_margin=0.1, debug=False): + """Stream algorithm + + Groups data returned by PDFMiner into rows and finds mode of the + number of elements in each row to guess number of columns. + + Parameters + ---------- + filepath : string + + ncolumns : int, default: 0, optional + Number of columns. 
+ + columns : string, default: None, optional + Comma-separated list of column x-coordinates. + + char_margin : float, default: 2.0, optional + Char margin. Chars closer than cmargin are grouped together + to form a word. + + line_margin : float, default: 0.5, optional + Line margin. Lines closer than lmargin are grouped together + to form a textbox. + + word_margin : float, default: 0.1, optional + Word margin. Insert blank spaces between chars if distance + between words is greater than word margin. + + debug : bool, default: False, optional + Debug by visualizing textboxes. + + Returns + ------- + output : list + """ + filename = os.path.basename(filepath) + print "working on", filename + text, __, __ = get_pdf_info(filepath, method='stream', char_margin=char_margin, + line_margin=line_margin, word_margin=word_margin) + text.sort(key=lambda x: (-x.y0, x.x0)) + y_last = 0 + data = [] + temp = [] + elements = [] + for t in text: + # is checking for upright necessary? + # if t.get_text().strip() and all([obj.upright for obj in t._objs if + # type(obj) is LTChar]): + if t.get_text().strip(): + if not np.isclose(y_last, t.y0, atol=2): + y_last = t.y0 + elements.append(len(temp)) + data.append(temp) + temp = [] + temp.append(t) + + if debug: + import matplotlib.pyplot as plt + import matplotlib.patches as patches + + fig = plt.figure() + ax = fig.add_subplot(111, aspect='equal') + xs, ys = [], [] + for d in data: + for t in d: + xs.extend([t.x0, t.x1]) + ys.extend([t.y0, t.y1]) + ax.add_patch( + patches.Rectangle( + (t.x0, t.y0), + t.x1 - t.x0, + t.y1 - t.y0 + ) + ) + ax.set_xlim(min(xs) - 10, max(xs) + 10) + ax.set_ylim(min(ys) - 10, max(ys) + 10) + plt.show() + return None + + if columns: + cols = [(float(columns[i]), float(columns[i + 1])) + for i in range(0, len(columns) - 1)] + cols = [(c[0] + c[1]) / 2.0 for c in cols] + else: + # a table can't have just 1 column, can it? 
+ elements = filter(lambda x: x != 1, elements) + mode = ncolumns if ncolumns else max(set(elements), key=elements.count) + cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode] + cols = overlap(sorted(cols)) + cols = [(c[0] + c[1]) / 2.0 for c in cols] + + output = [['' for c in cols] for d in data] + for row, d in enumerate(data): + for t in d: + cog = (t.x0 + t.x1) / 2.0 + diff = [(i, abs(cog - c)) for i, c in enumerate(cols)] + if diff: + idx = min(diff, key=lambda x: x[1]) + else: + print "couldn't find a table on this page" + return None + if output[row][idx[0]]: + output[row][idx[0]] += ' ' + t.get_text().strip() + else: + output[row][idx[0]] = t.get_text().strip() + + return output \ No newline at end of file diff --git a/table.py b/table.py index 3e4e338..d38279c 100644 --- a/table.py +++ b/table.py @@ -4,14 +4,55 @@ from cell import Cell class Table: + """Table + + Parameters + ---------- + columns : list + List of column x-coordinates. + rows : list + List of row y-coordinates. + + Attributes + ---------- + cells : list + 2-D list of cell objects. + + columns : list + List of column x-coordinates. + + rows : list + List of row y-coordinates. + """ def __init__(self, columns, rows): + self.cells = [[Cell(c[0], r[1], c[1], r[0]) for c in columns] for r in rows] self.columns = columns self.rows = rows - def set_edges(self, vertical, horizontal, jtol): + def set_edges(self, vertical, horizontal, jtol=2): + """Set cell edges to True if corresponding line segments + are detected in the pdf image. + + Parameters + ---------- + vertical : list + List of vertical line segments. + + horizontal : list + List of horizontal line segments. + + jtol : int, default: 2, optional + Tolerance to account for when comparing joint and line + coordinates. + + Returns + ------- + self : object + Returns self. 
+ """ for v in vertical: # find closest x coord # iterate over y coords and find closest points @@ -117,6 +158,14 @@ class Table: return self def set_spanning(self): + """Set spanning values of a cell to True if it isn't + bounded by four edges. + + Returns + ------- + self : object + Returns self. + """ for i in range(len(self.cells)): for j in range(len(self.cells[i])): bound = self.cells[i][j].get_bounded_edges() @@ -125,28 +174,38 @@ class Table: elif bound == 3: if not self.cells[i][j].left: - if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom: + if (self.cells[i][j].right and + self.cells[i][j].top and + self.cells[i][j].bottom): self.cells[i][j].spanning_h = True elif not self.cells[i][j].right: - if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom: + if (self.cells[i][j].left and + self.cells[i][j].top and + self.cells[i][j].bottom): self.cells[i][j].spanning_h = True elif not self.cells[i][j].top: - if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom: + if (self.cells[i][j].left and + self.cells[i][j].right and + self.cells[i][j].bottom): self.cells[i][j].spanning_v = True elif not self.cells[i][j].bottom: - if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top: + if (self.cells[i][j].left and + self.cells[i][j].right and + self.cells[i][j].top): self.cells[i][j].spanning_v = True elif bound == 2: if self.cells[i][j].left and self.cells[i][j].right: - if not self.cells[i][j].top and not self.cells[i][j].bottom: + if (not self.cells[i][j].top and + not self.cells[i][j].bottom): self.cells[i][j].spanning_v = True elif self.cells[i][j].top and self.cells[i][j].bottom: - if not self.cells[i][j].left and not self.cells[i][j].right: + if (not self.cells[i][j].left and + not self.cells[i][j].right): self.cells[i][j].spanning_h = True return self diff --git a/utils.py b/utils.py index 46b62cc..89d037c 100644 --- a/utils.py +++ b/utils.py @@ -2,16 +2,61 @@ 
def translate(x1, x2):
    """Translate coordinate x2 by x1.

    Parameters
    ----------
    x1 : float

    x2 : float

    Returns
    -------
    x2 : float
        The shifted coordinate.
    """
    return x1 + x2


def scale(x, s):
    """Scale coordinate x by scaling factor s.

    Parameters
    ----------
    x : float

    s : float

    Returns
    -------
    x : float
        The scaled coordinate.
    """
    return x * s


def get_row_idx(t, rows):
    """Get index of the row the given object falls in by comparing
    its vertical center against each row's (top, bottom) bounds.

    Parameters
    ----------
    t : object
        Object exposing y0 and y1 coordinates.

    rows : list
        List of (top, bottom) y-coordinate pairs, top > bottom.

    Returns
    -------
    r : int
        Matching row index, or None when no row contains the object.
    """
    midpoint = (t.y0 + t.y1) / 2.0
    for idx, (top, bottom) in enumerate(rows):
        if bottom < midpoint < top:
            return idx
def get_column_idx(t, columns):
    """Get index of the column the given object falls in by comparing
    its horizontal center against each column's (left, right) bounds.

    Parameters
    ----------
    t : object
        Object exposing x0 and x1 coordinates.

    columns : list
        List of (left, right) x-coordinate pairs.

    Returns
    -------
    c : int
        Matching column index, or None when no column contains the
        object.
    """
    midpoint = (t.x0 + t.x1) / 2.0
    for idx, (left, right) in enumerate(columns):
        if left < midpoint < right:
            return idx