Modify command line tool

- Precompute globs
- Replace argparse with docopt
- Fix CLI
- Update .gitignore
- Add docstrings
- Update README
- Fix typo
- Replace zip subprocess call
- Use tempfile
- Fix newline

Branch: pull/2/head
Parent: 3045a92630
Commit: 271d4cafd6
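Editorial note, not part of the commit: the central change below is swapping a single argparse parser for docopt, with one usage string per method (`doc`, `lattice_doc`, `stream_doc` in camelot.py). Here is a minimal, hypothetical sketch of that dispatch pattern; the trimmed usage strings and option subset are illustrative stand-ins for the full strings added in the diff.

```python
"""Minimal docopt sub-command dispatch sketch (illustrative only)."""
from docopt import docopt

doc = """usage: camelot.py [options] <method> [<args>...]

options:
  -p, --pages <pageno>  Comma-separated list of page numbers. [default: 1]
"""

lattice_doc = """usage: camelot.py lattice [options] [--] <file>

options:
  -s, --scale <scale>  Scaling factor. [default: 15]
"""

if __name__ == '__main__':
    # Parse global options first; the method's own arguments are left alone.
    args = docopt(doc, version='0.1', options_first=True)
    argv = [args['<method>']] + args['<args>']
    if args['<method>'] == 'lattice':
        # Re-parse the remaining argv against the method-specific usage
        # string and merge the result into the global argument dict.
        args.update(docopt(lattice_doc, argv=argv))
    print(args)
```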
.gitignore
@@ -1,3 +1,5 @@
 __pycache__/
 *.py[cod]
+*.so
+
 .camelot/
README.md | 69 changed lines
@@ -14,63 +14,24 @@ camelot also uses poppler-utils, more specifically `pdfseparate` to separate a p

 ## Usage

-    python2 camelot.py [options] file
-
-    positional arguments:
-      file
-
-    optional arguments:
-      -h, --help            show this help message and exit
-      -p, --pages PAGES [PAGES ...]
-                            Specify the page numbers and/or page ranges to be
-                            parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-      -f, --format FORMAT   Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-      -m, --spreadsheet     Extract tables with ruling lines. (default: False)
-      -F, --fill FILL       Fill the values in empty cells horizontally(h) and/or
-                            vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-      -s, --scale [SCALE]   Scaling factor. Large scaling factor leads to smaller
-                            lines being detected. (default: 15)
-      -j, --jtol [JTOL]     Tolerance to account for when comparing joint and line
-                            coordinates. (default: 2)
-      -M, --mtol [MTOL]     Tolerance to account for when merging lines which are
-                            very close. (default: 2)
-      -i, --invert          Make sure lines are in foreground. (default: False)
-      -d, --debug DEBUG     Debug by visualizing contours, lines, joints, tables.
-                            Example: --debug="contours"
-      -o, --output OUTPUT   Specify output directory.
+<pre>
+camelot.py [options] <method> [<args>...]
+
+options:
+ -h, --help                 Show this screen.
+ -v, --version              Show version.
+ -p, --pages <pageno>       Comma-separated list of page numbers.
+                            Example: -p 1,3-6,10 [default: 1]
+ -f, --format <format>      Output format. (csv,xlsx) [default: csv]
+ -l, --log                  Print log to file.
+ -o, --output <directory>   Output directory.
+
+camelot methods:
+ lattice   Looks for lines between data.
+ stream    Looks for spaces between data.
+
+See 'camelot <method> -h' for more information on a specific method.
+</pre>

 ## Development
basic.py | 80 deleted lines
@@ -1,80 +0,0 @@
-import os
-import csv
-import numpy as np
-
-from pdf import get_pdf_info
-
-
-def overlap(l):
-    merged = []
-    for higher in l:
-        if not merged:
-            merged.append(higher)
-        else:
-            lower = merged[-1]
-            if higher[0] <= lower[1]:
-                upper_bound = max(lower[1], higher[1])
-                lower_bound = min(lower[0], higher[0])
-                merged[-1] = (lower_bound, upper_bound)
-            else:
-                merged.append(higher)
-    return merged
-
-
-def get_row_idx(t, rows):
-    for r in range(len(rows)):
-        if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
-            return r
-
-
-def get_column_idx(t, columns):
-    for c in range(len(columns)):
-        if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
-            return c
-
-
-def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
-    print "working on", filename
-    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
-                              char_margin, line_margin, word_margin)
-    text.sort(key=lambda x: (-x.y0, x.x0))
-    y_last = 0
-    data = []
-    temp = []
-    elements = []
-    for t in text:
-        # is checking for upright necessary?
-        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-        # type(obj) is LTChar]):
-        if t.get_text().strip():
-            if not np.isclose(y_last, t.y0, atol=2):
-                y_last = t.y0
-                elements.append(len(temp))
-                data.append(temp)
-                temp = []
-            temp.append(t)
-    # a table can't have just 1 column, can it?
-    elements = filter(lambda x: x != 1, elements)
-    # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count)
-    mode = max(set(elements), key=elements.count)
-    columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
-    columns = overlap(sorted(columns))
-    columns = [(c[0] + c[1]) / 2.0 for c in columns]
-
-    output = [['' for c in columns] for d in data]
-    for row, d in enumerate(data):
-        for t in d:
-            cog = (t.x0 + t.x1) / 2.0
-            diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
-            idx = min(diff, key=lambda x: x[1])
-            if output[row][idx[0]]:
-                output[row][idx[0]] += ' ' + t.get_text().strip()
-            else:
-                output[row][idx[0]] = t.get_text().strip()
-
-    csvname = filename.split('.')[0] + '.csv'
-    csvpath = os.path.join(pdf_dir, csvname)
-    with open(csvpath, 'w') as outfile:
-        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-        for row in output:
-            writer.writerow([cell.encode('utf-8') for cell in row])
camelot.py
@@ -1,136 +1,258 @@
+#!/usr/bin/env python2
 import os
 import re
+import csv
+import sys
 import glob
 import time
 import shutil
 import logging
+import zipfile
+import tempfile
 import subprocess
-import argparse
-
-from basic import basic
-from spreadsheet import spreadsheet
+from docopt import docopt
+from werkzeug.utils import secure_filename
+
+from lattice import lattice
+from stream import stream
+
+
+doc = """
+camelot parses tables from PDFs!
+
+usage:
+ camelot.py [options] <method> [<args>...]
+
+options:
+ -h, --help                 Show this screen.
+ -v, --version              Show version.
+ -p, --pages <pageno>       Comma-separated list of page numbers.
+                            Example: -p 1,3-6,10 [default: 1]
+ -f, --format <format>      Output format. (csv,xlsx) [default: csv]
+ -l, --log                  Print log to file.
+ -o, --output <directory>   Output directory.
+
+camelot methods:
+ lattice   Looks for lines between data.
+ stream    Looks for spaces between data.
+
+See 'camelot <method> -h' for more information on a specific method.
+"""
+
+lattice_doc = """
+Lattice method looks for lines between data to form a table.
+
+usage:
+ camelot.py lattice [options] [--] <file>
+
+options:
+ -F, --fill <fill>     Fill data in horizontal and/or vertical spanning
+                       cells. Example: -F h, -F v, -F hv
+ -s, --scale <scale>   Scaling factor. Large scaling factor leads to
+                       smaller lines being detected. [default: 15]
+ -j, --jtol <jtol>     Tolerance to account for when comparing joint
+                       and line coordinates. [default: 2]
+ -m, --mtol <mtol>     Tolerance to account for when merging lines
+                       which are very close. [default: 2]
+ -i, --invert          Invert pdf image to make sure that lines are
+                       in foreground.
+ -d, --debug <debug>   Debug by visualizing pdf geometry.
+                       (contour,line,joint,table) Example: -d table
+"""
+
+stream_doc = """
+Stream method looks for spaces between data to form a table.
+
+usage:
+ camelot.py stream [options] [--] <file>
+
+options:
+ -n, --ncols <ncols>        Number of columns. [default: 0]
+ -c, --columns <columns>    Comma-separated list of column x-coordinates.
+                            Example: -c 10.1,20.2,30.3
+ -M, --cmargin <cmargin>    Char margin. Chars closer than cmargin are
+                            grouped together to form a word. [default: 2.0]
+ -L, --lmargin <lmargin>    Line margin. Lines closer than lmargin are
+                            grouped together to form a textbox. [default: 0.5]
+ -W, --wmargin <wmargin>    Word margin. Insert blank spaces between chars
+                            if distance between words is greater than word
+                            margin. [default: 0.1]
+ -d, --debug                Debug by visualizing textboxes.
+"""

 pno = re.compile(r'\d+')


-def mkdir(directory):
-    if not os.path.isdir(directory):
-        os.makedirs(directory)
-
-
-def filesort(filename):
-    filename = filename.split('/')[-1]
+def filesort(filepath):
+    filename = os.path.basename(filepath)
     num = pno.findall(filename)
     if len(num) == 2:
         return (int(num[0]), int(num[1]))
     else:
         return (int(num[0]), 0)

-start_time = time.time()
-CAMELOT_DIR = '.camelot/'
-mkdir(CAMELOT_DIR)
-
-parser = argparse.ArgumentParser(
-    description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
-parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
-    help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
-parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
-    help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
-parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
-    help='Extract tables with ruling lines. (default: False)')
-parser.add_argument('-i', '--fill', action='store', dest='fill',
-    help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
-parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
-    help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
-parser.add_argument('-j', '--jtol', nargs='?', action='store',
-    dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
-parser.add_argument('-t', '--mtol', nargs='?', action='store',
-    dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
-parser.add_argument('-n', '--invert', action='store_true', dest='invert',
-    help='Make sure lines are in foreground. (default: False)')
-parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
-    help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
-parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
-    help='(default: 2.0)', default=2.0, type=float)
-parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
-    help='(default: 0.5)', default=0.5, type=float)
-parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
-    help='(default: 0.1)', default=0.1, type=float)
-parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
-    help='Specify output directory.')
-parser.add_argument('file', nargs=1)
-
-result = parser.parse_args()
-
-if result.pages:
-    if result.pages == ['all']:
-        p = result.pages
-    else:
-        p = []
-        for r in result.pages[0].split(' '):
-            if '-' in r:
-                a, b = r.split('-')
-                a, b = int(a), int(b)
-                p.extend([str(i) for i in range(a, b + 1)])
-            else:
-                p.extend([str(r)])
-else:
-    p = ['1']
-p = sorted(set(p))
-
-filename = result.file[0].split('/')[-1]
-# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
-pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
-mkdir(pdf_dir)
-logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
-    0] + '.log'), filemode='w', level=logging.DEBUG)
-
-shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
-print "separating pdf into pages"
-print
-if p == ['all']:
-    subprocess.call(['pdfseparate', os.path.join(
-        pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
-else:
-    for page in p:
-        subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
-            pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
-
-if result.spreadsheet:
-    print "using the spreadsheet method"
-    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        print "converting", g.split('/')[-1], "to image"
-        os.system(' '.join(['convert', '-density', '300',
-                            g, '-depth', '8', g[:-4] + '.png']))
-        try:
-            spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
-                        result.jtol, result.mtol, result.invert, result.debug,
-                        result.char_margin, result.line_margin, result.word_margin)
-        except:
-            logging.error("Couldn't parse " + g.split('/')[-1])
-            print "Couldn't parse", g.split('/')[-1]
-else:
-    print "using the basic method"
-    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
-
-if result.format == ['xlsx']:
-    import csv
-    from pyexcel_xlsx import save_data
-    from collections import OrderedDict
-    data = OrderedDict()
-    for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
-        print "adding", c.split('/')[-1], "to excel file"
-        with open(c, 'r') as csvfile:
-            reader = csv.reader(csvfile)
-            data.update({c.split('/')[-1].split('.')
-                         [0]: [row for row in reader]})
-    xlsxname = filename.split('.')[0] + '.xlsx'
-    xlsxpath = os.path.join(pdf_dir, xlsxname)
-    save_data(xlsxpath, data)
-    print
-    print "saved as", xlsxname
-
-print "finished in", time.time() - start_time, "seconds"
-logging.info("Time taken for " + filename + ": " +
-             str(time.time() - start_time) + " seconds")
+
+if __name__ == '__main__':
+    start_time = time.time()
+    tmpdir = tempfile.mkdtemp()
+
+    args = docopt(doc, version='0.1', options_first=True)
+    argv = [args['<method>']] + args['<args>']
+    if args['<method>'] == 'lattice':
+        args.update(docopt(lattice_doc, argv=argv))
+    elif args['<method>'] == 'stream':
+        args.update(docopt(stream_doc, argv=argv))
+
+    if args['--pages']:
+        if args['--pages'] == ['all']:
+            p = args['--pages']
+        else:
+            p = []
+            for r in args['--pages'].split(','):
+                if '-' in r:
+                    a, b = r.split('-')
+                    a, b = int(a), int(b)
+                    p.extend([str(i) for i in range(a, b + 1)])
+                else:
+                    p.extend([str(r)])
+    else:
+        p = ['1']
+    p = sorted(set(p))
+
+    fname = os.path.basename(args['<file>'])
+    fname = secure_filename(fname)
+    fdir = os.path.dirname(args['<file>'])
+    froot, fext = os.path.splitext(fname)
+    if fext.lower() != '.pdf':
+        print "camelot can parse only pdfs right now"
+        sys.exit()
+
+    logfname = os.path.join(tmpdir, froot + '.log')
+    logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG)
+
+    shutil.copy(args['<file>'], os.path.join(tmpdir, fname))
+    print "separating pdf into pages"
+    print
+    if p == ['all']:
+        subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir,
+                        'pg-%d.pdf')])
+    else:
+        for page in p:
+            subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname),
+                            os.path.join(tmpdir, 'pg-%s.pdf' % page)])
+
+    glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))
+    if args['<method>'] == 'lattice':
+        print "using the lattice method"
+        for g in glob_pdf:
+            g_fname = os.path.basename(g)
+            g_froot, __ = os.path.splitext(g)
+            print "converting %s to image" % g_fname
+            os.system(' '.join(['convert', '-density', '300',
+                                g, '-depth', '8', g_froot + '.png']))
+            try:
+                data = lattice(g, f=args['--fill'], s=int(args['--scale']),
+                               jtol=int(args['--jtol']), mtol=int(args['--mtol']),
+                               invert=args['--invert'], debug=args['--debug'])
+                if data is None:
+                    print
+                    print "See 'camelot lattice -h' for various parameters you can tweak."
+                    sys.exit()
+                for k in sorted(data.keys()):
+                    csvfile = g_froot + '_%s.csv' % k
+                    with open(csvfile, 'w') as outfile:
+                        writer = csv.writer(outfile)
+                        for d in data[k]:
+                            writer.writerow([c.encode('utf-8') for c in d])
+                    print "saved as", os.path.basename(csvfile)
+                    print
+            except Exception:
+                logging.exception("")
+                print "couldn't parse", g_fname, "see log for more info"
+                print
+    elif args['<method>'] == 'stream':
+        print "using the stream method"
+        for g in glob_pdf:
+            g_fname = os.path.basename(g)
+            g_froot, __ = os.path.splitext(g)
+            try:
+                data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'],
+                              char_margin=float(args['--cmargin']),
+                              line_margin=float(args['--lmargin']),
+                              word_margin=float(args['--wmargin']),
+                              debug=args['--debug'])
+                if data is None:
+                    print
+                    print "See 'camelot stream -h' for various parameters you can tweak."
+                    sys.exit()
+                csvfile = g_froot + '.csv'
+                with open(csvfile, 'w') as outfile:
+                    writer = csv.writer(outfile)
+                    for d in data:
+                        writer.writerow([c.encode('utf-8') for c in d])
+                print "saved as", os.path.basename(csvfile)
+                print
+            except Exception:
+                logging.exception("")
+                print "couldn't parse", g_fname, "see log for more info"
+                print
+
+    glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort)
+    if args['--format'] == 'csv':
+        if len(glob_csv) == 1:
+            if args['--output']:
+                shutil.copy(glob_csv[0], args['--output'])
+                if args['--log']:
+                    shutil.copy(logfname, args['--output'])
+            else:
+                shutil.copy(glob_csv[0], fdir)
+                if args['--log']:
+                    shutil.copy(zippath, fdir)
+        else:
+            zipname = froot + '.zip'
+            zippath = os.path.join(tmpdir, zipname)
+            print "zipping 'em up"
+            with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
+                for g in glob_csv:
+                    myzip.write(g, os.path.join(froot, os.path.basename(g)))
+            if args['--output']:
+                shutil.copy(zippath, args['--output'])
+                if args['--log']:
+                    shutil.copy(logfname, args['--output'])
+            else:
+                shutil.copy(zippath, fdir)
+                if args['--log']:
+                    shutil.copy(zippath, fdir)
+        print
+    elif args['--format'] == 'xlsx':
+        from pyexcel_xlsx import save_data
+        from collections import OrderedDict
+        data = OrderedDict()
+        for c in glob_csv:
+            c_fname = os.path.basename(c)
+            c_froot, __ = os.path.splitext(c)
+            print "adding", c_fname, "to excel file"
+            with open(c, 'r') as csvfile:
+                reader = csv.reader(csvfile)
+                c_froot, __ = os.path.splitext(c_fname)
+                data.update({c_froot: [row for row in reader]})
+        xlsxname = froot + '.xlsx'
+        xlsxpath = os.path.join(tmpdir, xlsxname)
+        save_data(xlsxpath, data)
+        print
+        print "saved as", xlsxname
+
+    print "cleaning up..."
+    shutil.rmtree(tmpdir)
+
+    print "finished in", time.time() - start_time, "seconds"
+    logging.info("Time taken for " + fname + ": " +
+                 str(time.time() - start_time) + " seconds")
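Editorial note, not part of the commit: the `--pages` handling in the new `__main__` block expands a spec such as `1,3-6,10` into a de-duplicated list of page strings before calling `pdfseparate`. A self-contained restatement of that expansion, for reference (the `'all'` short-circuit reflects the intent of the check above; page strings sort lexicographically, as in the diff):

```python
def expand_pages(spec):
    # Expand a comma-separated page spec ('1,3-6,10' or 'all') the same way
    # camelot.py does before invoking pdfseparate per page.
    if spec == 'all':
        return ['all']
    pages = []
    for r in spec.split(','):
        if '-' in r:
            a, b = r.split('-')
            pages.extend(str(i) for i in range(int(a), int(b) + 1))
        else:
            pages.append(str(r))
    return sorted(set(pages))

# expand_pages('1,3-6,10') -> ['1', '10', '3', '4', '5', '6']
```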
cell.py | 58 changed lines
@@ -1,6 +1,44 @@
 class Cell:
+    """Cell
+
+    Parameters
+    ----------
+    x1 : int
+
+    y1 : int
+
+    x2 : int
+
+    y2 : int
+
+    Attributes
+    ----------
+    lb : tuple
+
+    lt : tuple
+
+    rb : tuple
+
+    rt : tuple
+
+    bbox : tuple
+
+    left : bool
+
+    right : bool
+
+    top : bool
+
+    bottom : bool
+
+    text : string
+
+    spanning_h : bool
+
+    spanning_v : bool
+    """
     def __init__(self, x1, y1, x2, y2):
         self.lb = (x1, y1)
         self.lt = (x1, y2)
         self.rb = (x2, y1)
@@ -15,10 +53,28 @@ class Cell:
         self.spanning_v = False

     def add_text(self, text):
-        self.text += text
+        """Add text to cell object.
+
+        Parameters
+        ----------
+        text : string
+        """
+        self.text = ''.join([self.text, text])

     def get_text(self):
+        """Get text from cell object.
+
+        Returns
+        -------
+        text : string
+        """
         return self.text

     def get_bounded_edges(self):
+        """Get number of edges by which a cell is bounded.
+
+        Returns
+        -------
+        bounded_edges : int
+        """
         return self.top + self.bottom + self.left + self.right
lattice.py (previously spreadsheet.py)
@@ -1,37 +1,173 @@
 import os
-import csv
 import cv2
 import glob
 import numpy as np

 from table import Table
 from pdf import get_pdf_info
-from morph_transform import morph_transform
 from utils import (translate, scale, merge_close_values, get_row_idx,
                    get_column_idx, reduce_index, outline, fill, remove_empty)


-def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
-                char_margin, line_margin, word_margin):
+def morph_transform(img, s=15, invert=False):
+    """Morphological Transformation
+
+    Applies a series of morphological operations on the image
+    to find table contours and line segments.
+    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
+
+    Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2
+    taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
+
+    Parameters
+    ----------
+    img : ndarray
+
+    s : int, default: 15, optional
+        Scaling factor. Large scaling factor leads to smaller lines
+        being detected.
+
+    invert : bool, default: False, optional
+        Invert pdf image to make sure that lines are in foreground.
+
+    Returns
+    -------
+    tables : dict
+        Dictionary with table bounding box as key and list of
+        joints found in the table as value.
+
+    v_segments : list
+        List of vertical line segments found in the image.
+
+    h_segments : list
+        List of horizontal line segments found in the image.
+    """
+    img_x, img_y = img.shape[1], img.shape[0]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    if invert:
+        threshold = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    else:
+        threshold = cv2.adaptiveThreshold(np.invert(
+            gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    vertical = threshold
+    horizontal = threshold
+
+    scale = s
+    verticalsize = vertical.shape[0] / scale
+    horizontalsize = horizontal.shape[1] / scale
+
+    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
+    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
+
+    vertical = cv2.erode(vertical, ver, (-1, -1))
+    vertical = cv2.dilate(vertical, ver, (-1, -1))
+
+    horizontal = cv2.erode(horizontal, hor, (-1, -1))
+    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
+
+    mask = vertical + horizontal
+    joints = np.bitwise_and(vertical, horizontal)
+    __, contours, __ = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
+
+    tables = {}
+    for c in contours:
+        c_poly = cv2.approxPolyDP(c, 3, True)
+        x, y, w, h = cv2.boundingRect(c_poly)
+        # find number of non-zero values in joints using what boundingRect
+        # returns
+        roi = joints[y : y + h, x : x + w]
+        __, jc, __ = cv2.findContours(
+            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        if len(jc) <= 4:  # remove contours with less than <=4 joints
+            continue
+        joint_coords = []
+        for j in jc:
+            jx, jy, jw, jh = cv2.boundingRect(j)
+            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
+            joint_coords.append((c1, c2))
+        tables[(x, y + h, x + w, y)] = joint_coords
+
+    v_segments, h_segments = [], []
+    _, vcontours, _ = cv2.findContours(
+        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for vc in vcontours:
+        x, y, w, h = cv2.boundingRect(vc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
+
+    _, hcontours, _ = cv2.findContours(
+        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for hc in hcontours:
+        x, y, w, h = cv2.boundingRect(hc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
+
+    return tables, v_segments, h_segments
+
+
+def lattice(filepath, f=None, s=15, jtol=2, mtol=2, invert=False, debug=None):
+    """Lattice algorithm
+
+    Makes table using pdf geometry information returned by
+    morph_transform and fills data returned by PDFMiner in table cells.
+
+    Parameters
+    ----------
+    filepath : string
+
+    f : string, default: None, optional
+        Fill data in horizontal and/or vertical spanning
+        cells. ('h', 'v', 'hv')
+
+    s : int, default: 15, optional
+        Scaling factor. Large scaling factor leads to smaller lines
+        being detected.
+
+    jtol : int, default: 2, optional
+        Tolerance to account for when comparing joint and line
+        coordinates.
+
+    mtol : int, default: 2, optional
+        Tolerance to account for when merging lines which are
+        very close.
+
+    invert : bool, default: False, optional
+        Invert pdf image to make sure that lines are in foreground.
+
+    debug : string
+        Debug by visualizing pdf geometry.
+        ('contour', 'line', 'joint', 'table')
+    Returns
+    -------
+    output : dict
+        Dictionary with table number as key and list of data as value.
+    """
     if debug:
         import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
+    filename = os.path.basename(filepath)
     print "working on", filename
-    imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
+    fileroot, __ = os.path.splitext(filepath)
+    imagename = fileroot + '.png'
     img = cv2.imread(imagename)
     img_x, img_y = img.shape[1], img.shape[0]
-    text, pdf_x, pdf_y = get_pdf_info(
-        os.path.join(pdf_dir, filename), 'spreadsheet',
-        char_margin, line_margin, word_margin)
+    text, pdf_x, pdf_y = get_pdf_info(filepath, method='lattice')
     scaling_factor_x = pdf_x / float(img_x)
     scaling_factor_y = pdf_y / float(img_y)
-    tables, v_segments, h_segments = morph_transform(imagename, s, invert)
+    tables, v_segments, h_segments = morph_transform(img, s=s, invert=invert)

-    if debug == ["contours"]:
+    if debug == "contour":
         for t in tables.keys():
             cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
         plt.imshow(img)
-    if debug == ["joints"]:
+        plt.show()
+        return None
+    if debug == "joint":
         x_coord = []
         y_coord = []
         for k in tables.keys():
@@ -42,6 +178,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
         plt.plot(x_coord, y_coord, 'ro')
         plt.axis([0, max_x + 100, max_y + 100, 0])
         plt.imshow(img)
+        plt.show()
+        return None

     # detect if vertical
     num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
@@ -80,7 +218,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                    abs(translate(-img_y, h[3])), scaling_factor_y)
         h_segments_new.append((x1, y1, x2, y2))

-    num_tables = 0
+    num_tables = 1
+    output = {}
     # sort tables based on y-coord
    for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
        # find rows and columns that lie in table
@@ -91,19 +230,21 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
         h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
                < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]

-        if debug == ["lines"]:
+        if debug == "line":
             for v in v_s:
                 plt.plot([v[0], v[2]], [v[1], v[3]])
             for h in h_s:
                 plt.plot([h[0], h[2]], [h[1], h[3]])
+            plt.show()
+            return None

         columns, rows = zip(*tables_new[k])
         columns, rows = list(columns), list(rows)
         columns.extend([lb[0], rt[0]])
         rows.extend([lb[1], rt[1]])
         # sort horizontal and vertical segments
-        columns = merge_close_values(sorted(columns), mtol)
-        rows = merge_close_values(sorted(rows, reverse=True), mtol)
+        columns = merge_close_values(sorted(columns), mtol=mtol)
+        rows = merge_close_values(sorted(rows, reverse=True), mtol=mtol)
         # make grid using x and y coord of shortlisted rows and columns
         columns = [(columns[i], columns[i + 1])
                    for i in range(0, len(columns) - 1)]
@@ -111,13 +252,13 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,

         table = Table(columns, rows)
         # light up cell edges
-        table = table.set_edges(v_s, h_s, jtol)
+        table = table.set_edges(v_s, h_s, jtol=jtol)
         # table set span method
         table = table.set_spanning()
-        # TODO
+        # light up table border
         table = outline(table)

-        if debug == ["tables"]:
+        if debug == "table":
             for i in range(len(table.cells)):
                 for j in range(len(table.cells[i])):
                     if table.cells[i][j].left:
@@ -132,8 +273,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                     if table.cells[i][j].bottom:
                         plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
                                  [table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
-        if debug:
             plt.show()
+            return None

         # fill text after sorting it
         if not rotated:
@@ -152,8 +293,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                 r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
                 table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))

-        if fill:
-            table = fill(table, fill)
+        if f is not None:
+            table = fill(table, f=f)

         data = []
         for i in range(len(table.cells)):
@@ -165,13 +306,7 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
             data = zip(*data[::1])
             data.reverse()
         data = remove_empty(data)
-        csvname = filename.split(
-            '.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
-        csvpath = os.path.join(pdf_dir, csvname)
-        with open(csvpath, 'w') as outfile:
-            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-            for d in data:
-                writer.writerow(d)
-        print "saved as", csvname
-        print
+        output['table_%d' % num_tables] = data
         num_tables += 1

+    return output

morph_transform.py | 75 deleted lines
@@ -1,75 +0,0 @@
-import cv2
-import numpy as np
-
-
-def morph_transform(imagename, s, invert):
-    # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
-    img = cv2.imread(imagename)
-    img_x, img_y = img.shape[1], img.shape[0]
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    # empirical result taken from
-    # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-    if invert:
-        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-    else:
-        threshold = cv2.adaptiveThreshold(np.invert(
-            gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-    vertical = threshold
-    horizontal = threshold
-
-    scale = s
-    verticalsize = vertical.shape[0] / scale
-    horizontalsize = horizontal.shape[1] / scale
-
-    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
-    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
-
-    vertical = cv2.erode(vertical, ver, (-1, -1))
-    vertical = cv2.dilate(vertical, ver, (-1, -1))
-
-    horizontal = cv2.erode(horizontal, hor, (-1, -1))
-    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
-
-    mask = vertical + horizontal
-    joints = np.bitwise_and(vertical, horizontal)
-    _, contours, _ = cv2.findContours(
-        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
-
-    tables = {}
-    for c in contours:
-        c_poly = cv2.approxPolyDP(c, 3, True)
-        x, y, w, h = cv2.boundingRect(c_poly)
-        # find number of non-zero values in joints using what boundingRect
-        # returns
-        roi = joints[y:y + h, x:x + w]
-        _, jc, _ = cv2.findContours(
-            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        if len(jc) <= 4:  # remove contours with less than <=4 joints
-            continue
-        joint_coords = []
-        for j in jc:
-            jx, jy, jw, jh = cv2.boundingRect(j)
-            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
-            joint_coords.append((c1, c2))
-        tables[(x, y + h, x + w, y)] = joint_coords
-
-    v_segments, h_segments = [], []
-    _, vcontours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for vc in vcontours:
-        x, y, w, h = cv2.boundingRect(vc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-
-    _, hcontours, _ = cv2.findContours(
-        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for hc in hcontours:
-        x, y, w, h = cv2.boundingRect(hc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
-
-    return tables, v_segments, h_segments
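Editorial note, not part of the commit: after this change a caller is expected to rasterize each single-page PDF to a PNG with the same root name before calling lattice(), exactly as camelot.py does with ImageMagick's `convert` above. A hypothetical call (the file names are made up):

```python
import csv
from lattice import lattice

# Assumes 'pg-1.pdf' has already been rendered to 'pg-1.png' next to it,
# as camelot.py does before calling lattice().
tables = lattice('pg-1.pdf', f='h', s=15, jtol=2, mtol=2)
if tables is not None:
    for name in sorted(tables):
        # One CSV per detected table, mirroring the loop in camelot.py.
        with open('pg-1_%s.csv' % name, 'w') as out:
            writer = csv.writer(out)
            for row in tables[name]:
                writer.writerow([cell.encode('utf-8') for cell in row])
```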
pdf.py | 73 changed lines
@@ -9,35 +9,86 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal


-def parse_text_basic(layout, t=None):
+def parse_text_stream(layout, t=None):
+    """Recursively parse pdf layout to get a list of
+    LTTextHorizontal objects.
+
+    Parameters
+    ----------
+    layout : object
+
+    t : list
+
+    Returns
+    -------
+    t : list
+    """
     if t is None:
         t = []
     try:
         for obj in layout._objs:
-            if type(obj) is LTTextLineHorizontal:
+            if isinstance(obj, LTTextLineHorizontal):
                 t.append(obj)
             else:
-                t += parse_text_basic(obj)
+                t += parse_text_stream(obj)
     except AttributeError:
         pass
     return t


-def parse_text_spreadsheet(layout, t=None):
+def parse_text_lattice(layout, t=None):
+    """Recursively parse pdf layout to get a list of
+    LTChar objects.
+
+    Parameters
+    ----------
+    layout : object
+
+    t : list
+
+    Returns
+    -------
+    t : list
+    """
     if t is None:
         t = []
     try:
         for obj in layout._objs:
-            if type(obj) is LTChar:
+            if isinstance(obj, LTChar):
                 t.append(obj)
             else:
-                t += parse_text_spreadsheet(obj)
+                t += parse_text_lattice(obj)
     except AttributeError:
         pass
     return t


-def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
+def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5,
+                 word_margin=0.1):
+    """Get list of text objects along with pdf width and height.
+
+    Parameters
+    ----------
+    pdfname : string
+
+    method : string
+
+    char_margin : float
+
+    line_margin : float
+
+    word_margin : float
+
+    Returns
+    -------
+    text : list
+
+    pdf_x : int
+
+    pdf_y : int
+    """
+    if not method:
+        return None
     with open(pdfname, 'r') as f:
         parser = PDFParser(f)
         document = PDFDocument(parser)
@@ -52,9 +103,9 @@ def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
         for page in PDFPage.create_pages(document):
             interpreter.process_page(page)
             layout = device.get_result()
-            if method == 'basic':
-                text = parse_text_basic(layout)
-            elif method == 'spreadsheet':
-                text = parse_text_spreadsheet(layout)
+            if method == 'stream':
+                text = parse_text_stream(layout)
+            elif method == 'lattice':
+                text = parse_text_lattice(layout)
             pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
     return text, pdf_x, pdf_y

stream.py | 143 new lines
@@ -0,0 +1,143 @@
+import os
+import numpy as np
+
+from pdf import get_pdf_info
+
+
+def overlap(l):
+    """Groups overlapping columns and returns list with updated
+    columns boundaries.
+
+    Parameters
+    ----------
+    l : list
+        List of column x-coordinates.
+
+    Returns
+    -------
+    merged : list
+        List of merged column x-coordinates.
+    """
+    merged = []
+    for higher in l:
+        if not merged:
+            merged.append(higher)
+        else:
+            lower = merged[-1]
+            if higher[0] <= lower[1]:
+                upper_bound = max(lower[1], higher[1])
+                lower_bound = min(lower[0], higher[0])
+                merged[-1] = (lower_bound, upper_bound)
+            else:
+                merged.append(higher)
+    return merged
+
+
+def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
+           line_margin=0.5, word_margin=0.1, debug=False):
+    """Stream algorithm
+
+    Groups data returned by PDFMiner into rows and finds mode of the
+    number of elements in each row to guess number of columns.
+
+    Parameters
+    ----------
+    filepath : string
+
+    ncolumns : int, default: 0, optional
+        Number of columns.
+
+    columns : string, default: None, optional
+        Comma-separated list of column x-coordinates.
+
+    char_margin : float, default: 2.0, optional
+        Char margin. Chars closer than cmargin are grouped together
+        to form a word.
+
+    line_margin : float, default: 0.5, optional
+        Line margin. Lines closer than lmargin are grouped together
+        to form a textbox.
+
+    word_margin : float, default: 0.1, optional
+        Word margin. Insert blank spaces between chars if distance
+        between words is greater than word margin.
+
+    debug : bool, default: False, optional
+        Debug by visualizing textboxes.
+
+    Returns
+    -------
+    output : list
+    """
+    filename = os.path.basename(filepath)
+    print "working on", filename
+    text, __, __ = get_pdf_info(filepath, method='stream', char_margin=char_margin,
+                                line_margin=line_margin, word_margin=word_margin)
+    text.sort(key=lambda x: (-x.y0, x.x0))
+    y_last = 0
+    data = []
+    temp = []
+    elements = []
+    for t in text:
+        # is checking for upright necessary?
+        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
+        # type(obj) is LTChar]):
+        if t.get_text().strip():
+            if not np.isclose(y_last, t.y0, atol=2):
+                y_last = t.y0
+                elements.append(len(temp))
+                data.append(temp)
+                temp = []
+            temp.append(t)
+
+    if debug:
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect='equal')
+        xs, ys = [], []
+        for d in data:
+            for t in d:
+                xs.extend([t.x0, t.x1])
+                ys.extend([t.y0, t.y1])
+                ax.add_patch(
+                    patches.Rectangle(
+                        (t.x0, t.y0),
+                        t.x1 - t.x0,
+                        t.y1 - t.y0
+                    )
+                )
+        ax.set_xlim(min(xs) - 10, max(xs) + 10)
+        ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        plt.show()
+        return None
+
+    if columns:
+        cols = [(float(columns[i]), float(columns[i + 1]))
+                for i in range(0, len(columns) - 1)]
+        cols = [(c[0] + c[1]) / 2.0 for c in cols]
+    else:
+        # a table can't have just 1 column, can it?
+        elements = filter(lambda x: x != 1, elements)
+        mode = ncolumns if ncolumns else max(set(elements), key=elements.count)
+        cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
+        cols = overlap(sorted(cols))
+        cols = [(c[0] + c[1]) / 2.0 for c in cols]
+
+    output = [['' for c in cols] for d in data]
+    for row, d in enumerate(data):
+        for t in d:
+            cog = (t.x0 + t.x1) / 2.0
+            diff = [(i, abs(cog - c)) for i, c in enumerate(cols)]
+            if diff:
+                idx = min(diff, key=lambda x: x[1])
+            else:
+                print "couldn't find a table on this page"
+                return None
+            if output[row][idx[0]]:
+                output[row][idx[0]] += ' ' + t.get_text().strip()
+            else:
+                output[row][idx[0]] = t.get_text().strip()
+
+    return output
table.py | 73 changed lines
@@ -4,14 +4,55 @@ from cell import Cell


 class Table:
+    """Table
+
+    Parameters
+    ----------
+    columns : list
+        List of column x-coordinates.
+
+    rows : list
+        List of row y-coordinates.
+
+    Attributes
+    ----------
+    cells : list
+        2-D list of cell objects.
+
+    columns : list
+        List of column x-coordinates.
+
+    rows : list
+        List of row y-coordinates.
+    """
     def __init__(self, columns, rows):
         self.cells = [[Cell(c[0], r[1], c[1], r[0])
                        for c in columns] for r in rows]
         self.columns = columns
         self.rows = rows

-    def set_edges(self, vertical, horizontal, jtol):
+    def set_edges(self, vertical, horizontal, jtol=2):
+        """Set cell edges to True if corresponding line segments
+        are detected in the pdf image.
+
+        Parameters
+        ----------
+        vertical : list
+            List of vertical line segments.
+
+        horizontal : list
+            List of horizontal line segments.
+
+        jtol : int, default: 2, optional
+            Tolerance to account for when comparing joint and line
+            coordinates.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
         for v in vertical:
             # find closest x coord
             # iterate over y coords and find closest points
@@ -117,6 +158,14 @@ class Table:
         return self

     def set_spanning(self):
+        """Set spanning values of a cell to True if it isn't
+        bounded by four edges.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
         for i in range(len(self.cells)):
             for j in range(len(self.cells[i])):
                 bound = self.cells[i][j].get_bounded_edges()
@@ -125,28 +174,38 @@ class Table:

                 elif bound == 3:
                     if not self.cells[i][j].left:
-                        if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
+                        if (self.cells[i][j].right and
+                                self.cells[i][j].top and
+                                self.cells[i][j].bottom):
                             self.cells[i][j].spanning_h = True

                     elif not self.cells[i][j].right:
-                        if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].top and
+                                self.cells[i][j].bottom):
                             self.cells[i][j].spanning_h = True

                     elif not self.cells[i][j].top:
-                        if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].right and
+                                self.cells[i][j].bottom):
                             self.cells[i][j].spanning_v = True

                     elif not self.cells[i][j].bottom:
-                        if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].right and
+                                self.cells[i][j].top):
                             self.cells[i][j].spanning_v = True

                 elif bound == 2:
                     if self.cells[i][j].left and self.cells[i][j].right:
-                        if not self.cells[i][j].top and not self.cells[i][j].bottom:
+                        if (not self.cells[i][j].top and
+                                not self.cells[i][j].bottom):
                             self.cells[i][j].spanning_v = True

                     elif self.cells[i][j].top and self.cells[i][j].bottom:
-                        if not self.cells[i][j].left and not self.cells[i][j].right:
+                        if (not self.cells[i][j].left and
+                                not self.cells[i][j].right):
                             self.cells[i][j].spanning_h = True

         return self
utils.py | 154 changed lines
@@ -2,16 +2,61 @@ import numpy as np


 def translate(x1, x2):
+    """Translate coordinate x2 by x1.
+
+    Parameters
+    ----------
+    x1 : float
+
+    x2 : float
+
+    Returns
+    -------
+    x2 : float
+    """
     x2 += x1
     return x2


 def scale(x, s):
+    """Scale coordinate x by scaling factor s.
+
+    Parameters
+    ----------
+    x : float
+
+    s : float
+
+    Returns
+    -------
+    x : float
+    """
     x *= s
     return x


 def rotate(x1, y1, x2, y2, angle):
+    """Rotate point x2, y2 about point x1, y1 by angle.
+
+    Parameters
+    ----------
+    x1 : float
+
+    y1 : float
+
+    x2 : float
+
+    y2 : float
+
+    angle : float
+        Angle in radians.
+
+    Returns
+    -------
+    xnew : float
+
+    ynew : float
+    """
     s = np.sin(angle)
     c = np.cos(angle)
     x2 = translate(-x1, x2)
@@ -23,7 +68,20 @@ def rotate(x1, y1, x2, y2, angle):
     return xnew, ynew


-def remove_close_values(ar, mtol):
+def remove_close_values(ar, mtol=2):
+    """Remove values which are within a tolerance of mtol of another value
+    present in list.
+
+    Parameters
+    ----------
+    ar : list
+
+    mtol : int, default: 2, optional
+
+    Returns
+    -------
+    ret : list
+    """
     ret = []
     for a in ar:
         if not ret:
@@ -37,7 +95,20 @@ def remove_close_values(ar, mtol):
     return ret


-def merge_close_values(ar, mtol):
+def merge_close_values(ar, mtol=2):
+    """Merge values which are within a tolerance of mtol by calculating
+    a moving mean.
+
+    Parameters
+    ----------
+    ar : list
+
+    mtol : int, default: 2, optional
+
+    Returns
+    -------
+    ret : list
+    """
     ret = []
     for a in ar:
         if not ret:
@@ -53,18 +124,63 @@ def merge_close_values(ar, mtol):


 def get_row_idx(t, rows):
+    """Get index of the row in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    rows : list
+
+    Returns
+    -------
+    r : int
+    """
     for r in range(len(rows)):
         if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
             return r


 def get_column_idx(t, columns):
+    """Get index of the column in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    columns : list
+
+    Returns
+    -------
+    c : int
+    """
     for c in range(len(columns)):
         if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
             return c


 def reduce_index(t, rotated, r_idx, c_idx):
+    """Shift a text object if it lies within a spanning cell taking
+    in account table rotation.
+
+    Parameters
+    ----------
+    t : object
+
+    rotated : string
+
+    r_idx : int
+
+    c_idx : int
+
+    Returns
+    -------
+    r_idx : int
+
+    c_idx : int
+    """
     if not rotated:
         if t.cells[r_idx][c_idx].spanning_h:
             while not t.cells[r_idx][c_idx].left:
@@ -90,6 +206,16 @@ def reduce_index(t, rotated, r_idx, c_idx):


 def outline(t):
+    """Light up table boundary.
+
+    Parameters
+    ----------
+    t : object
+
+    Returns
+    -------
+    t : object
+    """
     for i in range(len(t.cells)):
         t.cells[i][0].left = True
         t.cells[i][len(t.cells[i]) - 1].right = True
@@ -99,7 +225,19 @@ def outline(t):
     return t


-def fill(t, f):
+def fill(t, f=None):
+    """Fill spanning cells.
+
+    Parameters
+    ----------
+    t : object
+
+    f : string, default: None, optional
+
+    Returns
+    -------
+    t : object
+    """
     if f == "h":
         for i in range(len(t.cells)):
             for j in range(len(t.cells[i])):
@@ -124,6 +262,16 @@ def fill(t, f):


 def remove_empty(d):
+    """Remove empty rows and columns.
+
+    Parameters
+    ----------
+    d : list
+
+    Returns
+    -------
+    d : list
+    """
     for i, row in enumerate(d):
         if row == [''] * len(row):
             d.pop(i)
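Editorial note, not part of the commit: merge_close_values is documented above as merging values within mtol via a moving mean, but its body falls outside the shown hunks. A minimal sketch consistent with that docstring:

```python
def merge_close_values(ar, mtol=2):
    # Sketch only (the real body is not shown in the hunks above): fold each
    # value within mtol of the last kept value into a running mean,
    # otherwise start a new entry.
    ret = []
    for a in ar:
        if not ret:
            ret.append(a)
        elif abs(a - ret[-1]) <= mtol:
            ret[-1] = (ret[-1] + a) / 2.0
        else:
            ret.append(a)
    return ret

# merge_close_values([10, 11, 50, 51.5], mtol=2) -> [10.5, 50.75]
```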