Modify command line tool

Precompute globs Replace argparse with docopt Fix CLI Update .gitignore Add docstrings Update README Fix typo Replace zip subprocess call Use tempfile Fix newline
2016-07-19 16:45:28 +05:30 · 2016-07-19 16:45:28 +05:30 · 271d4cafd6
parent 3045a92630
commit 271d4cafd6
11 changed files with 895 additions and 373 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,5 @@
 __pycache__/
 *.py[cod]
-.camelot/
+*.so
+
+.camelot/
--- a/README.md
+++ b/README.md
@ -14,63 +14,24 @@ camelot also uses poppler-utils, more specifically `pdfseparate` to separate a p

 ## Usage

-python2 camelot.py [options] file
+<pre>
+camelot.py [options] &lt;method&gt; [&lt;args&gt;...]

-positional arguments:
+options:
+ -h, --help                Show this screen.
+ -v, --version             Show version.
+ -p, --pages &lt;pageno&gt;      Comma-separated list of page numbers.
+                           Example: -p 1,3-6,10  [default: 1]
+ -f, --format &lt;format&gt;     Output format. (csv,xlsx) [default: csv]
+ -l, --log                 Print log to file.
+ -o, --output &lt;directory&gt;  Output directory.

-  file
+camelot methods:
+ lattice  Looks for lines between data.
+ stream   Looks for spaces between data.

-optional arguments:
-
-  -h, --help
-
-    show this help message and exit
-
-  -p, --pages PAGES [PAGES ...]
-
-    Specify the page numbers and/or page ranges to be
-    parsed. Example: -p="1 3-5 9", -p="all" (default: 1)
-
-  -f, --format FORMAT
-
-    Output format (csv/xlsx). Example: -f="xlsx" (default: csv)
-
-  -m, --spreadsheet
-
-    Extract tables with ruling lines. (default: False)
-
-  -F, --fill FILL
-
-    Fill the values in empty cells horizontally(h) and/or
-    vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)
-
-  -s, --scale [SCALE]
-
-    Scaling factor. Large scaling factor leads to smaller
-    lines being detected. (default: 15)
-
-  -j, --jtol [JTOL]
-
-    Tolerance to account for when comparing joint and line
-    coordinates. (default: 2)
-
-  -M, --mtol [MTOL]
-
-    Tolerance to account for when merging lines which are
-    very close. (default: 2)
-
-  -i, --invert
-
-    Make sure lines are in foreground. (default: False)
-
-  -d, --debug DEBUG
-
-    Debug by visualizing contours, lines, joints, tables.
-    Example: --debug="contours"
-
-  -o, --output OUTPUT
-
-    Specify output directory.
+See 'camelot &lt;method&gt; -h' for more information on a specific method.
+</pre>

 ## Development

--- a/basic.py
+++ b/basic.py
@ -1,80 +0,0 @@
-import os
-import csv
-import numpy as np
-
-from pdf import get_pdf_info
-
-
-def overlap(l):
-    merged = []
-    for higher in l:
-        if not merged:
-            merged.append(higher)
-        else:
-            lower = merged[-1]
-            if higher[0] <= lower[1]:
-                upper_bound = max(lower[1], higher[1])
-                lower_bound = min(lower[0], higher[0])
-                merged[-1] = (lower_bound, upper_bound)
-            else:
-                merged.append(higher)
-    return merged
-
-
-def get_row_idx(t, rows):
-    for r in range(len(rows)):
-        if t.y1 <= rows[r][0] and t.y0 >= rows[r][1]:
-            return r
-
-
-def get_column_idx(t, columns):
-    for c in range(len(columns)):
-        if t.x0 >= columns[c][0] and t.x1 <= columns[c][1]:
-            return c
-
-
-def basic(pdf_dir, filename, char_margin, line_margin, word_margin):
-    print "working on", filename
-    text, _, _ = get_pdf_info(os.path.join(pdf_dir, filename), 'basic',
-                              char_margin, line_margin, word_margin)
-    text.sort(key=lambda x: (-x.y0, x.x0))
-    y_last = 0
-    data = []
-    temp = []
-    elements = []
-    for t in text:
-        # is checking for upright necessary?
-        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
-        # type(obj) is LTChar]):
-        if t.get_text().strip():
-            if not np.isclose(y_last, t.y0, atol=2):
-                y_last = t.y0
-                elements.append(len(temp))
-                data.append(temp)
-                temp = []
-            temp.append(t)
-    # a table can't have just 1 column, can it?
-    elements = filter(lambda x: x != 1, elements)
-    # mode = int(sys.argv[2]) if sys.argv[2] else max(set(elements), key=elements.count)
-    mode = max(set(elements), key=elements.count)
-    columns = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
-    columns = overlap(sorted(columns))
-    columns = [(c[0] + c[1]) / 2.0 for c in columns]
-
-    output = [['' for c in columns] for d in data]
-    for row, d in enumerate(data):
-        for t in d:
-            cog = (t.x0 + t.x1) / 2.0
-            diff = [(i, abs(cog - c)) for i, c in enumerate(columns)]
-            idx = min(diff, key=lambda x: x[1])
-            if output[row][idx[0]]:
-                output[row][idx[0]] += ' ' + t.get_text().strip()
-            else:
-                output[row][idx[0]] = t.get_text().strip()
-
-    csvname = filename.split('.')[0] + '.csv'
-    csvpath = os.path.join(pdf_dir, csvname)
-    with open(csvpath, 'w') as outfile:
-        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-        for row in output:
-            writer.writerow([cell.encode('utf-8') for cell in row])
--- a/camelot.py
+++ b/camelot.py
@ -1,136 +1,258 @@
+#!/usr/bin/env python2
 import os
 import re
+import csv
+import sys
 import glob
 import time
 import shutil
 import logging
+import zipfile
+import tempfile
 import subprocess
-import argparse
+from docopt import docopt
+from werkzeug.utils import secure_filename

-from basic import basic
-from spreadsheet import spreadsheet
+from lattice import lattice
+from stream import stream
+
+
+doc = """
+camelot parses tables from PDFs!
+
+usage:
+ camelot.py [options] <method> [<args>...]
+
+options:
+ -h, --help                Show this screen.
+ -v, --version             Show version.
+ -p, --pages <pageno>      Comma-separated list of page numbers.
+                           Example: -p 1,3-6,10  [default: 1]
+ -f, --format <format>     Output format. (csv,xlsx) [default: csv]
+ -l, --log                 Print log to file.
+ -o, --output <directory>  Output directory.
+
+camelot methods:
+ lattice  Looks for lines between data.
+ stream   Looks for spaces between data.
+
+See 'camelot <method> -h' for more information on a specific method.
+"""
+
+lattice_doc = """
+Lattice method looks for lines between data to form a table.
+
+usage:
+ camelot.py lattice [options] [--] <file>
+
+options:
+ -F, --fill <fill>      Fill data in horizontal and/or vertical spanning
+                        cells. Example: -F h, -F v, -F hv
+ -s, --scale <scale>    Scaling factor. Large scaling factor leads to
+                        smaller lines being detected. [default: 15]
+ -j, --jtol <jtol>      Tolerance to account for when comparing joint
+                        and line coordinates. [default: 2]
+ -m, --mtol <mtol>      Tolerance to account for when merging lines
+                        which are very close. [default: 2]
+ -i, --invert           Invert pdf image to make sure that lines are
+                        in foreground.
+ -d, --debug <debug>    Debug by visualizing pdf geometry.
+                        (contour,line,joint,table) Example: -d table
+"""
+
+stream_doc = """
+Stream method looks for spaces between data to form a table.
+
+usage:
+ camelot.py stream [options] [--] <file>
+
+options:
+ -n, --ncols <ncols>      Number of columns. [default: 0]
+ -c, --columns <columns>  Comma-separated list of column x-coordinates.
+                          Example: -c 10.1,20.2,30.3
+ -M, --cmargin <cmargin>  Char margin. Chars closer than cmargin are
+                          grouped together to form a word. [default: 2.0]
+ -L, --lmargin <lmargin>  Line margin. Lines closer than lmargin are
+                          grouped together to form a textbox. [default: 0.5]
+ -W, --wmargin <wmargin>  Word margin. Insert blank spaces between chars
+                          if distance between words is greater than word
+                          margin. [default: 0.1]
+ -d, --debug              Debug by visualizing textboxes.
+"""

 pno = re.compile(r'\d+')


-def mkdir(directory):
-    if not os.path.isdir(directory):
-        os.makedirs(directory)
-
-
-def filesort(filename):
-    filename = filename.split('/')[-1]
+def filesort(filepath):
+    """Build a numeric sort key from the digit groups in a file's basename.
+
+    The directory part of `filepath` is ignored; the module-level `pno`
+    pattern (one or more digits) is applied to the basename only.
+
+    Parameters
+    ----------
+    filepath : string
+
+    Returns
+    -------
+    key : tuple
+        (first, second) as ints when the basename contains exactly two
+        digit groups, otherwise (first, 0).
+
+    Raises
+    ------
+    IndexError
+        If the basename contains no digits at all.
+    """
+    filename = os.path.basename(filepath)
    num = pno.findall(filename)
    if len(num) == 2:
        return (int(num[0]), int(num[1]))
    else:
        return (int(num[0]), 0)

-start_time = time.time()
-CAMELOT_DIR = '.camelot/'
-mkdir(CAMELOT_DIR)

-parser = argparse.ArgumentParser(
-    description='Parse tables from pdfs!', usage='python2 camelot.py [options] file')
-parser.add_argument('-p', '--pages', nargs='+', action='store', dest='pages',
-                    help='Specify the page numbers and/or page ranges to be parsed. Example: -p="1 3-5 9", -p="all" (default: 1)')
-parser.add_argument('-f', '--format', nargs=1, action='store', dest='format',
-                    help='Output format (csv/xlsx). Example: -f="xlsx" (default: csv)', default=["csv"])
-parser.add_argument('-s', '--spreadsheet', action='store_true', dest='spreadsheet',
-                    help='Extract tables with ruling lines. (default: False)')
-parser.add_argument('-i', '--fill', action='store', dest='fill',
-                    help='Fill the values in empty cells horizontally(h) and/or vertically(v). Example: -F="h", -F="v", -F="hv" (default: None)', default=None)
-parser.add_argument('-c', '--scale', nargs='?', action='store', dest='scale',
-                    help='Scaling factor. Large scaling factor leads to smaller lines being detected. (default: 15)', default=15, type=int)
-parser.add_argument('-j', '--jtol', nargs='?', action='store',
-                    dest='jtol', help='Tolerance to account for when comparing joint and line coordinates. (default: 2)', default=2, type=int)
-parser.add_argument('-t', '--mtol', nargs='?', action='store',
-                    dest='mtol', help='Tolerance to account for when merging lines which are very close. (default: 2)', default=2, type=int)
-parser.add_argument('-n', '--invert', action='store_true', dest='invert',
-                    help='Make sure lines are in foreground. (default: False)')
-parser.add_argument('-d', '--debug', nargs=1, action='store', dest='debug',
-                    help='Debug by visualizing contours, lines, joints, tables. Example: --debug="contours"')
-parser.add_argument('-M', '--char-margin', nargs='?', action='store', dest='char_margin',
-                    help='(default: 2.0)', default=2.0, type=float)
-parser.add_argument('-L', '--line-margin', nargs='?', action='store', dest='line_margin',
-                    help='(default: 0.5)', default=0.5, type=float)
-parser.add_argument('-W', '--word-margin', nargs='?', action='store', dest='word_margin',
-                    help='(default: 0.1)', default=0.1, type=float)
-parser.add_argument('-o', '--output', nargs=1, action='store', dest='output',
-                    help='Specify output directory.')
-parser.add_argument('file', nargs=1)
+if __name__ == '__main__':
+    start_time = time.time()
+    tmpdir = tempfile.mkdtemp()

-result = parser.parse_args()
+    args = docopt(doc, version='0.1', options_first=True)
+    argv = [args['<method>']] + args['<args>']
+    if args['<method>'] == 'lattice':
+        args.update(docopt(lattice_doc, argv=argv))
+    elif args['<method>'] == 'stream':
+        args.update(docopt(stream_doc, argv=argv))

-if result.pages:
-    if result.pages == ['all']:
-        p = result.pages
+    # Expand the --pages option into a sorted list of page-number strings,
+    # or into the sentinel list ['all'] that the page-separation step checks.
+    if args['--pages']:
+        # docopt returns the option value as a plain string, so compare
+        # against 'all' itself rather than a one-element list.
+        if args['--pages'] == 'all':
+            p = ['all']
+        else:
+            p = []
+            for r in args['--pages'].split(','):
+                if '-' in r:
+                    a, b = r.split('-')
+                    a, b = int(a), int(b)
+                    p.extend([str(i) for i in range(a, b + 1)])
+                else:
+                    p.extend([str(r)])
    else:
-        p = []
-        for r in result.pages[0].split(' '):
-            if '-' in r:
-                a, b = r.split('-')
-                a, b = int(a), int(b)
-                p.extend([str(i) for i in range(a, b + 1)])
-            else:
-                p.extend([str(r)])
-else:
-    p = ['1']
-p = sorted(set(p))
+        p = ['1']
+    p = sorted(set(p))

-filename = result.file[0].split('/')[-1]
-# pdf_dir = os.path.join(CAMELOT_DIR, os.urandom(16).encode('hex'))
-pdf_dir = os.path.join(CAMELOT_DIR, filename.split('.')[0])
-mkdir(pdf_dir)
-logging.basicConfig(filename=os.path.join(pdf_dir, filename.split('.')[
-                    0] + '.log'), filemode='w', level=logging.DEBUG)
+    fname = os.path.basename(args['<file>'])
+    fname = secure_filename(fname)
+    fdir = os.path.dirname(args['<file>'])
+    froot, fext = os.path.splitext(fname)
+    if fext.lower() != '.pdf':
+        print "camelot can parse only pdfs right now"
+        sys.exit()

-shutil.copy(result.file[0], os.path.join(pdf_dir, filename))
-print "separating pdf into pages"
-print
-if p == ['all']:
-    subprocess.call(['pdfseparate', os.path.join(
-        pdf_dir, filename), os.path.join(pdf_dir, 'pg-%d.pdf')])
-else:
-    for page in p:
-        subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(
-            pdf_dir, filename), os.path.join(pdf_dir, 'pg-' + page + '.pdf')])
+    logfname = os.path.join(tmpdir, froot + '.log')
+    logging.basicConfig(filename=logfname, filemode='w', level=logging.DEBUG)

-if result.spreadsheet:
-    print "using the spreadsheet method"
-    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        print "converting", g.split('/')[-1], "to image"
-        os.system(' '.join(['convert', '-density', '300',
-                            g, '-depth', '8', g[:-4] + '.png']))
-        try:
-            spreadsheet(pdf_dir, g.split('/')[-1], result.fill, result.scale,
-                        result.jtol, result.mtol, result.invert, result.debug,
-                        result.char_margin, result.line_margin, result.word_margin)
-        except:
-          logging.error("Couldn't parse " + g.split('/')[-1])
-          print "Couldn't parse", g.split('/')[-1]
-else:
-    print "using the basic method"
-    for g in sorted(glob.glob(os.path.join(pdf_dir, 'pg-*.pdf'))):
-        basic(pdf_dir, g.split('/')[-1], result.char_margin, result.line_margin, result.word_margin)
-
-if result.format == ['xlsx']:
-    import csv
-    from pyexcel_xlsx import save_data
-    from collections import OrderedDict
-    data = OrderedDict()
-    for c in sorted(glob.glob(os.path.join(pdf_dir, '*.csv')), key=filesort):
-        print "adding", c.split('/')[-1], "to excel file"
-        with open(c, 'r') as csvfile:
-            reader = csv.reader(csvfile)
-            data.update({c.split('/')[-1].split('.')
-                         [0]: [row for row in reader]})
-    xlsxname = filename.split('.')[0] + '.xlsx'
-    xlsxpath = os.path.join(pdf_dir, xlsxname)
-    save_data(xlsxpath, data)
+    shutil.copy(args['<file>'], os.path.join(tmpdir, fname))
+    print "separating pdf into pages"
    print
-    print "saved as", xlsxname
+    if p == ['all']:
+        subprocess.call(['pdfseparate', os.path.join(tmpdir, fname), os.path.join(tmpdir,
+                        'pg-%d.pdf')])
+    else:
+        for page in p:
+            subprocess.call(['pdfseparate', '-f', page, '-l', page, os.path.join(tmpdir, fname),
+                            os.path.join(tmpdir, 'pg-%s.pdf' % page)])

-print "finished in", time.time() - start_time, "seconds"
-logging.info("Time taken for " + filename + ": " +
-             str(time.time() - start_time) + " seconds")
+    # Run the selected method on every separated page and write the
+    # resulting tables as CSV files next to the page inside tmpdir.
+    glob_pdf = sorted(glob.glob(os.path.join(tmpdir, 'pg-*.pdf')))
+    if args['<method>'] == 'lattice':
+        print "using the lattice method"
+        for g in glob_pdf:
+            g_fname = os.path.basename(g)
+            g_froot, __ = os.path.splitext(g)
+            print "converting %s to image" % g_fname
+            # Argument list instead of a shell string: the path is passed
+            # verbatim even if it contains spaces or shell metacharacters.
+            subprocess.call(['convert', '-density', '300',
+                             g, '-depth', '8', g_froot + '.png'])
+            try:
+                data = lattice(g, f=args['--fill'], s=int(args['--scale']),
+                               jtol=int(args['--jtol']), mtol=int(args['--mtol']),
+                               invert=args['--invert'], debug=args['--debug'])
+                if data is None:
+                    print
+                    print "See 'camelot lattice -h' for various parameters you can tweak."
+                    # sys.exit() bypasses the cleanup at the end of the
+                    # script, so release the temporary directory here.
+                    logging.shutdown()
+                    shutil.rmtree(tmpdir, ignore_errors=True)
+                    sys.exit()
+                for k in sorted(data.keys()):
+                    csvfile = g_froot + '_%s.csv' % k
+                    with open(csvfile, 'w') as outfile:
+                        writer = csv.writer(outfile)
+                        for d in data[k]:
+                            # lattice() already encodes cell text to UTF-8;
+                            # encoding a byte string again makes Python 2
+                            # decode it as ASCII first and fail on
+                            # non-ASCII text. Only encode unicode objects.
+                            writer.writerow([c.encode('utf-8')
+                                             if isinstance(c, unicode) else c
+                                             for c in d])
+                        print "saved as", os.path.basename(csvfile)
+                print
+            except Exception:
+                logging.exception("")
+                print "couldn't parse", g_fname, "see log for more info"
+                print
+    elif args['<method>'] == 'stream':
+        print "using the stream method"
+        for g in glob_pdf:
+            g_fname = os.path.basename(g)
+            g_froot, __ = os.path.splitext(g)
+            try:
+                data = stream(g, ncolumns=int(args['--ncols']), columns=args['--columns'],
+                              char_margin=float(args['--cmargin']),
+                              line_margin=float(args['--lmargin']),
+                              word_margin=float(args['--wmargin']),
+                              debug=args['--debug'])
+                if data is None:
+                    print
+                    print "See 'camelot stream -h' for various parameters you can tweak."
+                    # sys.exit() bypasses the cleanup at the end of the
+                    # script, so release the temporary directory here.
+                    logging.shutdown()
+                    shutil.rmtree(tmpdir, ignore_errors=True)
+                    sys.exit()
+                csvfile = g_froot + '.csv'
+                with open(csvfile, 'w') as outfile:
+                    writer = csv.writer(outfile)
+                    for d in data:
+                        # The cell type returned by stream() is not visible
+                        # here; encode only unicode objects so byte strings
+                        # pass through unchanged.
+                        writer.writerow([c.encode('utf-8')
+                                         if isinstance(c, unicode) else c
+                                         for c in d])
+                    print "saved as", os.path.basename(csvfile)
+                    print
+            except Exception:
+                logging.exception("")
+                print "couldn't parse", g_fname, "see log for more info"
+                print
+
+    # Record the timing and flush the log before it is copied out: the log
+    # file lives in tmpdir, which is deleted at the end of the script.
+    logging.info("Time taken for " + fname + ": " +
+                 str(time.time() - start_time) + " seconds")
+    logging.shutdown()
+
+    # Results go to --output when given, otherwise next to the input file.
+    # os.path.dirname() is '' for a bare filename, which shutil.copy cannot
+    # use as a destination, so fall back to the current directory.
+    outdir = args['--output'] or fdir or os.curdir
+    # Without this, copying into a missing directory would create a regular
+    # file with that name and later copies would overwrite it.
+    if not os.path.isdir(outdir):
+        os.makedirs(outdir)
+
+    glob_csv = sorted(glob.glob(os.path.join(tmpdir, '*.csv')), key=filesort)
+    if args['--format'] == 'csv':
+        if len(glob_csv) == 1:
+            shutil.copy(glob_csv[0], outdir)
+            if args['--log']:
+                shutil.copy(logfname, outdir)
+        else:
+            # Several tables: bundle them in one archive under a folder
+            # named after the input file.
+            zipname = froot + '.zip'
+            zippath = os.path.join(tmpdir, zipname)
+            print "zipping 'em up"
+            with zipfile.ZipFile(zippath, 'a', zipfile.ZIP_DEFLATED) as myzip:
+                for g in glob_csv:
+                    myzip.write(g, os.path.join(froot, os.path.basename(g)))
+            shutil.copy(zippath, outdir)
+            if args['--log']:
+                shutil.copy(logfname, outdir)
+            print
+    elif args['--format'] == 'xlsx':
+        from pyexcel_xlsx import save_data
+        from collections import OrderedDict
+        # One sheet per CSV, keyed by the CSV's name without extension.
+        data = OrderedDict()
+        for c in glob_csv:
+            c_fname = os.path.basename(c)
+            print "adding", c_fname, "to excel file"
+            with open(c, 'r') as csvfile:
+                reader = csv.reader(csvfile)
+                c_froot, __ = os.path.splitext(c_fname)
+                data.update({c_froot: [row for row in reader]})
+        xlsxname = froot + '.xlsx'
+        xlsxpath = os.path.join(tmpdir, xlsxname)
+        save_data(xlsxpath, data)
+        shutil.copy(xlsxpath, outdir)
+        if args['--log']:
+            shutil.copy(logfname, outdir)
+        print
+        print "saved as", xlsxname
+
+    print "cleaning up..."
+    shutil.rmtree(tmpdir)
+
+    print "finished in", time.time() - start_time, "seconds"
--- a/cell.py
+++ b/cell.py
@ -1,6 +1,44 @@
 class Cell:
+    """Cell

+    Parameters
+    ----------
+    x1 : int
+
+    y1 : int
+
+    x2 : int
+
+    y2 : int
+
+    Attributes
+    ----------
+    lb : tuple
+
+    lt : tuple
+
+    rb : tuple
+
+    rt : tuple
+
+    bbox : tuple
+
+    left : bool
+
+    right : bool
+
+    top : bool
+
+    bottom : bool
+
+    text : string
+
+    spanning_h : bool
+
+    spanning_v : bool
+    """
    def __init__(self, x1, y1, x2, y2):
+    
        self.lb = (x1, y1)
        self.lt = (x1, y2)
        self.rb = (x2, y1)
@ -15,10 +53,28 @@ class Cell:
        self.spanning_v = False

    def add_text(self, text):
-        self.text += text
+        """Add text to cell object.
+
+        Parameters
+        ----------
+        text : string
+        """
+        self.text = ''.join([self.text, text])

    def get_text(self):
+        """Get text from cell object.
+        
+        Returns
+        -------
+        text : string
+        """
        return self.text

    def get_bounded_edges(self):
+        """Get number of edges by which a cell is bounded.
+
+        Returns
+        -------
+        bounded_edges : int
+        """
        return self.top + self.bottom + self.left + self.right
--- a/spreadsheet.py
+++ b/spreadsheet.py
@ -1,37 +1,173 @@
 import os
-import csv
 import cv2
 import glob
 import numpy as np

 from table import Table
 from pdf import get_pdf_info
-from morph_transform import morph_transform
 from utils import (translate, scale, merge_close_values, get_row_idx,
                   get_column_idx, reduce_index, outline, fill, remove_empty)


-def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
-                char_margin, line_margin, word_margin):
+def morph_transform(img, s=15, invert=False):
+    """Morphological Transformation
+
+    Applies a series of morphological operations on the image
+    to find table contours, line joints and line segments.
+    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
+
+    adaptiveThreshold is called with blockSize=15 and C=-0.2; the
+    empirical basis cited for the threshold parameters is
+    http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
+
+    Parameters
+    ----------
+    img : ndarray
+        Image that is converted with cv2.COLOR_BGR2GRAY, e.g. the
+        result of cv2.imread.
+
+    s : int, default: 15, optional
+        Scaling factor. The line-detection kernels are 1/s of the image
+        height (vertical) and width (horizontal), so a large scaling
+        factor leads to smaller lines being detected.
+
+    invert : bool, default: False, optional
+        If True, the grayscale image is thresholded as-is. If False,
+        its bitwise inverse (np.invert) is thresholded instead.
+
+    Returns
+    -------
+    tables : dict
+        Dictionary with table bounding box (x, y + h, x + w, y) as key
+        and list of (x, y) joint centres found in the table as value,
+        all in image pixel coordinates.
+
+    v_segments : list
+        List of vertical line segments found in the image, each as
+        (x, y2, x, y1) with x the horizontal centre of the segment.
+
+    h_segments : list
+        List of horizontal line segments found in the image, each as
+        (x1, y, x2, y) with y the vertical centre of the segment.
+    """
+    # NOTE(review): img_x and img_y are not used again in this function.
+    img_x, img_y = img.shape[1], img.shape[0]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    if invert:
+        threshold = cv2.adaptiveThreshold(
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    else:
+        threshold = cv2.adaptiveThreshold(np.invert(
+            gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
+    vertical = threshold
+    horizontal = threshold
+
+    scale = s
+    # kernel lengths are the image height/width divided by the scaling
+    # factor (integer division under Python 2)
+    verticalsize = vertical.shape[0] / scale
+    horizontalsize = horizontal.shape[1] / scale
+
+    # 1-pixel-wide vertical kernel and 1-pixel-high horizontal kernel
+    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
+    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
+
+    # erode then dilate with the vertical kernel
+    vertical = cv2.erode(vertical, ver, (-1, -1))
+    vertical = cv2.dilate(vertical, ver, (-1, -1))
+
+    # erode then dilate with the horizontal kernel
+    horizontal = cv2.erode(horizontal, hor, (-1, -1))
+    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
+
+    # mask combines both line images; joints are the pixels set in both
+    mask = vertical + horizontal
+    joints = np.bitwise_and(vertical, horizontal)
+    # NOTE(review): the three-value return of findContours looks like the
+    # OpenCV 3.x API -- confirm against the installed version.
+    __, contours, __ = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # keep only the 10 largest contours by area
+    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
+
+    tables = {}
+    for c in contours:
+        c_poly = cv2.approxPolyDP(c, 3, True)
+        x, y, w, h = cv2.boundingRect(c_poly)
+        # find the joints that lie inside the bounding rectangle of this
+        # contour
+        roi = joints[y : y + h, x : x + w]
+        __, jc, __ = cv2.findContours(
+            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+        if len(jc) <= 4:  # skip contours that have 4 or fewer joints
+            continue
+        joint_coords = []
+        for j in jc:
+            jx, jy, jw, jh = cv2.boundingRect(j)
+            # centre of the joint, shifted from roi back to image coordinates
+            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
+            joint_coords.append((c1, c2))
+        tables[(x, y + h, x + w, y)] = joint_coords
+
+    v_segments, h_segments = [], []
+    _, vcontours, _ = cv2.findContours(
+        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for vc in vcontours:
+        x, y, w, h = cv2.boundingRect(vc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        # vertical segment through the horizontal centre of the rectangle
+        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
+
+    _, hcontours, _ = cv2.findContours(
+        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for hc in hcontours:
+        x, y, w, h = cv2.boundingRect(hc)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        # horizontal segment through the vertical centre of the rectangle
+        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
+
+    return tables, v_segments, h_segments
+
+
+def lattice(filepath, f=None, s=15, jtol=2, mtol=2, invert=False, debug=None):
+    """Lattice algorithm
+
+    Makes table using pdf geometry information returned by
+    morph_transform and fills data returned by PDFMiner in table cells.
+
+    Parameters
+    ----------
+    filepath : string
+
+    f : string, default: None, optional
+        Fill data in horizontal and/or vertical spanning
+        cells. ('h', 'v', 'hv')
+
+    s : int, default: 15, optional
+        Scaling factor. Large scaling factor leads to smaller lines
+        being detected.
+
+    jtol : int, default: 2, optional
+        Tolerance to account for when comparing joint and line
+        coordinates.
+
+    mtol : int, default: 2, optional
+        Tolerance to account for when merging lines which are
+        very close.
+
+    invert : bool, default: False, optional
+        Invert pdf image to make sure that lines are in foreground.
+
+    debug : string
+        Debug by visualizing pdf geometry.
+        ('contour', 'line', 'joint', 'table')
+    Returns
+    -------
+    output : dict
+        Dictionary with table number as key and list of data as value.
+    """
    if debug:
        import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
+    filename = os.path.basename(filepath)
    print "working on", filename
-    imagename = os.path.join(pdf_dir, filename.split('.')[0] + '.png')
+    fileroot, __ = os.path.splitext(filepath)
+    imagename = fileroot + '.png'
    img = cv2.imread(imagename)
    img_x, img_y = img.shape[1], img.shape[0]
-    text, pdf_x, pdf_y = get_pdf_info(
-        os.path.join(pdf_dir, filename), 'spreadsheet',
-        char_margin, line_margin, word_margin)
+    text, pdf_x, pdf_y = get_pdf_info(filepath, method='lattice')
    scaling_factor_x = pdf_x / float(img_x)
    scaling_factor_y = pdf_y / float(img_y)
-    tables, v_segments, h_segments = morph_transform(imagename, s, invert)
+    tables, v_segments, h_segments = morph_transform(img, s=s, invert=invert)

-    if debug == ["contours"]:
+    if debug == "contour":
        for t in tables.keys():
            cv2.rectangle(img, (t[0], t[1]), (t[2], t[3]), (255, 0, 0), 3)
        plt.imshow(img)
-    if debug == ["joints"]:
+        plt.show()
+        return None
+    if debug == "joint":
        x_coord = []
        y_coord = []
        for k in tables.keys():
@ -42,6 +178,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
        plt.plot(x_coord, y_coord, 'ro')
        plt.axis([0, max_x + 100, max_y + 100, 0])
        plt.imshow(img)
+        plt.show()
+        return None

    # detect if vertical
    num_v = [t for t in text if (not t.upright) and t.get_text().strip()]
@ -80,7 +218,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
            abs(translate(-img_y, h[3])), scaling_factor_y)
        h_segments_new.append((x1, y1, x2, y2))

-    num_tables = 0
+    num_tables = 1
+    output = {}
    # sort tables based on y-coord
    for k in sorted(tables_new.keys(), key=lambda x: x[1], reverse=True):
        # find rows and columns that lie in table
@ -91,19 +230,21 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
        h_s = [h for h in h_segments_new if h[0] > lb[0] - 2 and h[2]
               < rt[0] + 2 and lb[1] - 2 <= h[1] <= rt[1] + 2]

-        if debug == ["lines"]:
+        if debug == "line":
            for v in v_s:
                plt.plot([v[0], v[2]], [v[1], v[3]])
            for h in h_s:
                plt.plot([h[0], h[2]], [h[1], h[3]])
+            plt.show()
+            return None

        columns, rows = zip(*tables_new[k])
        columns, rows = list(columns), list(rows)
        columns.extend([lb[0], rt[0]])
        rows.extend([lb[1], rt[1]])
        # sort horizontal and vertical segments
-        columns = merge_close_values(sorted(columns), mtol)
-        rows = merge_close_values(sorted(rows, reverse=True), mtol)
+        columns = merge_close_values(sorted(columns), mtol=mtol)
+        rows = merge_close_values(sorted(rows, reverse=True), mtol=mtol)
        # make grid using x and y coord of shortlisted rows and columns
        columns = [(columns[i], columns[i + 1])
                   for i in range(0, len(columns) - 1)]
@ -111,13 +252,13 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,

        table = Table(columns, rows)
        # light up cell edges
-        table = table.set_edges(v_s, h_s, jtol)
+        table = table.set_edges(v_s, h_s, jtol=jtol)
        # table set span method
        table = table.set_spanning()
-        # TODO
+        # light up table border
        table = outline(table)

-        if debug == ["tables"]:
+        if debug == "table":
            for i in range(len(table.cells)):
                for j in range(len(table.cells[i])):
                    if table.cells[i][j].left:
@ -132,8 +273,8 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                    if table.cells[i][j].bottom:
                        plt.plot([table.cells[i][j].lb[0], table.cells[i][j].rb[0]],
                                 [table.cells[i][j].lb[1], table.cells[i][j].rb[1]])
-        if debug:
            plt.show()
+            return None

        # fill text after sorting it
        if not rotated:
@ -152,26 +293,20 @@ def spreadsheet(pdf_dir, filename, fill, s, jtol, mtol, invert, debug,
                r_idx, c_idx = reduce_index(table, rotated, r_idx, c_idx)
                table.cells[r_idx][c_idx].add_text(t.get_text().strip('\n'))

-        if fill:
-            table = fill(table, fill)
+        if f is not None:
+            table = fill(table, f=f)

        data = []
        for i in range(len(table.cells)):
            data.append([table.cells[i][j].get_text().strip().encode('utf-8')
-                         for j in range(len(table.cells[i]))])
+                        for j in range(len(table.cells[i]))])
        if rotated == 'left':
            data = zip(*data[::-1])
        elif rotated == 'right':
            data = zip(*data[::1])
            data.reverse()
        data = remove_empty(data)
-        csvname = filename.split(
-            '.')[0] + ('_table_%d' % (num_tables + 1)) + '.csv'
-        csvpath = os.path.join(pdf_dir, csvname)
-        with open(csvpath, 'w') as outfile:
-            writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
-            for d in data:
-                writer.writerow(d)
-            print "saved as", csvname
-            print
+        output['table_%d' % num_tables] = data
        num_tables += 1
+
+    return output
--- a/morph_transform.py
+++ b/morph_transform.py
@ -1,75 +0,0 @@
-import cv2
-import numpy as np
-
-
-def morph_transform(imagename, s, invert):
-    # http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
-    img = cv2.imread(imagename)
-    img_x, img_y = img.shape[1], img.shape[0]
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    # empirical result taken from
-    # http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-    if invert:
-        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-    else:
-        threshold = cv2.adaptiveThreshold(np.invert(
-            gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -0.2)
-    vertical = threshold
-    horizontal = threshold
-
-    scale = s
-    verticalsize = vertical.shape[0] / scale
-    horizontalsize = horizontal.shape[1] / scale
-
-    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
-    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
-
-    vertical = cv2.erode(vertical, ver, (-1, -1))
-    vertical = cv2.dilate(vertical, ver, (-1, -1))
-
-    horizontal = cv2.erode(horizontal, hor, (-1, -1))
-    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
-
-    mask = vertical + horizontal
-    joints = np.bitwise_and(vertical, horizontal)
-    _, contours, _ = cv2.findContours(
-        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
-
-    tables = {}
-    for c in contours:
-        c_poly = cv2.approxPolyDP(c, 3, True)
-        x, y, w, h = cv2.boundingRect(c_poly)
-        # find number of non-zero values in joints using what boundingRect
-        # returns
-        roi = joints[y:y + h, x:x + w]
-        _, jc, _ = cv2.findContours(
-            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        if len(jc) <= 4:  # remove contours with less than <=4 joints
-            continue
-        joint_coords = []
-        for j in jc:
-            jx, jy, jw, jh = cv2.boundingRect(j)
-            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
-            joint_coords.append((c1, c2))
-        tables[(x, y + h, x + w, y)] = joint_coords
-
-    v_segments, h_segments = [], []
-    _, vcontours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for vc in vcontours:
-        x, y, w, h = cv2.boundingRect(vc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-
-    _, hcontours, _ = cv2.findContours(
-        horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for hc in hcontours:
-        x, y, w, h = cv2.boundingRect(hc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
-
-    return tables, v_segments, h_segments
--- a/pdf.py
+++ b/pdf.py
@ -9,35 +9,86 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar, LTTextLineHorizontal


-def parse_text_basic(layout, t=None):
+def parse_text_stream(layout, t=None):
+    """Recursively parse pdf layout to get a list of
+    LTTextLineHorizontal objects.
+
+    Parameters
+    ----------
+    layout : object
+
+    t : list
+
+    Returns
+    -------
+    t : list
+    """
    if t is None:
        t = []
    try:
        for obj in layout._objs:
-            if type(obj) is LTTextLineHorizontal:
+            if isinstance(obj, LTTextLineHorizontal):
                t.append(obj)
            else:
-                t += parse_text_basic(obj)
+                t += parse_text_stream(obj)
    except AttributeError:
        pass
    return t


-def parse_text_spreadsheet(layout, t=None):
+def parse_text_lattice(layout, t=None):
+    """Recursively parse pdf layout to get a list of
+    LTChar objects.
+    
+    Parameters
+    ----------
+    layout : object
+
+    t : list
+
+    Returns
+    -------
+    t : list
+    """
    if t is None:
        t = []
    try:
        for obj in layout._objs:
-            if type(obj) is LTChar:
+            if isinstance(obj, LTChar):
                t.append(obj)
            else:
-                t += parse_text_spreadsheet(obj)
+                t += parse_text_lattice(obj)
    except AttributeError:
        pass
    return t


-def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
+def get_pdf_info(pdfname, method=None, char_margin=2.0, line_margin=0.5,
+                 word_margin=0.1):
+    """Get list of text objects along with pdf width and height.
+
+    Parameters
+    ----------
+    pdfname : string
+
+    method : string
+
+    char_margin : float
+
+    line_margin : float
+
+    word_margin : float
+
+    Returns
+    -------
+    text : list
+
+    pdf_x : int
+
+    pdf_y : int
+    """
+    if not method:
+        return None
    with open(pdfname, 'r') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
@ -52,9 +103,9 @@ def get_pdf_info(pdfname, method, char_margin, line_margin, word_margin):
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
-            if method == 'basic':
-                text = parse_text_basic(layout)
-            elif method == 'spreadsheet':
-                text = parse_text_spreadsheet(layout)
+            if method == 'stream':
+                text = parse_text_stream(layout)
+            elif method == 'lattice':
+                text = parse_text_lattice(layout)
            pdf_x, pdf_y = layout.bbox[2], layout.bbox[3]
    return text, pdf_x, pdf_y
--- a/stream.py
+++ b/stream.py
@ -0,0 +1,143 @@
+import os
+import numpy as np
+
+from pdf import get_pdf_info
+
+
+def overlap(l):
+    """Groups overlapping columns and returns list with updated
+    columns boundaries.
+
+    Parameters
+    ----------
+    l : list
+        List of column x-coordinates.
+
+    Returns
+    -------
+    merged : list
+        List of merged column x-coordinates.
+    """
+    merged = []
+    for higher in l:
+        if not merged:
+            merged.append(higher)
+        else:
+            lower = merged[-1]
+            if higher[0] <= lower[1]:
+                upper_bound = max(lower[1], higher[1])
+                lower_bound = min(lower[0], higher[0])
+                merged[-1] = (lower_bound, upper_bound)
+            else:
+                merged.append(higher)
+    return merged
+
+
+def stream(filepath, ncolumns=0, columns=None, char_margin=2.0,
+           line_margin=0.5, word_margin=0.1, debug=False):
+    """Stream algorithm
+
+    Groups data returned by PDFMiner into rows and finds mode of the
+    number of elements in each row to guess number of columns.
+
+    Parameters
+    ----------
+    filepath : string
+
+    ncolumns : int, default: 0, optional
+        Number of columns.
+
+    columns : string, default: None, optional
+        Comma-separated list of column x-coordinates.
+
+    char_margin : float, default: 2.0, optional
+        Char margin. Chars closer than char_margin are grouped together
+        to form a word.
+
+    line_margin : float, default: 0.5, optional
+        Line margin. Lines closer than line_margin are grouped together
+        to form a textbox.
+
+    word_margin : float, default: 0.1, optional
+        Word margin. Insert blank spaces between chars if distance
+        between words is greater than word margin.
+
+    debug : bool, default: False, optional
+        Debug by visualizing textboxes.
+
+    Returns
+    -------
+    output : list
+    """
+    filename = os.path.basename(filepath)
+    print "working on", filename
+    text, __, __ = get_pdf_info(filepath, method='stream', char_margin=char_margin,
+                                line_margin=line_margin, word_margin=word_margin)
+    text.sort(key=lambda x: (-x.y0, x.x0))
+    y_last = 0
+    data = []
+    temp = []
+    elements = []
+    for t in text:
+        # is checking for upright necessary?
+        # if t.get_text().strip() and all([obj.upright for obj in t._objs if
+        # type(obj) is LTChar]):
+        if t.get_text().strip():
+            if not np.isclose(y_last, t.y0, atol=2):
+                y_last = t.y0
+                elements.append(len(temp))
+                data.append(temp)
+                temp = []
+            temp.append(t)
+
+    if debug:
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect='equal')
+        xs, ys = [], []
+        for d in data:
+            for t in d:
+                xs.extend([t.x0, t.x1])
+                ys.extend([t.y0, t.y1])
+                ax.add_patch(
+                    patches.Rectangle(
+                        (t.x0, t.y0),
+                        t.x1 - t.x0,
+                        t.y1 - t.y0
+                    )
+                )
+        ax.set_xlim(min(xs) - 10, max(xs) + 10)
+        ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        plt.show()
+        return None
+
+    if columns:
+        cols = [(float(columns[i]), float(columns[i + 1]))
+                for i in range(0, len(columns) - 1)]
+        cols = [(c[0] + c[1]) / 2.0 for c in cols]
+    else:
+        # a table can't have just 1 column, can it?
+        elements = filter(lambda x: x != 1, elements)
+        mode = ncolumns if ncolumns else max(set(elements), key=elements.count)
+        cols = [(t.x0, t.x1) for d in data for t in d if len(d) == mode]
+        cols = overlap(sorted(cols))
+        cols = [(c[0] + c[1]) / 2.0 for c in cols]
+
+    output = [['' for c in cols] for d in data]
+    for row, d in enumerate(data):
+        for t in d:
+            cog = (t.x0 + t.x1) / 2.0
+            diff = [(i, abs(cog - c)) for i, c in enumerate(cols)]
+            if diff:
+                idx = min(diff, key=lambda x: x[1])
+            else:
+                print "couldn't find a table on this page"
+                return None
+            if output[row][idx[0]]:
+                output[row][idx[0]] += ' ' + t.get_text().strip()
+            else:
+                output[row][idx[0]] = t.get_text().strip()
+
+    return output
--- a/table.py
+++ b/table.py
@ -4,14 +4,55 @@ from cell import Cell


 class Table:
+    """Table
+    
+    Parameters
+    ----------
+    columns : list
+        List of column x-coordinates.

+    rows : list
+        List of row y-coordinates.
+
+    Attributes
+    ----------
+    cells : list
+        2-D list of cell objects.
+
+    columns : list
+        List of column x-coordinates.
+
+    rows : list
+        List of row y-coordinates.
+    """
    def __init__(self, columns, rows):
+
        self.cells = [[Cell(c[0], r[1], c[1], r[0])
                       for c in columns] for r in rows]
        self.columns = columns
        self.rows = rows

-    def set_edges(self, vertical, horizontal, jtol):
+    def set_edges(self, vertical, horizontal, jtol=2):
+        """Set cell edges to True if corresponding line segments
+        are detected in the pdf image.
+
+        Parameters
+        ----------
+        vertical : list
+            List of vertical line segments.
+
+        horizontal : list
+            List of horizontal line segments.
+
+        jtol : int, default: 2, optional
+            Tolerance to account for when comparing joint and line
+            coordinates.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
        for v in vertical:
            # find closest x coord
            # iterate over y coords and find closest points
@ -117,6 +158,14 @@ class Table:
        return self

    def set_spanning(self):
+        """Set spanning values of a cell to True if it isn't
+        bounded by four edges.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
        for i in range(len(self.cells)):
            for j in range(len(self.cells[i])):
                bound = self.cells[i][j].get_bounded_edges()
@ -125,28 +174,38 @@ class Table:

                elif bound == 3:
                    if not self.cells[i][j].left:
-                        if self.cells[i][j].right and self.cells[i][j].top and self.cells[i][j].bottom:
+                        if (self.cells[i][j].right and
+                                self.cells[i][j].top and 
+                                self.cells[i][j].bottom):
                            self.cells[i][j].spanning_h = True

                    elif not self.cells[i][j].right:
-                        if self.cells[i][j].left and self.cells[i][j].top and self.cells[i][j].bottom:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].top and
+                                self.cells[i][j].bottom):
                            self.cells[i][j].spanning_h = True

                    elif not self.cells[i][j].top:
-                        if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].bottom:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].right and
+                                self.cells[i][j].bottom):
                            self.cells[i][j].spanning_v = True

                    elif not self.cells[i][j].bottom:
-                        if self.cells[i][j].left and self.cells[i][j].right and self.cells[i][j].top:
+                        if (self.cells[i][j].left and
+                                self.cells[i][j].right and
+                                self.cells[i][j].top):
                            self.cells[i][j].spanning_v = True

                elif bound == 2:
                    if self.cells[i][j].left and self.cells[i][j].right:
-                        if not self.cells[i][j].top and not self.cells[i][j].bottom:
+                        if (not self.cells[i][j].top and
+                                not self.cells[i][j].bottom):
                            self.cells[i][j].spanning_v = True

                    elif self.cells[i][j].top and self.cells[i][j].bottom:
-                        if not self.cells[i][j].left and not self.cells[i][j].right:
+                        if (not self.cells[i][j].left and
+                                not self.cells[i][j].right):
                            self.cells[i][j].spanning_h = True

        return self
--- a/utils.py
+++ b/utils.py
@ -2,16 +2,61 @@ import numpy as np


 def translate(x1, x2):
+    """Translate coordinate x2 by x1.
+
+    Parameters
+    ----------
+    x1 : float
+
+    x2 : float
+
+    Returns
+    -------
+    x2 : float
+    """
    x2 += x1
    return x2


 def scale(x, s):
+    """Scale coordinate x by scaling factor s.
+
+    Parameters
+    ----------
+    x : float
+
+    s : float
+
+    Returns
+    -------
+    x : float
+    """
    x *= s
    return x


 def rotate(x1, y1, x2, y2, angle):
+    """Rotate point x2, y2 about point x1, y1 by angle.
+
+    Parameters
+    ----------
+    x1 : float
+
+    y1 : float
+
+    x2 : float
+
+    y2 : float
+
+    angle : float
+        Angle in radians.
+
+    Returns
+    -------
+    xnew : float
+
+    ynew : float
+    """
    s = np.sin(angle)
    c = np.cos(angle)
    x2 = translate(-x1, x2)
@ -23,7 +68,20 @@ def rotate(x1, y1, x2, y2, angle):
    return xnew, ynew


-def remove_close_values(ar, mtol):
+def remove_close_values(ar, mtol=2):
+    """Remove values which are within a tolerance of mtol of another value
+    present in list.
+
+    Parameters
+    ----------
+    ar : list
+
+    mtol : int, default: 2, optional
+
+    Returns
+    -------
+    ret : list
+    """
    ret = []
    for a in ar:
        if not ret:
@ -37,7 +95,20 @@ def remove_close_values(ar, mtol):
    return ret


-def merge_close_values(ar, mtol):
+def merge_close_values(ar, mtol=2):
+    """Merge values which are within a tolerance of mtol by calculating
+    a moving mean.
+
+    Parameters
+    ----------
+    ar : list
+
+    mtol : int, default: 2, optional
+
+    Returns
+    -------
+    ret : list
+    """
    ret = []
    for a in ar:
        if not ret:
@ -53,18 +124,63 @@ def merge_close_values(ar, mtol):


 def get_row_idx(t, rows):
+    """Get index of the row in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    rows : list
+
+    Returns
+    -------
+    r : int
+    """
    for r in range(len(rows)):
        if (t.y0 + t.y1) / 2.0 < rows[r][0] and (t.y0 + t.y1) / 2.0 > rows[r][1]:
            return r


 def get_column_idx(t, columns):
+    """Get index of the column in which the given object falls by
+    comparing their co-ordinates.
+
+    Parameters
+    ----------
+    t : object
+
+    columns : list
+
+    Returns
+    -------
+    c : int
+    """
    for c in range(len(columns)):
        if (t.x0 + t.x1) / 2.0 > columns[c][0] and (t.x0 + t.x1) / 2.0 < columns[c][1]:
            return c


 def reduce_index(t, rotated, r_idx, c_idx):
+    """Shift a text object if it lies within a spanning cell taking
+    into account table rotation.
+
+    Parameters
+    ----------
+    t : object
+
+    rotated : string
+
+    r_idx : int
+
+    c_idx : int
+
+    Returns
+    -------
+    r_idx : int
+
+    c_idx : int
+    """
    if not rotated:
        if t.cells[r_idx][c_idx].spanning_h:
            while not t.cells[r_idx][c_idx].left:
@ -90,6 +206,16 @@ def reduce_index(t, rotated, r_idx, c_idx):


 def outline(t):
+    """Light up table boundary.
+
+    Parameters
+    ----------
+    t : object
+
+    Returns
+    -------
+    t : object
+    """
    for i in range(len(t.cells)):
        t.cells[i][0].left = True
        t.cells[i][len(t.cells[i]) - 1].right = True
@ -99,7 +225,19 @@ def outline(t):
    return t


-def fill(t, f):
+def fill(t, f=None):
+    """Fill spanning cells.
+
+    Parameters
+    ----------
+    t : object
+
+    f : string, default: None, optional
+
+    Returns
+    -------
+    t : object
+    """
    if f == "h":
        for i in range(len(t.cells)):
            for j in range(len(t.cells[i])):
@ -124,6 +262,16 @@ def fill(t, f):


 def remove_empty(d):
+    """Remove empty rows and columns.
+
+    Parameters
+    ----------
+    d : list
+
+    Returns
+    -------
+    d : list
+    """
    for i, row in enumerate(d):
        if row == [''] * len(row):
            d.pop(i)