Add docstrings and update docs

2018-09-09 10:00:22 +05:30 · 2018-09-09 10:00:22 +05:30 · 9878de4dfc
parent 16c6b8d45d
commit 9878de4dfc
16 changed files with 997 additions and 421 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,5 @@ dist/
 .coverage

 .pytest_cache/
+_build/
+_static/
--- a/README.md
+++ b/README.md
@ -23,50 +23,9 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
 >>> df = tables[0].df
 </pre>

-Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
-
-<pre>
-Camelot: PDF parsing made simpler!
-
-usage:
- camelot [options] &lt;method&gt; [&lt;args&gt;...]
-
-options:
- -h, --help                Show this screen.
- -v, --version             Show version.
- -V, --verbose             Verbose.
- -p, --pages &lt;pageno&gt;      Comma-separated list of page numbers.
-                           Example: -p 1,3-6,10  [default: 1]
- -P, --parallel            Parallelize the parsing process.
- -f, --format &lt;format&gt;     Output format. (csv,tsv,html,json,xlsx) [default: csv]
- -l, --log                 Log to file.
- -o, --output &lt;directory&gt;  Output directory.
- -M, --cmargin &lt;cmargin&gt;   Char margin. Chars closer than cmargin are
-                           grouped together to form a word. [default: 2.0]
- -L, --lmargin &lt;lmargin&gt;   Line margin. Lines closer than lmargin are
-                           grouped together to form a textbox. [default: 0.5]
- -W, --wmargin &lt;wmargin&gt;   Word margin. Insert blank spaces between chars
-                           if distance between words is greater than word
-                           margin. [default: 0.1]
- -J, --split_text          Split text lines if they span across multiple cells.
- -K, --flag_size           Flag substring if its size differs from the whole string.
-                           Useful for super and subscripts.
- -X, --print-stats         List stats on the parsing process.
- -Y, --save-stats          Save stats to a file.
- -Z, --plot &lt;dist&gt;         Plot distributions. (page,all,rc)
-
-camelot methods:
- lattice  Looks for lines between data.
- stream   Looks for spaces between data.
-
-See 'camelot &lt;method&gt; -h' for more information on a specific method.
-</pre>
-
 ## Dependencies

-Currently, camelot works under Python 2.7.
-
-The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ghostscript](https://www.ghostscript.com/).
+The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).

 ## Installation

@ -78,22 +37,22 @@ pip install -U pip setuptools

 ### Installing dependencies

-numpy can be install using `pip`. OpenCV and ghostscript can be installed using your system's default package manager.
+tk and ghostscript can be installed using your system's default package manager.

 #### Linux

-* Arch Linux
-
-<pre>
-sudo pacman -S opencv tk ghostscript
-</pre>
-
 * Ubuntu

 <pre>
 sudo apt-get install python-opencv python-tk ghostscript
 </pre>

+* Arch Linux
+
+<pre>
+sudo pacman -S opencv tk ghostscript
+</pre>
+
 #### OS X

 <pre>
@ -103,7 +62,7 @@ brew install homebrew/science/opencv ghostscript
 Finally, `cd` into the project directory and install by

 <pre>
-make install
+python setup.py install
 </pre>

 ## Development
@ -118,12 +77,12 @@ git clone https://github.com/socialcopsdev/camelot.git

 ### Contributing

-See [Contributing doc]().
+See [Contributing guidelines]().

 ### Testing

 <pre>
-make test
+python setup.py test
 </pre>

 ## License
--- a/camelot/core.py
+++ b/camelot/core.py
@ -8,9 +8,48 @@ import pandas as pd


 class Cell(object):
-    """
+    """Defines a cell in a table with coordinates relative to a
+    left-bottom origin. (pdf coordinate space)
+
+    Parameters
+    ----------
+    x1 : float
+        x-coordinate of left-bottom point.
+    y1 : float
+        y-coordinate of left-bottom point.
+    x2 : float
+        x-coordinate of right-top point.
+    y2 : float
+        y-coordinate of right-top point.
+
+    Attributes
+    ----------
+    lb : tuple
+        Tuple representing left-bottom coordinates.
+    lt : tuple
+        Tuple representing left-top coordinates.
+    rb : tuple
+        Tuple representing right-bottom coordinates.
+    rt : tuple
+        Tuple representing right-top coordinates.
+    left : bool
+        Whether or not cell is bounded on the left.
+    right : bool
+        Whether or not cell is bounded on the right.
+    top : bool
+        Whether or not cell is bounded on the top.
+    bottom : bool
+        Whether or not cell is bounded on the bottom.
+    hspan : bool
+        Whether or not cell spans horizontally.
+    vspan : bool
+        Whether or not cell spans vertically.
+    text : string
+        Text assigned to cell.
+    bound

    """
+
    def __init__(self, x1, y1, x2, y2):
        self.x1 = x1
        self.y1 = y1
@ -34,37 +73,48 @@ class Cell(object):

    @property
    def text(self):
-        """
-
-        Returns
-        -------
-
-        """
        return self._text

    @text.setter
    def text(self, t):
-        """
-
-        Parameters
-        ----------
-        t
-        """
        self._text = ''.join([self._text, t])

    @property
    def bound(self):
-        """
-
-        Returns
-        -------
-
+        """The number of sides on which the cell is bounded.
        """
        return self.top + self.bottom + self.left + self.right


 class Table(object):
-    """
+    """Defines a table with coordinates relative to a left-bottom
+    origin. (pdf coordinate space)
+
+    Parameters
+    ----------
+    cols : list
+        List of tuples representing column x-coordinates in increasing
+        order.
+    rows : list
+        List of tuples representing row y-coordinates in decreasing
+        order.
+
+    Attributes
+    ----------
+    df : object
+        pandas.DataFrame
+    shape : tuple
+        Shape of the table.
+    accuracy : float
+        Accuracy with which text was assigned to the cell.
+    whitespace : float
+        Percentage of whitespace in the table.
+    order : int
+        Table number on pdf page.
+    page : int
+        Pdf page number.
+    data
+    parsing_report

    """
    def __init__(self, cols, rows):
@ -84,11 +134,7 @@ class Table(object):

    @property
    def data(self):
-        """
-
-        Returns
-        -------
-
+        """Returns two-dimensional list of strings in table.
        """
        d = []
        for row in self.cells:
@ -97,11 +143,8 @@ class Table(object):

    @property
    def parsing_report(self):
-        """
-
-        Returns
-        -------
-
+        """Returns a parsing report with accuracy, %whitespace,
+        table number on page and page number.
        """
        # pretty?
        report = {
@ -112,27 +155,8 @@ class Table(object):
        }
        return report

-    def set_border(self):
-        """
-
-        Returns
-        -------
-
-        """
-        for r in range(len(self.rows)):
-            self.cells[r][0].left = True
-            self.cells[r][len(self.cols) - 1].right = True
-        for c in range(len(self.cols)):
-            self.cells[0][c].top = True
-            self.cells[len(self.rows) - 1][c].bottom = True
-        return self
-
    def set_all_edges(self):
-        """
-
-        Returns
-        -------
-
+        """Sets all table edges to True.
        """
        for row in self.cells:
            for cell in row:
@ -140,16 +164,16 @@ class Table(object):
        return self

    def set_edges(self, vertical, horizontal, joint_close_tol=2):
-        """
+        """Sets a cell's edges to True depending on whether the cell's
+        coordinates overlap with the line's coordinates within a
+        tolerance.

        Parameters
        ----------
-        vertical
-        horizontal
-        joint_close_tol
-
-        Returns
-        -------
+        vertical : list
+            List of detected vertical lines.
+        horizontal : list
+            List of detected horizontal lines.

        """
        for v in vertical:
@ -256,12 +280,20 @@ class Table(object):

        return self

-    def set_span(self):
+    def set_border(self):
+        """Sets table border edges to True.
        """
+        for r in range(len(self.rows)):
+            self.cells[r][0].left = True
+            self.cells[r][len(self.cols) - 1].right = True
+        for c in range(len(self.cols)):
+            self.cells[0][c].top = True
+            self.cells[len(self.rows) - 1][c].bottom = True
+        return self

-        Returns
-        -------
-
+    def set_span(self):
+        """Sets a cell's hspan or vspan attribute to True depending
+        on whether the cell spans horizontally or vertically.
        """
        for row in self.cells:
            for cell in row:
@ -288,6 +320,8 @@ class Table(object):
        return self

    def to_csv(self, path, **kwargs):
+        """Write Table to a comma-separated values (csv) file.
+        """
        kw = {
            'encoding': 'utf-8',
            'index': False,
@ -297,6 +331,8 @@ class Table(object):
        self.df.to_csv(path, **kw)

    def to_json(self, path, **kwargs):
+        """Write Table to a JSON file.
+        """
        kw = {
            'orient': 'records'
        }
@ -306,6 +342,8 @@ class Table(object):
            f.write(json_string)

    def to_excel(self, path, **kwargs):
+        """Write Table to an Excel file.
+        """
        kw = {
            'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
            'encoding': 'utf-8'
@ -316,13 +354,21 @@ class Table(object):
        writer.save()

    def to_html(self, path, **kwargs):
+        """Write Table to an HTML file.
+        """
        html_string = self.df.to_html(**kwargs)
        with open(path, 'w') as f:
            f.write(html_string)


 class TableList(object):
-    """
+    """Defines a list of camelot.core.Table objects. Each table can
+    be accessed using its index.
+
+    Attributes
+    ----------
+    n : int
+        Number of tables in the list.

    """
    def __init__(self, tables):
@ -371,6 +417,18 @@ class TableList(object):
                z.write(filepath, os.path.basename(filepath))

    def export(self, path, f='csv', compress=False):
+        """Exports the list of tables to specified file format.
+
+        Parameters
+        ----------
+        path : str
+            Filepath
+        f : str
+            File format. Can be csv, json, excel or html.
+        compress : bool
+            Whether or not to add files to a ZIP archive.
+
+        """
        dirname = os.path.dirname(path)
        basename = os.path.basename(path)
        root, ext = os.path.splitext(basename)
@ -402,9 +460,6 @@ class TableList(object):


 class Geometry(object):
-    """
-
-    """
    def __init__(self):
        self.text = []
        self.images = ()
@ -421,9 +476,6 @@ class Geometry(object):


 class GeometryList(object):
-    """
-
-    """
    def __init__(self, geometry):
        self.text = [g.text for g in geometry]
        self.images = [g.images for g in geometry]
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -9,18 +9,43 @@ from .utils import get_page_layout, get_text_objects, get_rotation


 class PDFHandler(object):
-    """
+    """Handles all operations like temp directory creation, splitting
+    file into single page pdfs, parsing each pdf and then removing the
+    temp directory.
+
+    Parameters
+    ----------
+    filename : str
+        Path to pdf file.
+    pages : str
+        Comma-separated page numbers to parse.
+        Example: 1,3,4 or 1,4-end

    """
    def __init__(self, filename, pages='1'):
        self.filename = filename
        if not self.filename.endswith('.pdf'):
            raise TypeError("File format not supported.")
-        self.pages = self.__get_pages(self.filename, pages)
+        self.pages = self._get_pages(self.filename, pages)
        self.tempdir = tempfile.mkdtemp()

-    def __get_pages(self, filename, pages):
-        # refactor
+    def _get_pages(self, filename, pages):
+        """Converts pages string to list of ints.
+
+        Parameters
+        ----------
+        filename : str
+            Path to pdf file.
+        pages : str
+            Comma-separated page numbers to parse.
+            Example: 1,3,4 or 1,4-end
+
+        Returns
+        -------
+        P : list
+            List of int page numbers.
+
+        """
        page_numbers = []
        if pages == '1':
            page_numbers.append({'start': 1, 'end': 1})
@ -42,8 +67,19 @@ class PDFHandler(object):
            P.extend(range(p['start'], p['end'] + 1))
        return sorted(set(P))

-    def __save_page(self, filename, page, temp):
-        # refactor
+    def _save_page(self, filename, page, temp):
+        """Saves specified page from pdf into a temporary directory.
+
+        Parameters
+        ----------
+        filename : str
+            Path to pdf file.
+        page : int
+            Page number
+        temp : str
+            Tmp directory
+
+        """
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
@ -65,28 +101,37 @@ class PDFHandler(object):
                infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
-                if rotation == 'left':
+                if rotation == 'anticlockwise':
                    p.rotateClockwise(90)
-                elif rotation == 'right':
+                elif rotation == 'clockwise':
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, 'wb') as f:
                    outfile.write(f)

    def parse(self, mesh=False, **kwargs):
-        """
+        """Extracts tables by calling parser.get_tables on all single
+        page pdfs.

        Parameters
        ----------
-        mesh
-        kwargs
+        mesh : bool (default: False)
+            Whether or not to use Lattice method of parsing. Stream
+            is used by default.
+        kwargs : dict
+            See camelot.read_pdf kwargs.

        Returns
        -------
+        tables : camelot.core.TableList
+            List of tables found in pdf.
+        geometry : camelot.core.GeometryList
+            List of geometry objects (contours, lines, joints)
+            found in pdf.

        """
        for p in self.pages:
-            self.__save_page(self.filename, p, self.tempdir)
+            self._save_page(self.filename, p, self.tempdir)
        pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
                 for p in self.pages]
        tables = []
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -9,17 +9,31 @@ from .utils import merge_tuples


 def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
-    """
+    """Thresholds an image using OpenCV's adaptiveThreshold.

    Parameters
    ----------
-    imagename
-    process_background
-    blocksize
-    c
+    imagename : string
+        Path to image file.
+    process_background : bool, optional (default: False)
+        Whether or not to process lines that are in background.
+    blocksize : int, optional (default: 15)
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    c : int, optional (default: -2)
+        Constant subtracted from the mean or weighted mean.
+        Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.

    Returns
    -------
+    img : object
+        numpy.ndarray representing the original image.
+    threshold : object
+        numpy.ndarray representing the thresholded image.

    """
    img = cv2.imread(imagename)
@ -35,17 +49,35 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):


 def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
-    """
+    """Finds horizontal and vertical lines by applying morphological
+    transformations on an image.

    Parameters
    ----------
-    threshold
-    direction
-    line_size_scaling
-    iterations
+    threshold : object
+        numpy.ndarray representing the thresholded image.
+    direction : string, optional (default: 'horizontal')
+        Specifies whether to find vertical or horizontal lines.
+    line_size_scaling : int, optional (default: 15)
+        Factor by which the page dimensions will be divided to get
+        smallest length of lines that should be detected.
+
+        The larger this value, smaller the detected lines. Making it
+        too large will lead to text being detected as lines.
+    iterations : int, optional (default: 0)
+        Number of times erosion/dilation is applied.
+
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.

    Returns
    -------
+    dmask : object
+        numpy.ndarray representing pixels where vertical/horizontal
+        lines lie.
+    lines : list
+        List of tuples representing vertical/horizontal lines with
+        coordinates relative to a left-top origin in
+        image coordinate space.

    """
    lines = []
@ -84,15 +116,21 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio


 def find_table_contours(vertical, horizontal):
-    """
+    """Finds table boundaries using OpenCV's findContours.

    Parameters
    ----------
-    vertical
-    horizontal
+    vertical : object
+        numpy.ndarray representing pixels where vertical lines lie.
+    horizontal : object
+        numpy.ndarray representing pixels where horizontal lines lie.

    Returns
    -------
+    cont : list
+        List of tuples representing table boundaries. Each tuple is of
+        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
+        h -> height in image coordinate space.

    """
    mask = vertical + horizontal
@ -114,16 +152,26 @@ def find_table_contours(vertical, horizontal):


 def find_table_joints(contours, vertical, horizontal):
-    """
+    """Finds joints/intersections present inside each table boundary.

    Parameters
    ----------
-    contours
-    vertical
-    horizontal
+    contours : list
+        List of tuples representing table boundaries. Each tuple is of
+        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
+        h -> height in image coordinate space.
+    vertical : object
+        numpy.ndarray representing pixels where vertical lines lie.
+    horizontal : object
+        numpy.ndarray representing pixels where horizontal lines lie.

    Returns
    -------
+    tables : dict
+        Dict with table boundaries as keys and list of intersections
+        in that boundary as their value.
+        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
+        and (x2, y2) -> rt in image coordinate space.

    """
    joints = np.bitwise_and(vertical, horizontal)
@ -150,15 +198,24 @@ def find_table_joints(contours, vertical, horizontal):


 def remove_lines(threshold, line_size_scaling=15):
-    """
+    """Removes lines from a thresholded image.

    Parameters
    ----------
-    threshold
-    line_size_scaling
+    threshold : object
+        numpy.ndarray representing the thresholded image.
+    line_size_scaling : int, optional (default: 15)
+        Factor by which the page dimensions will be divided to get
+        smallest length of lines that should be detected.
+
+        The larger this value, smaller the detected lines. Making it
+        too large will lead to text being detected as lines.

    Returns
    -------
+    threshold : object
+        numpy.ndarray representing the thresholded image
+        with horizontal and vertical lines removed.

    """
    size = threshold.shape[0] // line_size_scaling
@ -178,16 +235,23 @@ def remove_lines(threshold, line_size_scaling=15):


 def find_cuts(threshold, char_size_scaling=200):
-    """
+    """Finds cuts made by text projections on y-axis.

    Parameters
    ----------
-    threshold
-    char_size_scaling
+    threshold : object
+        numpy.ndarray representing the thresholded image.
+    char_size_scaling : int, optional (default: 200)
+        Factor by which the image height will be divided to get
+        the height of the structuring element used to find cuts.
+
+        The larger this value, the smaller the structuring element.

    Returns
    -------
-
+    y_cuts : list
+        List of cuts on y-axis.
    """
    size = threshold.shape[0] // char_size_scaling
    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
--- a/camelot/io.py
+++ b/camelot/io.py
@ -2,20 +2,93 @@ from .handlers import PDFHandler


 def read_pdf(filepath, pages='1', mesh=False, **kwargs):
-    """
+    """Read PDF and return parsed data tables.
+
+    Note: kwargs annotated with ^ can only be used with mesh=False
+    and kwargs annotated with * can only be used with mesh=True.

    Parameters
    ----------
-    filepath
-    pages
-    mesh
-    kwargs
+    filepath : str
+        Path to pdf file.
+    pages : str
+        Comma-separated page numbers to parse.
+        Example: 1,3,4 or 1,4-end
+    mesh : bool (default: False)
+        Whether or not to use Lattice method of parsing. Stream
+        is used by default.
+    table_area : list, optional (default: None)
+        List of table areas to analyze as strings of the form
+        x1,y1,x2,y2 where (x1, y1) -> left-top and
+        (x2, y2) -> right-bottom in pdf coordinate space.
+    columns^ : list, optional (default: None)
+        List of column x-coordinates as strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Whether or not to split a text line if it spans across
+        multiple cells.
+    flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from rest of the string, useful for
+        super and subscripts.
+    row_close_tol^ : int, optional (default: 2)
+        Rows will be formed by combining text vertically
+        within this tolerance.
+    col_close_tol^ : int, optional (default: 0)
+        Columns will be formed by combining text horizontally
+        within this tolerance.
+    process_background* : bool, optional (default: False)
+        Whether or not to process lines that are in background.
+    line_size_scaling* : int, optional (default: 15)
+        Factor by which the page dimensions will be divided to get
+        smallest length of lines that should be detected.
+
+        The larger this value, smaller the detected lines. Making it
+        too large will lead to text being detected as lines.
+    copy_text* : list, optional (default: None)
+        {'h', 'v'}
+        Select one or more strings from above and pass them as a list
+        to specify the direction in which text should be copied over
+        when a cell spans multiple rows or columns.
+    shift_text* : list, optional (default: ['l', 't'])
+        {'l', 'r', 't', 'b'}
+        Select one or more strings from above and pass them as a list
+        to specify where the text in a spanning cell should flow.
+    line_close_tol* : int, optional (default: 2)
+        Tolerance parameter used to merge vertical and horizontal
+        detected lines which lie close to each other.
+    joint_close_tol* : int, optional (default: 2)
+        Tolerance parameter used to decide whether the detected lines
+        and points lie close to each other.
+    threshold_blocksize : int, optional (default: 15)
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    threshold_constant : int, optional (default: -2)
+        Constant subtracted from the mean or weighted mean.
+        Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    iterations : int, optional (default: 0)
+        Number of times erosion/dilation is applied.
+
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+    margins : tuple
+        PDFMiner margins. (char_margin, line_margin, word_margin)
+
+        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+    debug : bool, optional (default: False)
+        Whether or not to return all text objects on the page
+        which can be used to generate a matplotlib plot, to get
+        values for table_area(s) and debugging.

    Returns
    -------
+    tables : camelot.core.TableList

    """
-    # explicit type conversion
+    # validate kwargs?
    p = PDFHandler(filepath, pages)
    tables, __ = p.parse(mesh=mesh, **kwargs)
    return tables
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -5,8 +5,7 @@ from ..utils import get_page_layout, get_text_objects


 class BaseParser(object):
-    """
-
+    """Defines a base parser.
    """
    def _generate_layout(self, filename):
        self.filename = filename
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -11,7 +11,7 @@ from .base import BaseParser
 from ..core import Table
 from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
                     merge_close_lines, get_table_index, compute_accuracy,
-                     count_empty_strings, encode_, setup_logging)
+                     compute_whitespace, setup_logging, encode_)
 from ..image_processing import (adaptive_threshold, find_lines,
                                find_table_contours, find_table_joints)

@ -20,14 +20,74 @@ logger = setup_logging(__name__)


 class Lattice(BaseParser):
-    """
+    """Lattice method of parsing looks for lines between text
+    to form a table.
+
+    Parameters
+    ----------
+    table_area : list, optional (default: None)
+        List of table areas to analyze as strings of the form
+        x1,y1,x2,y2 where (x1, y1) -> left-top and
+        (x2, y2) -> right-bottom in pdf coordinate space.
+    process_background : bool, optional (default: False)
+        Whether or not to process lines that are in background.
+    line_size_scaling : int, optional (default: 15)
+        Factor by which the page dimensions will be divided to get
+        smallest length of lines that should be detected.
+
+        The larger this value, smaller the detected lines. Making it
+        too large will lead to text being detected as lines.
+    copy_text : list, optional (default: None)
+        {'h', 'v'}
+        Select one or more strings from above and pass them as a list
+        to specify the direction in which text should be copied over
+        when a cell spans multiple rows or columns.
+    shift_text : list, optional (default: ['l', 't'])
+        {'l', 'r', 't', 'b'}
+        Select one or more strings from above and pass them as a list
+        to specify where the text in a spanning cell should flow.
+    split_text : bool, optional (default: False)
+        Whether or not to split a text line if it spans across
+        multiple cells.
+    flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from rest of the string, useful for
+        super and subscripts.
+    line_close_tol : int, optional (default: 2)
+        Tolerance parameter used to merge vertical and horizontal
+        detected lines which lie close to each other.
+    joint_close_tol : int, optional (default: 2)
+        Tolerance parameter used to decide whether the detected lines
+        and points lie close to each other.
+    threshold_blocksize : int, optional (default: 15)
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    threshold_constant : int, optional (default: -2)
+        Constant subtracted from the mean or weighted mean.
+        Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    iterations : int, optional (default: 0)
+        Number of times erosion/dilation is applied.
+
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+    margins : tuple
+        PDFMiner margins. (char_margin, line_margin, word_margin)
+
+        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+    debug : bool, optional (default: False)
+        Whether or not to return all text objects on the page
+        which can be used to generate a matplotlib plot, to get
+        values for table_area(s) and debugging.

    """
    def __init__(self, table_area=None, process_background=False,
                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
                 split_text=False, flag_size=False, line_close_tol=2,
                 joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
-                 iterations=0, margins=(1.0, 0.5, 0.1), debug=None):
+                 iterations=0, margins=(1.0, 0.5, 0.1), debug=False):
        self.table_area = table_area
        self.process_background = process_background
        self.line_size_scaling = line_size_scaling
@ -45,6 +105,27 @@ class Lattice(BaseParser):

    @staticmethod
    def _reduce_index(t, idx, shift_text):
+        """Reduces index of a text object if it lies within a spanning
+        cell.
+
+        Parameters
+        ----------
+        t : camelot.core.Table
+        idx : list
+            List of tuples of the form (r_idx, c_idx, text).
+        shift_text : list
+            {'l', 'r', 't', 'b'}
+            Select one or more strings from above and pass them as a
+            list to specify where the text in a spanning cell should
+            flow.
+
+        Returns
+        -------
+        indices : list
+            List of tuples of the form (r_idx, c_idx, text) where
+            r_idx and c_idx are new row and column indices for text.
+
+        """
        indices = []
        for r_idx, c_idx, text in idx:
            for d in shift_text:
@ -69,6 +150,22 @@ class Lattice(BaseParser):

    @staticmethod
    def _copy_spanning_text(t, copy_text=None):
+        """Copies over text in empty spanning cells.
+
+        Parameters
+        ----------
+        t : camelot.core.Table
+        copy_text : list, optional (default: None)
+            {'h', 'v'}
+            Select one or more strings from above and pass them as a list
+            to specify the direction in which text should be copied over
+            when a cell spans multiple rows or columns.
+
+        Returns
+        -------
+        t : camelot.core.Table
+
+        """
        for f in copy_text:
            if f == "h":
                for i in range(len(t.cells)):
@ -199,7 +296,7 @@ class Lattice(BaseParser):
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape

-        whitespace, __, __ = count_empty_strings(data)
+        whitespace = compute_whitespace(data)
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
@ -208,16 +305,6 @@ class Lattice(BaseParser):
        return table

    def extract_tables(self, filename):
-        """
-
-        Parameters
-        ----------
-        filename
-
-        Returns
-        -------
-
-        """
        logger.info('Processing {}'.format(os.path.basename(filename)))
        self._generate_layout(filename)

@ -237,7 +324,7 @@ class Lattice(BaseParser):
            table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
            _tables.append(table)

-        if self.debug is not None:
+        if self.debug:
            text = []
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -8,19 +8,54 @@ import pandas as pd
 from .base import BaseParser
 from ..core import Table
 from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
-                     count_empty_strings, encode_, setup_logging)
+                     compute_whitespace, setup_logging, encode_)


 logger = setup_logging(__name__)


 class Stream(BaseParser):
-    """
+    """Stream method of parsing looks for spaces between text
+    to form a table.
+
+    If you want to specify columns when specifying multiple table
+    areas, make sure that the lengths of both lists are equal.
+
+    Parameters
+    ----------
+    table_area : list, optional (default: None)
+        List of table areas to analyze as strings of the form
+        x1,y1,x2,y2 where (x1, y1) -> left-top and
+        (x2, y2) -> right-bottom in pdf coordinate space.
+    columns : list, optional (default: None)
+        List of column x-coordinates as strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Whether or not to split a text line if it spans across
+        multiple cells.
+    flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from rest of the string, useful for
+        super and subscripts.
+    row_close_tol : int, optional (default: 2)
+        Rows will be formed by combining text vertically
+        within this tolerance.
+    col_close_tol : int, optional (default: 0)
+        Columns will be formed by combining text horizontally
+        within this tolerance.
+    margins : tuple, optional (default: (1.0, 0.5, 0.1))
+        PDFMiner margins. (char_margin, line_margin, word_margin)
+
+        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+    debug : bool, optional (default: False)
+        Whether or not to return all text objects on the page
+        which can be used to generate a matplotlib plot, to get
+        values for table_area(s), columns and debugging.

    """
    def __init__(self, table_area=None, columns=None, split_text=False,
                 flag_size=False, row_close_tol=2, col_close_tol=0,
-                 margins=(1.0, 0.5, 0.1), debug=None):
+                 margins=(1.0, 0.5, 0.1), debug=False):
        self.table_area = table_area
        self.columns = columns
        self._validate_columns()
@ -33,6 +68,20 @@ class Stream(BaseParser):

    @staticmethod
    def _text_bbox(t_bbox):
+        """Returns bounding box for the text present on a page.
+
+        Parameters
+        ----------
+        t_bbox : dict
+            Dict with two keys 'horizontal' and 'vertical' with lists of
+            LTTextLineHorizontals and LTTextLineVerticals respectively.
+
+        Returns
+        -------
+        text_bbox : tuple
+            Tuple (x0, y0, x1, y1) in pdf coordinate space.
+
+        """
        xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
        ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
        xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
@ -42,6 +91,21 @@ class Stream(BaseParser):

    @staticmethod
    def _group_rows(text, row_close_tol=2):
+        """Groups PDFMiner text objects into rows vertically
+        within a tolerance.
+
+        Parameters
+        ----------
+        text : list
+            List of PDFMiner text objects.
+        row_close_tol : int, optional (default: 2)
+
+        Returns
+        -------
+        rows : list
+            Two-dimensional list of text objects grouped into rows.
+
+        """
        row_y = 0
        rows = []
        temp = []
@ -61,6 +125,21 @@ class Stream(BaseParser):

    @staticmethod
    def _merge_columns(l, col_close_tol=0):
+        """Merges column boundaries horizontally if they overlap
+        or lie within a tolerance.
+
+        Parameters
+        ----------
+        l : list
+            List of column x-coordinate tuples.
+        col_close_tol : int, optional (default: 0)
+
+        Returns
+        -------
+        merged : list
+            List of merged column x-coordinate tuples.
+
+        """
        merged = []
        for higher in l:
            if not merged:
@ -89,6 +168,21 @@ class Stream(BaseParser):

    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
+        """Makes row coordinates continuous.
+
+        Parameters
+        ----------
+        rows_grouped : list
+            Two-dimensional list of text objects grouped into rows.
+        text_y_max : int
+        text_y_min : int
+
+        Returns
+        -------
+        rows : list
+            List of continuous row y-coordinate tuples.
+
+        """
        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
                    if len(r) > 0 else 0 for r in rows_grouped]
        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
@ -100,6 +194,23 @@ class Stream(BaseParser):

    @staticmethod
    def _add_columns(cols, text, row_close_tol):
+        """Adds columns to existing list by taking into account
+        the text that lies outside the current column x-coordinates.
+
+        Parameters
+        ----------
+        cols : list
+            List of column x-coordinate tuples.
+        text : list
+            List of PDFMiner text objects.
+        row_close_tol : int
+
+        Returns
+        -------
+        cols : list
+            Updated list of column x-coordinate tuples.
+
+        """
        if text:
            text = Stream._group_rows(text, row_close_tol=row_close_tol)
            elements = [len(r) for r in text]
@ -110,6 +221,21 @@ class Stream(BaseParser):

    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
+        """Makes column coordinates continuous.
+
+        Parameters
+        ----------
+        cols : list
+            List of column x-coordinate tuples.
+        text_x_min : int
+        text_x_max : int
+
+        Returns
+        -------
+        cols : list
+            Updated list of column x-coordinate tuples.
+
+        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
@ -207,7 +333,7 @@ class Stream(BaseParser):
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape

-        whitespace, __, __ = count_empty_strings(data)
+        whitespace = compute_whitespace(data)
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
@ -216,16 +342,6 @@ class Stream(BaseParser):
        return table

    def extract_tables(self, filename):
-        """
-
-        Parameters
-        ----------
-        filename
-
-        Returns
-        -------
-
-        """
        logger.info('Processing {}'.format(os.path.basename(filename)))
        self._generate_layout(filename)

@ -244,7 +360,7 @@ class Stream(BaseParser):
            table = self._generate_table(table_idx, cols, rows)
            _tables.append(table)

-        if self.debug is not None:
+        if self.debug:
            text = []
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -6,19 +6,101 @@ from .handlers import PDFHandler


 def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
-    """
+    """Plot geometry found on pdf page based on type specified,
+    useful for debugging and playing with different parameters to get
+    the best output.
+
+    Note: kwargs annotated with ^ can only be used with mesh=False
+    and kwargs annotated with * can only be used with mesh=True.

    Parameters
    ----------
-    filepath
-    pages
-    mesh
-    geometry_type
-    kwargs
+    filepath : str
+        Path to pdf file.
+    pages : str
+        Comma-separated page numbers to parse.
+        Example: 1,3,4 or 1,4-end
+    mesh : bool (default: False)
+        Whether or not to use Lattice method of parsing. Stream
+        is used by default.
+    geometry_type : str, optional (default: 'text')
+        'text' : Plot text objects found on page, useful to get
+                 table_area and columns coordinates.
+        'table' : Plot parsed table.
+        'contour'* : Plot detected rectangles.
+        'joint'* : Plot detected line intersections.
+        'line'* : Plot detected lines.
+    table_area : list, optional (default: None)
+        List of table areas to analyze as strings of the form
+        x1,y1,x2,y2 where (x1, y1) -> left-top and
+        (x2, y2) -> right-bottom in pdf coordinate space.
+    columns^ : list, optional (default: None)
+        List of column x-coordinates as strings where the coordinates
+        are comma-separated.
+    split_text : bool, optional (default: False)
+        Whether or not to split a text line if it spans across
+        multiple cells.
+    flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from rest of the string, useful for
+        super and subscripts.
+    row_close_tol^ : int, optional (default: 2)
+        Rows will be formed by combining text vertically
+        within this tolerance.
+    col_close_tol^ : int, optional (default: 0)
+        Columns will be formed by combining text horizontally
+        within this tolerance.
+    process_background* : bool, optional (default: False)
+        Whether or not to process lines that are in background.
+    line_size_scaling* : int, optional (default: 15)
+        Factor by which the page dimensions will be divided to get
+        smallest length of lines that should be detected.
+
+        The larger this value, smaller the detected lines. Making it
+        too large will lead to text being detected as lines.
+    copy_text* : list, optional (default: None)
+        {'h', 'v'}
+        Select one or more strings from above and pass them as a list
+        to specify the direction in which text should be copied over
+        when a cell spans multiple rows or columns.
+    shift_text* : list, optional (default: ['l', 't'])
+        {'l', 'r', 't', 'b'}
+        Select one or more strings from above and pass them as a list
+        to specify where the text in a spanning cell should flow.
+    line_close_tol* : int, optional (default: 2)
+        Tolerance parameter used to merge vertical and horizontal
+        detected lines which lie close to each other.
+    joint_close_tol* : int, optional (default: 2)
+        Tolerance parameter used to decide whether the detected lines
+        and points lie close to each other.
+    threshold_blocksize : int, optional (default: 15)
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    threshold_constant : int, optional (default: -2)
+        Constant subtracted from the mean or weighted mean.
+        Normally, it is positive but may be zero or negative as well.
+
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+    iterations : int, optional (default: 0)
+        Number of times erosion/dilation is applied.
+
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+    margins : tuple
+        PDFMiner margins. (char_margin, line_margin, word_margin)
+
+        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
+    debug : bool, optional (default: False)
+        Whether or not to return all text objects on the page
+        which can be used to generate a matplotlib plot, to get
+        values for table_area(s) and debugging.
+
    """
-    # explicit type conversion
+    # validate kwargs?
    p = PDFHandler(filepath, pages)
-    kwargs.update({'debug': geometry_type})
+    debug = True if geometry_type else False
+    kwargs.update({'debug': debug})
    __, geometry = p.parse(mesh=mesh, **kwargs)

    if geometry_type == 'text':
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -19,14 +19,15 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,


 def setup_logging(name):
-    """
+    """Sets up a logger with StreamHandler.

    Parameters
    ----------
-    name
+    name : str

    Returns
    -------
+    logger : logging.Logger

    """
    logger = logging.getLogger(name)
@ -47,15 +48,16 @@ logger = setup_logging(__name__)


 def translate(x1, x2):
-    """
+    """Translates x2 by x1.

    Parameters
    ----------
-    x1
-    x2
+    x1 : float
+    x2 : float

    Returns
    -------
+    x2 : float

    """
    x2 += x1
@ -63,15 +65,16 @@ def translate(x1, x2):


 def scale(x, s):
-    """
+    """Scales x by scaling factor s.

    Parameters
    ----------
-    x
-    s
+    x : float
+    s : float

    Returns
    -------
+    x : float

    """
    x *= s
@ -79,18 +82,21 @@ def scale(x, s):


 def rotate(x1, y1, x2, y2, angle):
-    """
+    """Rotates point x2, y2 about point x1, y1 by angle.

    Parameters
    ----------
-    x1
-    y1
-    x2
-    y2
-    angle
+    x1 : float
+    y1 : float
+    x2 : float
+    y2 : float
+    angle : float
+        Angle in radians.

    Returns
    -------
+    xnew : float
+    ynew : float

    """
    s = np.sin(angle)
@ -105,15 +111,26 @@ def rotate(x1, y1, x2, y2, angle):


 def scale_pdf(k, factors):
-    """
+    """Translates and scales pdf coordinate space to image
+    coordinate space.

    Parameters
    ----------
-    k
-    factors
+    k : tuple
+        Tuple (x1, y1, x2, y2) representing table bounding box where
+        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
+        space.
+    factors : tuple
+        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
+        first two elements are scaling factors and pdf_y is height of
+        pdf.

    Returns
    -------
+    knew : tuple
+        Tuple (x1, y1, x2, y2) representing table bounding box where
+        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
+        space.

    """
    x1, y1, x2, y2 = k
@ -127,17 +144,28 @@ def scale_pdf(k, factors):


 def scale_image(tables, v_segments, h_segments, factors):
-    """
+    """Translates and scales image coordinate space to pdf
+    coordinate space.

    Parameters
    ----------
-    tables
-    v_segments
-    h_segments
-    factors
+    tables : dict
+        Dict with table boundaries as keys and list of intersections
+        in that boundary as value.
+    v_segments : list
+        List of vertical line segments.
+    h_segments : list
+        List of horizontal line segments.
+    factors : tuple
+        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
+        first two elements are scaling factors and img_y is height of
+        image.

    Returns
    -------
+    tables_new : dict
+    v_segments_new : dict
+    h_segments_new : dict

    """
    scaling_factor_x, scaling_factor_y, img_y = factors
@ -172,16 +200,23 @@ def scale_image(tables, v_segments, h_segments, factors):


 def get_rotation(lttextlh, lttextlv, ltchar):
-    """
+    """Detects if text in table is rotated or not using the current
+    transformation matrix (CTM) and returns its orientation.

    Parameters
    ----------
-    lttextlh
-    lttextlv
-    ltchar
+    lttextlh : list
+        List of PDFMiner LTTextLineHorizontal objects.
+    lttextlv : list
+        List of PDFMiner LTTextLineVertical objects.
+    ltchar : list
+        List of PDFMiner LTChar objects.

    Returns
    -------
+    rotation : string
+        '' if text in table is upright, otherwise 'clockwise' or
+        'anticlockwise' for text rotated by 90 degrees.

    """
    rotation = ''
@ -190,21 +225,30 @@ def get_rotation(lttextlh, lttextlv, ltchar):
    if hlen < vlen:
        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
-        rotation = 'left' if clockwise < anticlockwise else 'right'
+        rotation = 'clockwise' if clockwise < anticlockwise else 'anticlockwise'
    return rotation


 def segments_in_bbox(bbox, v_segments, h_segments):
-    """
+    """Returns all line segments present inside a bounding box.

    Parameters
    ----------
-    bbox
-    v_segments
-    h_segments
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+        space.
+    v_segments : list
+        List of vertical line segments.
+    h_segments : list
+        List of horizontal line segments.

    Returns
    -------
+    v_s : list
+        List of vertical line segments that lie inside table.
+    h_s : list
+        List of horizontal line segments that lie inside table.

    """
    lb = (bbox[0], bbox[1])
@ -217,15 +261,20 @@ def segments_in_bbox(bbox, v_segments, h_segments):


 def text_in_bbox(bbox, text):
-    """
+    """Returns all text objects present inside a bounding box.

    Parameters
    ----------
-    bbox
-    text
+    bbox : tuple
+        Tuple (x1, y1, x2, y2) representing a bounding box where
+        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+        space.
+    text : List of PDFMiner text objects.

    Returns
    -------
+    t_bbox : list
+        List of PDFMiner text objects that lie inside table.

    """
    lb = (bbox[0], bbox[1])
@ -237,15 +286,17 @@ def text_in_bbox(bbox, text):


 def remove_close_lines(ar, line_close_tol=2):
-    """
+    """Removes lines which are within a tolerance, based on their x or
+    y axis projections.

    Parameters
    ----------
-    ar
-    line_close_tol
+    ar : list
+    line_close_tol : int, optional (default: 2)

    Returns
    -------
+    ret : list

    """
    ret = []
@ -262,15 +313,17 @@ def remove_close_lines(ar, line_close_tol=2):


 def merge_close_lines(ar, line_close_tol=2):
-    """
+    """Merges lines which are within a tolerance by calculating a
+    moving mean, based on their x or y axis projections.

    Parameters
    ----------
-    ar
-    line_close_tol
+    ar : list
+    line_close_tol : int, optional (default: 2)

    Returns
    -------
+    ret : list

    """
    ret = []
@ -288,15 +341,19 @@ def merge_close_lines(ar, line_close_tol=2):


 def flag_font_size(textline, direction):
-    """
+    """Flags super/subscripts in text by enclosing them with <s></s>.
+    May give false positives.

    Parameters
    ----------
-    textline
-    direction
+    textline : list
+        List of PDFMiner LTChar objects.
+    direction : string
+        Direction of the PDFMiner LTTextLine object.

    Returns
    -------
+    fstring : string

    """
    if direction == 'horizontal':
@ -324,18 +381,27 @@ def flag_font_size(textline, direction):
    return fstring


-def split_textline(table, textline, direction, flag_size=True):
-    """
+def split_textline(table, textline, direction, flag_size=False):
+    """Splits PDFMiner LTTextLine into substrings if it spans across
+    multiple rows/columns.

    Parameters
    ----------
-    table
-    textline
-    direction
-    flag_size
+    table : camelot.core.Table
+    textline : object
+        PDFMiner LTTextLine object.
+    direction : string
+        Direction of the PDFMiner LTTextLine object.
+    flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from rest of the string, useful for
+        super and subscripts.

    Returns
    -------
+    grouped_chars : list
+        List of tuples of the form (idx, text) where idx is the index
+        of row/column and text is an lttextline substring.

    """
    idx = 0
@ -388,19 +454,38 @@ def split_textline(table, textline, direction, flag_size=True):
    return grouped_chars


-def get_table_index(table, t, direction, split_text=False, flag_size=True):
-    """
+def get_table_index(table, t, direction, split_text=False, flag_size=False):
+    """Gets indices of the table cell where given text object lies by
+    comparing their y and x-coordinates.

    Parameters
    ----------
-    table
-    t
-    direction
-    split_text
-    flag_size
+    table : camelot.core.Table
+    t : object
+        PDFMiner LTTextLine object.
+    direction : string
+        Direction of the PDFMiner LTTextLine object.
+    split_text : bool, optional (default: False)
+        Whether or not to split a text line if it spans across
+        multiple cells.
+    flag_size : bool, optional (default: False)
+        Whether or not to highlight a substring using <s></s>
+        if its size is different from rest of the string, useful for
+        super and subscripts.

    Returns
    -------
+    indices : list
+        List of tuples of the form (r_idx, c_idx, text) where r_idx
+        and c_idx are row and column indices.
+    error : float
+        Assignment error, percentage of text area that lies outside
+        a cell.
+        +-------+
+        |       |
+        |   [Text bounding box]
+        |       |
+        +-------+

    """
    r_idx, c_idx = [-1] * 2
@ -450,14 +535,19 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):


 def compute_accuracy(error_weights):
-    """
+    """Calculates a score based on weights assigned to various
+    parameters and their error percentages.

    Parameters
    ----------
-    error_weights
+    error_weights : list
+        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
+        where pn is the weight assigned to list of errors en.
+        Sum of pn should be equal to 100.

    Returns
    -------
+    score : float

    """
    SCORE_VAL = 100
@ -474,50 +564,40 @@ def compute_accuracy(error_weights):
    return score


-def count_empty_strings(d):
-    """
+def compute_whitespace(d):
+    """Calculates the percentage of empty strings in a
+    two-dimensional list.

    Parameters
    ----------
-    d
+    d : list

    Returns
    -------
+    whitespace : float
+        Percentage of empty cells.

    """
-    empty_p = 0
+    whitespace = 0
    r_nempty_cells, c_nempty_cells = [], []
    for i in d:
        for j in i:
            if j.strip() == '':
-                empty_p += 1
-    empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
-    for row in d:
-        r_nempty_c = 0
-        for r in row:
-            if r.strip() != '':
-                r_nempty_c += 1
-        r_nempty_cells.append(r_nempty_c)
-    d = zip(*d)
-    d = [list(col) for col in d]
-    for col in d:
-        c_nempty_c = 0
-        for c in col:
-            if c.strip() != '':
-                c_nempty_c += 1
-        c_nempty_cells.append(c_nempty_c)
-    return empty_p, r_nempty_cells, c_nempty_cells
+                whitespace += 1
+    whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
+    return whitespace


-def remove_empty_strings(d):
-    """
+def remove_empty(d):
+    """Removes empty rows and columns from a two-dimensional list.

    Parameters
    ----------
-    d
+    d : list

    Returns
    -------
+    d : list

    """
    for i, row in enumerate(d):
@ -530,70 +610,46 @@ def remove_empty_strings(d):


 def encode_(ar):
-    """
+    """Encodes two-dimensional list into unicode.

    Parameters
    ----------
-    ar
+    ar : list

    Returns
    -------
+    ar : list

    """
    ar = [[r.encode('utf-8') for r in row] for row in ar]
    return ar


-def get_text_objects(layout, ltype="char", t=None):
-    """
-
-    Parameters
-    ----------
-    layout
-    ltype
-    t
-
-    Returns
-    -------
-
-    """
-    if ltype == "char":
-        LTObject = LTChar
-    elif ltype == "lh":
-        LTObject = LTTextLineHorizontal
-    elif ltype == "lv":
-        LTObject = LTTextLineVertical
-    if t is None:
-        t = []
-    try:
-        for obj in layout._objs:
-            if isinstance(obj, LTObject):
-                t.append(obj)
-            else:
-                t += get_text_objects(obj, ltype=ltype)
-    except AttributeError:
-        pass
-    return t
-
-
-def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
+def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
               detect_vertical=True, all_texts=True):
-    """
+    """Returns a PDFMiner LTPage object and page dimension of a single
+    page pdf. See https://euske.github.io/pdfminer/ to get definitions
+    of kwargs.

    Parameters
    ----------
-    pname
-    char_margin
-    line_margin
-    word_margin
-    detect_vertical
-    all_texts
+    filename : string
+        Path to pdf file.
+    char_margin : float
+    line_margin : float
+    word_margin : float
+    detect_vertical : bool
+    all_texts : bool

    Returns
    -------
+    layout : object
+        PDFMiner LTPage object.
+    dim : tuple
+        Dimension of pdf page in the form (width, height).

    """
-    with open(pname, 'r') as f:
+    with open(filename, 'r') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
@ -615,12 +671,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
        return layout, dim


-def merge_tuples(tuples):
-    """
+def get_text_objects(layout, ltype="char", t=None):
+    """Recursively parses pdf layout to get a list of
+    PDFMiner text objects.

    Parameters
    ----------
-    tuples
+    layout : object
+        PDFMiner LTPage object.
+    ltype : string
+        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
+        and LTTextLineVertical objects respectively.
+    t : list
+
+    Returns
+    -------
+    t : list
+        List of PDFMiner text objects.
+
+    """
+    if ltype == "char":
+        LTObject = LTChar
+    elif ltype == "lh":
+        LTObject = LTTextLineHorizontal
+    elif ltype == "lv":
+        LTObject = LTTextLineVertical
+    if t is None:
+        t = []
+    try:
+        for obj in layout._objs:
+            if isinstance(obj, LTObject):
+                t.append(obj)
+            else:
+                t += get_text_objects(obj, ltype=ltype)
+    except AttributeError:
+        pass
+    return t
+
+
+def merge_tuples(tuples):
+    """Merges a list of overlapping tuples.
+
+    Parameters
+    ----------
+    tuples : list
+        List of tuples where a tuple is a single axis coordinate pair.
+
+    Yields
+    ------
+    tuple
+
    """
    merged = list(tuples[0])
    for s, e in tuples:
--- a/docs/api.rst
+++ b/docs/api.rst
@ -4,17 +4,37 @@
 API Reference
 =============

-Pdf
-===
-.. automodule:: camelot.pdf
+camelot.read_pdf
+================
+.. autofunction:: camelot.read_pdf
   :members:

-Lattice
-=======
-.. automodule:: camelot.lattice
+camelot.handlers.PDFHandler
+===========================
+.. autoclass:: camelot.handlers.PDFHandler
   :members:

-Stream
-======
-.. automodule:: camelot.stream
+camelot.parsers.Stream
+======================
+.. autoclass:: camelot.parsers.Stream
+   :members:
+
+camelot.parsers.Lattice
+=======================
+.. autoclass:: camelot.parsers.Lattice
+   :members:
+
+camelot.core.Cell
+=================
+.. autoclass:: camelot.core.Cell
+   :members:
+
+camelot.core.Table
+==================
+.. autoclass:: camelot.core.Table
+   :members:
+
+camelot.core.TableList
+======================
+.. autoclass:: camelot.core.TableList
   :members:
--- a/docs/index.rst
+++ b/docs/index.rst
@ -3,11 +3,11 @@
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

-==================================
-Camelot: pdf parsing made simpler!
-==================================
+=====================================
+Camelot: PDF Table Parsing for Humans
+=====================================

-Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files.
+Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.

 Why another pdf table parsing library?
 ======================================
@ -32,12 +32,22 @@ Usage

 ::

-    >>> from camelot.pdf import Pdf
-    >>> from camelot.lattice import Lattice
-
-    >>> manager = Pdf(Lattice(), 'us-030.pdf')
-    >>> tables = manager.extract()
-    >>> print tables['page-1']['table-1']['data']
+    >>> import camelot
+    >>> tables = camelot.read_pdf("foo.pdf")
+    >>> tables
+    <TableList n=2>
+    >>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
+    >>> tables[0]
+    <Table shape=(3,4)>
+    >>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
+    >>> tables[0].parsing_report
+    {
+        "accuracy": 96,
+        "whitespace": 80,
+        "order": 1,
+        "page": 1
+    }
+    >>> df = tables[0].df

 .. csv-table::
   :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
@ -49,45 +59,6 @@ Usage
   "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"
   "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"

-Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
-
-::
-
-    Camelot: PDF parsing made simpler!
-
-    usage:
-     camelot [options] <method> [<args>...]
-
-    options:
-     -h, --help                Show this screen.
-     -v, --version             Show version.
-     -V, --verbose             Verbose.
-     -p, --pages <pageno>      Comma-separated list of page numbers.
-                               Example: -p 1,3-6,10  [default: 1]
-     -P, --parallel            Parallelize the parsing process.
-     -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
-     -l, --log                 Log to file.
-     -o, --output <directory>  Output directory.
-     -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
-                               grouped together to form a word. [default: 1.0]
-     -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
-                               grouped together to form a textbox. [default: 0.5]
-     -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
-                               if distance between words is greater than word
-                               margin. [default: 0.1]
-     -J, --split_text          Split text lines if they span across multiple cells.
-     -K, --flag_size           Flag substring if its size differs from the whole string.
-                               Useful for super and subscripts.
-     -X, --print-stats         List stats on the parsing process.
-     -Y, --save-stats          Save stats to a file.
-     -Z, --plot <dist>         Plot distributions. (page,all,rc)
-
-    camelot methods:
-     lattice  Looks for lines between data.
-     stream   Looks for spaces between data.
-
-    See 'camelot <method> -h' for more information on a specific method.
-
 Installation
 ============

@ -95,42 +66,41 @@ Make sure you have the most updated versions for `pip` and `setuptools`. You can

    pip install -U pip setuptools

-The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_.
+The dependencies include `tk`_ and `ghostscript`_.

-.. _numpy: http://www.numpy.org/
-.. _OpenCV: http://opencv.org/
-.. _ImageMagick: http://www.imagemagick.org/script/index.php
+.. _tk: https://wiki.tcl.tk/3743
+.. _ghostscript: https://www.ghostscript.com/

 Installing dependencies
 -----------------------

-numpy can be install using `pip`. OpenCV and imagemagick can be installed using your system's default package manager.
+tk and ghostscript can be installed using your system's default package manager.

 Linux
 ^^^^^

-* Arch Linux
-
-::
-
-    sudo pacman -S opencv imagemagick
-
 * Ubuntu

 ::

-    sudo apt-get install libopencv-dev python-opencv imagemagick
+    sudo apt-get install python-opencv python-tk ghostscript
+
+* Arch Linux
+
+::
+
+    sudo pacman -S opencv tk ghostscript

 OS X
 ^^^^

 ::

-    brew install homebrew/science/opencv imagemagick
+    brew install homebrew/science/opencv ghostscript

 Finally, `cd` into the project directory and install by::

-    make install
+    python setup.py install

 API Reference
 =============
@ -150,14 +120,14 @@ You can check the latest sources with the command::
 Contributing
 ------------

-See :doc:`Contributing doc <contributing>`.
+See :doc:`Contributing guidelines <contributing>`.

 Testing
 -------

 ::

-    make test
+    python setup.py test

 License
 =======
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -0,0 +1,11 @@
+click==6.7
+matplotlib==2.2.3
+numpy==1.13.3
+opencv-python==3.4.2.17
+pandas==0.23.4
+pdfminer==20140328
+Pillow==5.2.0
+PyPDF2==1.26.0
+pytest==3.8.0
+pytest-runner==4.2
+Sphinx==1.8.0b1
--- a/requirements.txt
+++ b/requirements.txt
@ -1,8 +1,8 @@
-docopt==0.6.2
+click==6.7
 matplotlib==2.2.3
-nose==1.3.7
+numpy==1.13.3
+opencv-python==3.4.2.17
+pandas==0.23.4
 pdfminer==20140328
-pyexcel-xlsx==0.5.6
 Pillow==5.2.0
 PyPDF2==1.26.0
-Sphinx==1.8.0b1
--- a/setup.py
+++ b/setup.py
@ -4,12 +4,12 @@ import camelot

 NAME = 'camelot'
 VERSION = camelot.__version__
-DESCRIPTION = 'camelot parses tables from PDFs!'
+DESCRIPTION = 'PDF Table Parsing for Humans'
 with open('README.md') as f:
    LONG_DESCRIPTION = f.read()
 URL = 'https://github.com/socialcopsdev/camelot'
 AUTHOR = 'Vinayak Mehta'
-AUTHOR_EMAIL = 'vinayak@socialcops.com'
+AUTHOR_EMAIL = 'vmehta94@gmail.com'
 LICENSE = 'BSD License'

 opencv_min_version = '2.4.8'
@ -58,18 +58,14 @@ def setup_package():

    opencv_status = get_opencv_status()
    opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version)
-    instructions = ("Installation instructions are available in the README at "
-                    "https://github.com/socialcopsdev/camelot")

    if opencv_status['up_to_date'] is False:
        if opencv_status['version']:
-            raise ImportError("Your installation of OpenCV "
-                              "{0} is out-of-date.\n{1}{2}"
-                              .format(opencv_status['version'],
-                                      opencv_req_str, instructions))
+            raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}"
+                              .format(opencv_status['version'], opencv_req_str))
        else:
-            raise ImportError("OpenCV is not installed.\n{0}{1}"
-                              .format(opencv_req_str, instructions))
+            raise ImportError("OpenCV is not installed.\n{}"
+                              .format(opencv_req_str))

    setup(**metadata)