Add docstrings and update docs

2018-09-09 10:00:22 +05:30 · 2018-09-09 10:00:22 +05:30 · 9878de4dfc
parent 16c6b8d45d
commit 9878de4dfc
16 changed files with 997 additions and 421 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,5 @@ dist/
 .coverage
 .pytest_cache/
 _build/
 _static/
--- a/README.md
+++ b/README.md
@ -23,50 +23,9 @@ Camelot is a Python 2.7 library and command-line tool for extracting tabular dat
 >>> df = tables[0].df
 </pre>
 Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
 <pre>
 Camelot: PDF parsing made simpler!
 usage:
 camelot [options] &lt;method&gt; [&lt;args&gt;...]
 options:
 -h, --help                Show this screen.
 -v, --version             Show version.
 -V, --verbose             Verbose.
 -p, --pages &lt;pageno&gt;      Comma-separated list of page numbers.
                           Example: -p 1,3-6,10  [default: 1]
 -P, --parallel            Parallelize the parsing process.
 -f, --format &lt;format&gt;     Output format. (csv,tsv,html,json,xlsx) [default: csv]
 -l, --log                 Log to file.
 -o, --output &lt;directory&gt;  Output directory.
 -M, --cmargin &lt;cmargin&gt;   Char margin. Chars closer than cmargin are
                           grouped together to form a word. [default: 2.0]
 -L, --lmargin &lt;lmargin&gt;   Line margin. Lines closer than lmargin are
                           grouped together to form a textbox. [default: 0.5]
 -W, --wmargin &lt;wmargin&gt;   Word margin. Insert blank spaces between chars
                           if distance between words is greater than word
                           margin. [default: 0.1]
 -J, --split_text          Split text lines if they span across multiple cells.
 -K, --flag_size           Flag substring if its size differs from the whole string.
                           Useful for super and subscripts.
 -X, --print-stats         List stats on the parsing process.
 -Y, --save-stats          Save stats to a file.
 -Z, --plot &lt;dist&gt;         Plot distributions. (page,all,rc)
 camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.
 See 'camelot &lt;method&gt; -h' for more information on a specific method.
 </pre>
 ## Dependencies
-Currently, camelot works under Python 2.7.
+The dependencies include [tk](https://wiki.tcl.tk/3743) and [ghostscript](https://www.ghostscript.com/).
 The required dependencies include [numpy](http://www.numpy.org/), [OpenCV](http://opencv.org/) and [ghostscript](https://www.ghostscript.com/).
 ## Installation
@ -78,22 +37,22 @@ pip install -U pip setuptools
 ### Installing dependencies
-numpy can be install using `pip`. OpenCV and ghostscript can be installed using your system's default package manager.
+tk and ghostscript can be installed using your system's default package manager.
 #### Linux
 * Arch Linux
 <pre>
 sudo pacman -S opencv tk ghostscript
 </pre>
 * Ubuntu
 <pre>
 sudo apt-get install python-opencv python-tk ghostscript
 </pre>
 * Arch Linux
 <pre>
 sudo pacman -S opencv tk ghostscript
 </pre>
 #### OS X
 <pre>
@ -103,7 +62,7 @@ brew install homebrew/science/opencv ghostscript
 Finally, `cd` into the project directory and install by
 <pre>
-make install
+python setup.py install
 </pre>
 ## Development
@ -118,14 +77,14 @@ git clone https://github.com/socialcopsdev/camelot.git
 ### Contributing
-See [Contributing doc]().
+See [Contributing guidelines]().
 ### Testing
 <pre>
-make test
+python setup.py test
 </pre>
 ## License
-BSD License
+BSD License
--- a/camelot/core.py
+++ b/camelot/core.py
@ -8,9 +8,48 @@ import pandas as pd
 class Cell(object):
-    """
+    """Defines a cell in a table with coordinates relative to a
    left-bottom origin. (pdf coordinate space)
    Parameters
    ----------
    x1 : float
        x-coordinate of left-bottom point.
    y1 : float
        y-coordinate of left-bottom point.
    x2 : float
        x-coordinate of right-top point.
    y2 : float
        y-coordinate of right-top point.
    Attributes
    ----------
    lb : tuple
        Tuple representing left-bottom coordinates.
    lt : tuple
        Tuple representing left-top coordinates.
    rb : tuple
        Tuple representing right-bottom coordinates.
    rt : tuple
        Tuple representing right-top coordinates.
    left : bool
        Whether or not cell is bounded on the left.
    right : bool
        Whether or not cell is bounded on the right.
    top : bool
        Whether or not cell is bounded on the top.
    bottom : bool
        Whether or not cell is bounded on the bottom.
    hspan : bool
        Whether or not cell spans horizontally.
    vspan : bool
        Whether or not cell spans vertically.
    text : string
        Text assigned to cell.
    bound
    """
    def __init__(self, x1, y1, x2, y2):
        self.x1 = x1
        self.y1 = y1
@ -34,37 +73,48 @@ class Cell(object):
    @property
    def text(self):
        """
        Returns
        -------
        """
        return self._text
    @text.setter
    def text(self, t):
        """
        Parameters
        ----------
        t
        """
        self._text = ''.join([self._text, t])
    @property
    def bound(self):
-        """
+        """The number of sides on which the cell is bounded.
        Returns
        -------
        """
        return self.top + self.bottom + self.left + self.right
 class Table(object):
-    """
+    """Defines a table with coordinates relative to a left-bottom
    origin. (pdf coordinate space)
    Parameters
    ----------
    cols : list
        List of tuples representing column x-coordinates in increasing
        order.
    rows : list
        List of tuples representing row y-coordinates in decreasing
        order.
    Attributes
    ----------
    df : object
        pandas.DataFrame
    shape : tuple
        Shape of the table.
    accuracy : float
        Accuracy with which text was assigned to the cell.
    whitespace : float
        Percentage of whitespace in the table.
    order : int
        Table number on pdf page.
    page : int
        Pdf page number.
    data
    parsing_report
    """
    def __init__(self, cols, rows):
@ -84,11 +134,7 @@ class Table(object):
    @property
    def data(self):
-        """
+        """Returns two-dimensional list of strings in table.
        Returns
        -------
        """
        d = []
        for row in self.cells:
@ -97,11 +143,8 @@ class Table(object):
    @property
    def parsing_report(self):
-        """
+        """Returns a parsing report with accuracy, %whitespace,
-
+        table number on page and page number.
        Returns
        -------
        """
        # pretty?
        report = {
@ -112,27 +155,8 @@ class Table(object):
        }
        return report
    def set_border(self):
        """
        Returns
        -------
        """
        for r in range(len(self.rows)):
            self.cells[r][0].left = True
            self.cells[r][len(self.cols) - 1].right = True
        for c in range(len(self.cols)):
            self.cells[0][c].top = True
            self.cells[len(self.rows) - 1][c].bottom = True
        return self
    def set_all_edges(self):
-        """
+        """Sets all table edges to True.
        Returns
        -------
        """
        for row in self.cells:
            for cell in row:
@ -140,16 +164,16 @@ class Table(object):
        return self
    def set_edges(self, vertical, horizontal, joint_close_tol=2):
-        """
+        """Sets a cell's edges to True depending on whether the cell's
        coordinates overlap with the line's coordinates within a
        tolerance.
        Parameters
        ----------
-        vertical
+        vertical : list
-        horizontal
+            List of detected vertical lines.
-        joint_close_tol
+        horizontal : list
-
+            List of detected horizontal lines.
        Returns
        -------
        """
        for v in vertical:
@ -256,12 +280,20 @@ class Table(object):
        return self
-    def set_span(self):
+    def set_border(self):
        """Sets table border edges to True.
        """
        for r in range(len(self.rows)):
            self.cells[r][0].left = True
            self.cells[r][len(self.cols) - 1].right = True
        for c in range(len(self.cols)):
            self.cells[0][c].top = True
            self.cells[len(self.rows) - 1][c].bottom = True
        return self
-        Returns
+    def set_span(self):
-        -------
+        """Sets a cell's hspan or vspan attribute to True depending
-
+        on whether the cell spans horizontally or vertically.
        """
        for row in self.cells:
            for cell in row:
@ -288,6 +320,8 @@ class Table(object):
        return self
    def to_csv(self, path, **kwargs):
        """Write Table to a comma-separated values (csv) file.
        """
        kw = {
            'encoding': 'utf-8',
            'index': False,
@ -297,6 +331,8 @@ class Table(object):
        self.df.to_csv(path, **kw)
    def to_json(self, path, **kwargs):
        """Write Table to a JSON file.
        """
        kw = {
            'orient': 'records'
        }
@ -306,6 +342,8 @@ class Table(object):
            f.write(json_string)
    def to_excel(self, path, **kwargs):
        """Write Table to an Excel file.
        """
        kw = {
            'sheet_name': 'page-{}-table-{}'.format(self.page, self.order),
            'encoding': 'utf-8'
@ -316,13 +354,21 @@ class Table(object):
        writer.save()
    def to_html(self, path, **kwargs):
        """Write Table to an HTML file.
        """
        html_string = self.df.to_html(**kwargs)
        with open(path, 'w') as f:
            f.write(html_string)
 class TableList(object):
-    """
+    """Defines a list of camelot.core.Table objects. Each table can
    be accessed using its index.
    Attributes
    ----------
    n : int
        Number of tables in the list.
    """
    def __init__(self, tables):
@ -371,6 +417,18 @@ class TableList(object):
                z.write(filepath, os.path.basename(filepath))
    def export(self, path, f='csv', compress=False):
        """Exports the list of tables to specified file format.
        Parameters
        ----------
        path : str
            Filepath
        f : str
            File format. Can be csv, json, excel and html.
        compress : bool
            Whether or not to add files to a ZIP archive.
        """
        dirname = os.path.dirname(path)
        basename = os.path.basename(path)
        root, ext = os.path.splitext(basename)
@ -402,9 +460,6 @@ class TableList(object):
 class Geometry(object):
    """
    """
    def __init__(self):
        self.text = []
        self.images = ()
@ -421,9 +476,6 @@ class Geometry(object):
 class GeometryList(object):
    """
    """
    def __init__(self, geometry):
        self.text = [g.text for g in geometry]
        self.images = [g.images for g in geometry]
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -9,18 +9,43 @@ from .utils import get_page_layout, get_text_objects, get_rotation
 class PDFHandler(object):
-    """
+    """Handles all operations like temp directory creation, splitting
    file into single page pdfs, parsing each pdf and then removing the
    temp directory.
    Parameters
    ----------
    filename : str
        Path to pdf file.
    pages : str
        Comma-separated page numbers to parse.
        Example: 1,3,4 or 1,4-end
    """
    def __init__(self, filename, pages='1'):
        self.filename = filename
        if not self.filename.endswith('.pdf'):
            raise TypeError("File format not supported.")
-        self.pages = self.__get_pages(self.filename, pages)
+        self.pages = self._get_pages(self.filename, pages)
        self.tempdir = tempfile.mkdtemp()
-    def __get_pages(self, filename, pages):
+    def _get_pages(self, filename, pages):
-        # refactor
+        """Converts pages string to list of ints.
        Parameters
        ----------
        filename : str
            Path to pdf file.
        pages : str
            Comma-separated page numbers to parse.
            Example: 1,3,4 or 1,4-end
        Returns
        -------
        P : list
            List of int page numbers.
        """
        page_numbers = []
        if pages == '1':
            page_numbers.append({'start': 1, 'end': 1})
@ -42,8 +67,19 @@ class PDFHandler(object):
            P.extend(range(p['start'], p['end'] + 1))
        return sorted(set(P))
-    def __save_page(self, filename, page, temp):
+    def _save_page(self, filename, page, temp):
-        # refactor
+        """Saves specified page from pdf into a temporary directory.
        Parameters
        ----------
        filename : str
            Path to pdf file.
        page : int
            Page number
        temp : str
            Tmp directory
        """
        with open(filename, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
@ -65,28 +101,37 @@ class PDFHandler(object):
                infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
                outfile = PdfFileWriter()
                p = infile.getPage(0)
-                if rotation == 'left':
+                if rotation == 'anticlockwise':
                    p.rotateClockwise(90)
-                elif rotation == 'right':
+                elif rotation == 'clockwise':
                    p.rotateCounterClockwise(90)
                outfile.addPage(p)
                with open(fpath, 'wb') as f:
                    outfile.write(f)
    def parse(self, mesh=False, **kwargs):
-        """
+        """Extracts tables by calling parser.get_tables on all single
        page pdfs.
        Parameters
        ----------
-        mesh
+        mesh : bool (default: False)
-        kwargs
+            Whether or not to use Lattice method of parsing. Stream
            is used by default.
        kwargs : dict
            See camelot.read_pdf kwargs.
        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in pdf.
        geometry : camelot.core.GeometryList
            List of geometry objects (contours, lines, joints)
            found in pdf.
        """
        for p in self.pages:
-            self.__save_page(self.filename, p, self.tempdir)
+            self._save_page(self.filename, p, self.tempdir)
        pages = [os.path.join(self.tempdir, 'page-{0}.pdf'.format(p))
                 for p in self.pages]
        tables = []
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -9,17 +9,31 @@ from .utils import merge_tuples
 def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
-    """
+    """Thresholds an image using OpenCV's adaptiveThreshold.
    Parameters
    ----------
-    imagename
+    imagename : string
-    process_background
+        Path to image file.
-    blocksize
+    process_background : bool, optional (default: False)
-    c
+        Whether or not to process lines that are in background.
    blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    c : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    Returns
    -------
    img : object
        numpy.ndarray representing the original image.
    threshold : object
        numpy.ndarray representing the thresholded image.
    """
    img = cv2.imread(imagename)
@ -35,17 +49,35 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
 def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
-    """
+    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.
    Parameters
    ----------
-    threshold
+    threshold : object
-    direction
+        numpy.ndarray representing the thresholded image.
-    line_size_scaling
+    direction : string, optional (default: 'horizontal')
-    iterations
+        Specifies whether to find vertical or horizontal lines.
    line_size_scaling : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    iterations : int, optional (default: 0)
        Number of times erosion/dilation is applied.
        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    Returns
    -------
    dmask : object
        numpy.ndarray representing pixels where vertical/horizontal
        lines lie.
    lines : list
        List of tuples representing vertical/horizontal lines with
        coordinates relative to a left-top origin in
        image coordinate space.
    """
    lines = []
@ -84,15 +116,21 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
 def find_table_contours(vertical, horizontal):
-    """
+    """Finds table boundaries using OpenCV's findContours.
    Parameters
    ----------
-    vertical
+    vertical : object
-    horizontal
+        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
    Returns
    -------
    cont : list
        List of tuples representing table boundaries. Each tuple is of
        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
        h -> height in image coordinate space.
    """
    mask = vertical + horizontal
@ -114,16 +152,26 @@ def find_table_contours(vertical, horizontal):
 def find_table_joints(contours, vertical, horizontal):
-    """
+    """Finds joints/intersections present inside each table boundary.
    Parameters
    ----------
-    contours
+    contours : list
-    vertical
+        List of tuples representing table boundaries. Each tuple is of
-    horizontal
+        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
        h -> height in image coordinate space.
    vertical : object
        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
    Returns
    -------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as their value.
        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
        and (x2, y2) -> rt in image coordinate space.
    """
    joints = np.bitwise_and(vertical, horizontal)
@ -150,15 +198,24 @@ def find_table_joints(contours, vertical, horizontal):
 def remove_lines(threshold, line_size_scaling=15):
-    """
+    """Removes lines from a thresholded image.
    Parameters
    ----------
-    threshold
+    threshold : object
-    line_size_scaling
+        numpy.ndarray representing the thresholded image.
    line_size_scaling : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    Returns
    -------
    threshold : object
        numpy.ndarray representing the thresholded image
        with horizontal and vertical lines removed.
    """
    size = threshold.shape[0] // line_size_scaling
@ -178,16 +235,23 @@ def remove_lines(threshold, line_size_scaling=15):
 def find_cuts(threshold, char_size_scaling=200):
-    """
+    """Finds cuts made by text projections on y-axis.
    Parameters
    ----------
-    threshold
+    threshold : object
-    char_size_scaling
+        numpy.ndarray representing the thresholded image.
    char_size_scaling : int, optional (default: 200)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    Returns
    -------
-
+    y_cuts : list
        List of cuts on y-axis.
    """
    size = threshold.shape[0] // char_size_scaling
    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
--- a/camelot/io.py
+++ b/camelot/io.py
@ -2,20 +2,93 @@ from .handlers import PDFHandler
 def read_pdf(filepath, pages='1', mesh=False, **kwargs):
-    """
+    """Read PDF and return parsed data tables.
    Note: kwargs annotated with ^ can only be used with mesh=False
    and kwargs annotated with * can only be used with mesh=True.
    Parameters
    ----------
-    filepath
+    filepath : str
-    pages
+        Path to pdf file.
-    mesh
+    pages : str
-    kwargs
+        Comma-separated page numbers to parse.
        Example: 1,3,4 or 1,4-end
    mesh : bool (default: False)
        Whether or not to use Lattice method of parsing. Stream
        is used by default.
    table_area : list, optional (default: None)
        List of table areas to analyze as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    columns^ : list, optional (default: None)
        List of column x-coordinates as strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    row_close_tol^ : int, optional (default: 2)
        Rows will be formed by combining text vertically
        within this tolerance.
    col_close_tol^ : int, optional (default: 0)
        Columns will be formed by combining text horizontally
        within this tolerance.
    process_background* : bool, optional (default: False)
        Whether or not to process lines that are in background.
    line_size_scaling* : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Select one or more strings from above and pass them as a list
        to specify where the text in a spanning cell should flow.
    line_close_tol* : int, optional (default: 2)
        Tolerance parameter used to merge vertical and horizontal
        detected lines which lie close to each other.
    joint_close_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations : int, optional (default: 0)
        Number of times erosion/dilation is applied.
        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    margins : tuple
        PDFMiner margins. (char_margin, line_margin, word_margin)
        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
    debug : bool, optional (default: False)
        Whether or not to return all text objects on the page
        which can be used to generate a matplotlib plot, to get
        values for table_area(s) and debugging.
    Returns
    -------
    tables : camelot.core.TableList
    """
-    # explicit type conversion
+    # validate kwargs?
    p = PDFHandler(filepath, pages)
    tables, __ = p.parse(mesh=mesh, **kwargs)
    return tables
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -5,8 +5,7 @@ from ..utils import get_page_layout, get_text_objects
 class BaseParser(object):
-    """
+    """Defines a base parser.
    """
    def _generate_layout(self, filename):
        self.filename = filename
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -11,7 +11,7 @@ from .base import BaseParser
 from ..core import Table
 from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
                     merge_close_lines, get_table_index, compute_accuracy,
-                     count_empty_strings, encode_, setup_logging)
+                     compute_whitespace, setup_logging, encode_)
 from ..image_processing import (adaptive_threshold, find_lines,
                                find_table_contours, find_table_joints)
@ -20,14 +20,74 @@ logger = setup_logging(__name__)
 class Lattice(BaseParser):
-    """
+    """Lattice method of parsing looks for lines between text
    to form a table.
    Parameters
    ----------
    table_area : list, optional (default: None)
        List of table areas to analyze as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    process_background : bool, optional (default: False)
        Whether or not to process lines that are in background.
    line_size_scaling : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns.
    shift_text : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Select one or more strings from above and pass them as a list
        to specify where the text in a spanning cell should flow.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    line_close_tol : int, optional (default: 2)
        Tolerance parameter used to merge vertical and horizontal
        detected lines which lie close to each other.
    joint_close_tol : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.
        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    margins : tuple
        PDFMiner margins. (char_margin, line_margin, word_margin)
        For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
    debug : bool, optional (default: False)
        Whether or not to return all text objects on the page
        which can be used to generate a matplotlib plot, to get
        values for table_area(s) and debugging.
    """
    def __init__(self, table_area=None, process_background=False,
                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
                 split_text=False, flag_size=False, line_close_tol=2,
                 joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
-                 iterations=0, margins=(1.0, 0.5, 0.1), debug=None):
+                 iterations=0, margins=(1.0, 0.5, 0.1), debug=False):
        self.table_area = table_area
        self.process_background = process_background
        self.line_size_scaling = line_size_scaling
@ -45,6 +105,27 @@ class Lattice(BaseParser):
    @staticmethod
    def _reduce_index(t, idx, shift_text):
        """Reduces index of a text object if it lies within a spanning
        cell.
        Parameters
        ----------
        t : camelot.core.Table
        idx : list
            List of tuples of the form (r_idx, c_idx, text).
        shift_text : list
            {'l', 'r', 't', 'b'}
            Select one or more strings from above and pass them as a
            list to specify where the text in a spanning cell should
            flow.
        Returns
        -------
        indices : list
            List of tuples of the form (r_idx, c_idx, text) where
            r_idx and c_idx are new row and column indices for text.
        """
        indices = []
        for r_idx, c_idx, text in idx:
            for d in shift_text:
@ -69,6 +150,22 @@ class Lattice(BaseParser):
    @staticmethod
    def _copy_spanning_text(t, copy_text=None):
        """Copies over text in empty spanning cells.
        Parameters
        ----------
        t : camelot.core.Table
        copy_text : list, optional (default: None)
            {'h', 'v'}
            Select one or more strings from above and pass them as a list
            to specify the direction in which text should be copied over
            when a cell spans multiple rows or columns.
        Returns
        -------
        t : camelot.core.Table
        """
        for f in copy_text:
            if f == "h":
                for i in range(len(t.cells)):
@ -199,7 +296,7 @@ class Lattice(BaseParser):
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape
-        whitespace, __, __ = count_empty_strings(data)
+        whitespace = compute_whitespace(data)
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
@ -208,16 +305,6 @@ class Lattice(BaseParser):
        return table
    def extract_tables(self, filename):
        """
        Parameters
        ----------
        filename
        Returns
        -------
        """
        logger.info('Processing {}'.format(os.path.basename(filename)))
        self._generate_layout(filename)
@ -237,7 +324,7 @@ class Lattice(BaseParser):
            table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
            _tables.append(table)
-        if self.debug is not None:
+        if self.debug:
            text = []
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -8,19 +8,54 @@ import pandas as pd
 from .base import BaseParser
 from ..core import Table
 from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
-                     count_empty_strings, encode_, setup_logging)
+                     compute_whitespace, setup_logging, encode_)
 logger = setup_logging(__name__)
 class Stream(BaseParser):
-    """
+    """Stream method of parsing looks for spaces between text
    to form a table.
    If you want to specify columns when specifying multiple table
    areas, make sure that the lengths of both lists are equal.
    Parameters
    ----------
    table_area : list, optional (default: None)
        List of table areas to analyze as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates as strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    row_close_tol : int, optional (default: 2)
        Rows will be formed by combining text vertically
        within this tolerance.
    col_close_tol : int, optional (default: 0)
        Columns will be formed by combining text horizontally
        within this tolerance.
    margins : tuple, optional (default: (1.0, 0.5, 0.1))
        PDFMiner margins. (char_margin, line_margin, word_margin)
        For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
    debug : bool, optional (default: False)
        Whether or not to return all text objects on the page
        which can be used to generate a matplotlib plot, to get
        values for table_area(s), columns and debugging.
    """
    def __init__(self, table_area=None, columns=None, split_text=False,
                 flag_size=False, row_close_tol=2, col_close_tol=0,
-                 margins=(1.0, 0.5, 0.1), debug=None):
+                 margins=(1.0, 0.5, 0.1), debug=False):
        self.table_area = table_area
        self.columns = columns
        self._validate_columns()
@ -33,6 +68,20 @@ class Stream(BaseParser):
    @staticmethod
    def _text_bbox(t_bbox):
        """Returns bounding box for the text present on a page.
        Parameters
        ----------
        t_bbox : dict
            Dict with two keys 'horizontal' and 'vertical' with lists of
            LTTextLineHorizontals and LTTextLineVerticals respectively.
        Returns
        -------
        text_bbox : tuple
            Tuple (x0, y0, x1, y1) in pdf coordinate space.
        """
        xmin = min([t.x0 for direction in t_bbox for t in t_bbox[direction]])
        ymin = min([t.y0 for direction in t_bbox for t in t_bbox[direction]])
        xmax = max([t.x1 for direction in t_bbox for t in t_bbox[direction]])
@ -42,6 +91,21 @@ class Stream(BaseParser):
    @staticmethod
    def _group_rows(text, row_close_tol=2):
        """Groups PDFMiner text objects into rows vertically
        within a tolerance.
        Parameters
        ----------
        text : list
            List of PDFMiner text objects.
        row_close_tol : int, optional (default: 2)
        Returns
        -------
        rows : list
            Two-dimensional list of text objects grouped into rows.
        """
        row_y = 0
        rows = []
        temp = []
@ -61,6 +125,21 @@ class Stream(BaseParser):
    @staticmethod
    def _merge_columns(l, col_close_tol=0):
        """Merges column boundaries horizontally if they overlap
        or lie within a tolerance.
        Parameters
        ----------
        l : list
            List of column x-coordinate tuples.
        col_close_tol : int, optional (default: 0)
        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.
        """
        merged = []
        for higher in l:
            if not merged:
@ -89,6 +168,21 @@ class Stream(BaseParser):
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous.
        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int
        Returns
        -------
        rows : list
            List of continuous row y-coordinate tuples.
        """
        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
                    if len(r) > 0 else 0 for r in rows_grouped]
        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
@ -100,6 +194,23 @@ class Stream(BaseParser):
    @staticmethod
    def _add_columns(cols, text, row_close_tol):
        """Adds columns to existing list by taking into account
        the text that lies outside the current column x-coordinates.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text : list
            List of PDFMiner text objects.
        row_close_tol : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        if text:
            text = Stream._group_rows(text, row_close_tol=row_close_tol)
            elements = [len(r) for r in text]
@ -110,6 +221,21 @@ class Stream(BaseParser):
    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
        text_x_max : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
@ -207,7 +333,7 @@ class Stream(BaseParser):
        table.df = pd.DataFrame(data)
        table.shape = table.df.shape
-        whitespace, __, __ = count_empty_strings(data)
+        whitespace = compute_whitespace(data)
        table.accuracy = accuracy
        table.whitespace = whitespace
        table.order = table_idx + 1
@ -216,16 +342,6 @@ class Stream(BaseParser):
        return table
    def extract_tables(self, filename):
        """
        Parameters
        ----------
        filename
        Returns
        -------
        """
        logger.info('Processing {}'.format(os.path.basename(filename)))
        self._generate_layout(filename)
@ -244,7 +360,7 @@ class Stream(BaseParser):
            table = self._generate_table(table_idx, cols, rows)
            _tables.append(table)
-        if self.debug is not None:
+        if self.debug:
            text = []
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
            text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -6,19 +6,101 @@ from .handlers import PDFHandler
 def plot_geometry(filepath, pages='1', mesh=False, geometry_type='text', **kwargs):
-    """
+    """Plot geometry found on pdf page based on type specified,
    useful for debugging and playing with different parameters to get
    the best output.
    Note: kwargs annotated with ^ can only be used with mesh=False
    and kwargs annotated with * can only be used with mesh=True.
    Parameters
    ----------
-    filepath
+    filepath : str
-    pages
+        Path to pdf file.
-    mesh
+    pages : str
-    geometry_type
+        Comma-separated page numbers to parse.
-    kwargs
+        Example: 1,3,4 or 1,4-end
    mesh : bool (default: False)
        Whether or not to use Lattice method of parsing. Stream
        is used by default.
    geometry_type : str, optional (default: 'text')
        'text' : Plot text objects found on page, useful to get
                 table_area and columns coordinates.
        'table' : Plot parsed table.
        'contour'* : Plot detected rectangles.
        'joint'* : Plot detected line intersections.
        'line'* : Plot detected lines.
    table_area : list, optional (default: None)
        List of table areas to analyze as strings of the form
        x1,y1,x2,y2 where (x1, y1) -> left-top and
        (x2, y2) -> right-bottom in pdf coordinate space.
    columns^ : list, optional (default: None)
        List of column x-coordinates as strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    row_close_tol^ : int, optional (default: 2)
        Rows will be formed by combining text vertically
        within this tolerance.
    col_close_tol^ : int, optional (default: 0)
        Columns will be formed by combining text horizontally
        within this tolerance.
    process_background* : bool, optional (default: False)
        Whether or not to process lines that are in background.
    line_size_scaling* : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    copy_text* : list, optional (default: None)
        {'h', 'v'}
        Select one or more strings from above and pass them as a list
        to specify the direction in which text should be copied over
        when a cell spans multiple rows or columns.
    shift_text* : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Select one or more strings from above and pass them as a list
        to specify where the text in a spanning cell should flow.
    line_close_tol* : int, optional (default: 2)
        Tolerance parameter used to merge vertical and horizontal
        detected lines which lie close to each other.
    joint_close_tol* : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations : int, optional (default: 0)
        Number of times erosion/dilation is applied.
        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    margins : tuple
        PDFMiner margins. (char_margin, line_margin, word_margin)
        For more information, refer to the `PDFMiner docs <https://euske.github.io/pdfminer/>`_.
    debug : bool, optional (default: False)
        Whether or not to return all text objects on the page
        which can be used to generate a matplotlib plot, to get
        values for table_area(s) and debugging.
    """
-    # explicit type conversion
+    # validate kwargs?
    p = PDFHandler(filepath, pages)
-    kwargs.update({'debug': geometry_type})
+    debug = True if geometry_type else False
    kwargs.update({'debug': debug})
    __, geometry = p.parse(mesh=mesh, **kwargs)
    if geometry_type == 'text':
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -19,14 +19,15 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
 def setup_logging(name):
-    """
+    """Sets up a logger with StreamHandler.
    Parameters
    ----------
-    name
+    name : str
    Returns
    -------
    logger : logging.Logger
    """
    logger = logging.getLogger(name)
@ -47,15 +48,16 @@ logger = setup_logging(__name__)
 def translate(x1, x2):
-    """
+    """Translates x2 by x1.
    Parameters
    ----------
-    x1
+    x1 : float
-    x2
+    x2 : float
    Returns
    -------
    x2 : float
    """
    x2 += x1
@ -63,15 +65,16 @@ def translate(x1, x2):
 def scale(x, s):
-    """
+    """Scales x by scaling factor s.
    Parameters
    ----------
-    x
+    x : float
-    s
+    s : float
    Returns
    -------
    x : float
    """
    x *= s
@ -79,18 +82,21 @@ def scale(x, s):
 def rotate(x1, y1, x2, y2, angle):
-    """
+    """Rotates point x2, y2 about point x1, y1 by angle.
    Parameters
    ----------
-    x1
+    x1 : float
-    y1
+    y1 : float
-    x2
+    x2 : float
-    y2
+    y2 : float
-    angle
+    angle : float
        Angle in radians.
    Returns
    -------
    xnew : float
    ynew : float
    """
    s = np.sin(angle)
@ -105,15 +111,26 @@ def rotate(x1, y1, x2, y2, angle):
 def scale_pdf(k, factors):
-    """
+    """Translates and scales pdf coordinate space to image
    coordinate space.
    Parameters
    ----------
-    k
+    k : tuple
-    factors
+        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
        space.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
        first two elements are scaling factors and pdf_y is height of
        pdf.
    Returns
    -------
    knew : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
        space.
    """
    x1, y1, x2, y2 = k
@ -127,17 +144,28 @@ def scale_pdf(k, factors):
 def scale_image(tables, v_segments, h_segments, factors):
-    """
+    """Translates and scales image coordinate space to pdf
    coordinate space.
    Parameters
    ----------
-    tables
+    tables : dict
-    v_segments
+        Dict with table boundaries as keys and list of intersections
-    h_segments
+        in that boundary as value.
-    factors
+    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
        first two elements are scaling factors and img_y is height of
        image.
    Returns
    -------
    tables_new : dict
    v_segments_new : dict
    h_segments_new : dict
    """
    scaling_factor_x, scaling_factor_y, img_y = factors
@ -172,16 +200,23 @@ def scale_image(tables, v_segments, h_segments, factors):
 def get_rotation(lttextlh, lttextlv, ltchar):
-    """
+    """Detects if text in table is rotated or not using the current
    transformation matrix (CTM) and returns its orientation.
    Parameters
    ----------
-    lttextlh
+    lttextlh : list
-    lttextlv
+        List of PDFMiner LTTextLineHorizontal objects.
-    ltchar
+    lttextlv : list
        List of PDFMiner LTTextLineVertical objects.
    ltchar : list
        List of PDFMiner LTChar objects.
    Returns
    -------
    rotation : string
        '' if text in table is upright, 'left' if rotated 90 degree
        anticlockwise and 'right' if rotated 90 degree clockwise.
    """
    rotation = ''
@ -190,21 +225,30 @@ def get_rotation(lttextlh, lttextlv, ltchar):
    if hlen < vlen:
        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
-        rotation = 'left' if clockwise < anticlockwise else 'right'
+        rotation = 'clockwise' if clockwise < anticlockwise else 'anticlockwise'
    return rotation
 def segments_in_bbox(bbox, v_segments, h_segments):
-    """
+    """Returns all line segments present inside a bounding box.
    Parameters
    ----------
-    bbox
+    bbox : tuple
-    v_segments
+        Tuple (x1, y1, x2, y2) representing a bounding box where
-    h_segments
+        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
        space.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    Returns
    -------
    v_s : list
        List of vertical line segments that lie inside table.
    h_s : list
        List of horizontal line segments that lie inside table.
    """
    lb = (bbox[0], bbox[1])
@ -217,35 +261,42 @@ def segments_in_bbox(bbox, v_segments, h_segments):
 def text_in_bbox(bbox, text):
-    """
+    """Returns all text objects present inside a bounding box.
    Parameters
    ----------
-    bbox
+    bbox : tuple
-    text
+        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
        space.
    text : list
        List of PDFMiner text objects.
    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table.
    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
-                 <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
+                <= rt[0] + 2 and lb[1] - 2 <= (t.y0 + t.y1) / 2.0
-                 <= rt[1] + 2]
+                <= rt[1] + 2]
    return t_bbox
 def remove_close_lines(ar, line_close_tol=2):
-    """
+    """Removes lines which are within a tolerance, based on their x or
    y axis projections.
    Parameters
    ----------
-    ar
+    ar : list
-    line_close_tol
+    line_close_tol : int, optional (default: 2)
    Returns
    -------
    ret : list
    """
    ret = []
@ -262,15 +313,17 @@ def remove_close_lines(ar, line_close_tol=2):
 def merge_close_lines(ar, line_close_tol=2):
-    """
+    """Merges lines which are within a tolerance by calculating a
    moving mean, based on their x or y axis projections.
    Parameters
    ----------
-    ar
+    ar : list
-    line_close_tol
+    line_close_tol : int, optional (default: 2)
    Returns
    -------
    ret : list
    """
    ret = []
@ -288,15 +341,19 @@ def merge_close_lines(ar, line_close_tol=2):
 def flag_font_size(textline, direction):
-    """
+    """Flags super/subscripts in text by enclosing them with <s></s>.
    May give false positives.
    Parameters
    ----------
-    textline
+    textline : list
-    direction
+        List of PDFMiner LTChar objects.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    Returns
    -------
    fstring : string
    """
    if direction == 'horizontal':
@ -324,18 +381,27 @@ def flag_font_size(textline, direction):
    return fstring
-def split_textline(table, textline, direction, flag_size=True):
+def split_textline(table, textline, direction, flag_size=False):
-    """
+    """Splits PDFMiner LTTextLine into substrings if it spans across
    multiple rows/columns.
    Parameters
    ----------
-    table
+    table : camelot.core.Table
-    textline
+    textline : object
-    direction
+        PDFMiner LTTextLine object.
-    flag_size
+    direction : string
        Direction of the PDFMiner LTTextLine object.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    Returns
    -------
    grouped_chars : list
        List of tuples of the form (idx, text) where idx is the index
        of row/column and text is an lttextline substring.
    """
    idx = 0
@ -388,19 +454,38 @@ def split_textline(table, textline, direction, flag_size=True):
    return grouped_chars
-def get_table_index(table, t, direction, split_text=False, flag_size=True):
+def get_table_index(table, t, direction, split_text=False, flag_size=False):
-    """
+    """Gets indices of the table cell where given text object lies by
    comparing their y and x-coordinates.
    Parameters
    ----------
-    table
+    table : camelot.core.Table
-    t
+    t : object
-    direction
+        PDFMiner LTTextLine object.
-    split_text
+    direction : string
-    flag_size
+        Direction of the PDFMiner LTTextLine object.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
    Returns
    -------
    indices : list
        List of tuples of the form (r_idx, c_idx, text) where r_idx
        and c_idx are row and column indices.
    error : float
        Assignment error, percentage of text area that lies outside
        a cell.
        +-------+
        |       |
        |   [Text bounding box]
        |       |
        +-------+
    """
    r_idx, c_idx = [-1] * 2
@ -450,14 +535,19 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
 def compute_accuracy(error_weights):
-    """
+    """Calculates a score based on weights assigned to various
    parameters and their error percentages.
    Parameters
    ----------
-    error_weights
+    error_weights : list
        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
        where pn is the weight assigned to list of errors en.
        Sum of pn should be equal to 100.
    Returns
    -------
    score : float
    """
    SCORE_VAL = 100
@ -474,50 +564,40 @@ def compute_accuracy(error_weights):
    return score
-def count_empty_strings(d):
+def compute_whitespace(d):
-    """
+    """Calculates the percentage of empty strings in a
    two-dimensional list.
    Parameters
    ----------
-    d
+    d : list
    Returns
    -------
    whitespace : float
        Percentage of empty cells.
    """
-    empty_p = 0
+    whitespace = 0
    r_nempty_cells, c_nempty_cells = [], []
    for i in d:
        for j in i:
            if j.strip() == '':
-                empty_p += 1
+                whitespace += 1
-    empty_p = 100 * (empty_p / float(len(d) * len(d[0])))
+    whitespace = 100 * (whitespace / float(len(d) * len(d[0])))
-    for row in d:
+    return whitespace
        r_nempty_c = 0
        for r in row:
            if r.strip() != '':
                r_nempty_c += 1
        r_nempty_cells.append(r_nempty_c)
    d = zip(*d)
    d = [list(col) for col in d]
    for col in d:
        c_nempty_c = 0
        for c in col:
            if c.strip() != '':
                c_nempty_c += 1
        c_nempty_cells.append(c_nempty_c)
    return empty_p, r_nempty_cells, c_nempty_cells
-def remove_empty_strings(d):
+def remove_empty(d):
-    """
+    """Removes empty rows and columns from a two-dimensional list.
    Parameters
    ----------
-    d
+    d : list
    Returns
    -------
    d : list
    """
    for i, row in enumerate(d):
@ -530,70 +610,46 @@ def remove_empty_strings(d):
 def encode_(ar):
-    """
+    """Encodes each string in a two-dimensional list as UTF-8.
    Parameters
    ----------
-    ar
+    ar : list
    Returns
    -------
    ar : list
    """
    ar = [[r.encode('utf-8') for r in row] for row in ar]
    return ar
-def get_text_objects(layout, ltype="char", t=None):
+def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1,
    """
    Parameters
    ----------
    layout
    ltype
    t
    Returns
    -------
    """
    if ltype == "char":
        LTObject = LTChar
    elif ltype == "lh":
        LTObject = LTTextLineHorizontal
    elif ltype == "lv":
        LTObject = LTTextLineVertical
    if t is None:
        t = []
    try:
        for obj in layout._objs:
            if isinstance(obj, LTObject):
                t.append(obj)
            else:
                t += get_text_objects(obj, ltype=ltype)
    except AttributeError:
        pass
    return t
 def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
               detect_vertical=True, all_texts=True):
-    """
+    """Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. See https://euske.github.io/pdfminer/ to get definitions
    of kwargs.
    Parameters
    ----------
-    pname
+    filename : string
-    char_margin
+        Path to pdf file.
-    line_margin
+    char_margin : float
-    word_margin
+    line_margin : float
-    detect_vertical
+    word_margin : float
-    all_texts
+    detect_vertical : bool
    all_texts : bool
    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        Dimension of pdf page in the form (width, height).
    """
-    with open(pname, 'r') as f:
+    with open(filename, 'r') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        if not document.is_extractable:
@ -615,12 +671,56 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
        return layout, dim
-def merge_tuples(tuples):
+def get_text_objects(layout, ltype="char", t=None):
-    """
+    """Recursively parses pdf layout to get a list of
    PDFMiner text objects.
    Parameters
    ----------
-    tuples
+    layout : object
        PDFMiner LTPage object.
    ltype : string
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
    t : list
    Returns
    -------
    t : list
        List of PDFMiner text objects.
    """
    if ltype == "char":
        LTObject = LTChar
    elif ltype == "lh":
        LTObject = LTTextLineHorizontal
    elif ltype == "lv":
        LTObject = LTTextLineVertical
    if t is None:
        t = []
    try:
        for obj in layout._objs:
            if isinstance(obj, LTObject):
                t.append(obj)
            else:
                t += get_text_objects(obj, ltype=ltype)
    except AttributeError:
        pass
    return t
 def merge_tuples(tuples):
    """Merges a list of overlapping tuples.
    Parameters
    ----------
    tuples : list
        List of tuples where a tuple is a single axis coordinate pair.
    Yields
    ------
    tuple
    """
    merged = list(tuples[0])
    for s, e in tuples:
--- a/docs/api.rst
+++ b/docs/api.rst
@ -4,17 +4,37 @@
 API Reference
 =============
-Pdf
+camelot.read_pdf
-===
+================
-.. automodule:: camelot.pdf
+.. autofunction:: camelot.read_pdf
   :members:
-Lattice
+camelot.handlers.PDFHandler
-=======
+===========================
-.. automodule:: camelot.lattice
+.. autoclass:: camelot.handlers.PDFHandler
   :members:
-Stream
+camelot.parsers.Stream
-======
+======================
-.. automodule:: camelot.stream
+.. autoclass:: camelot.parsers.Stream
   :members:
 camelot.parsers.Lattice
 =======================
 .. autoclass:: camelot.parsers.Lattice
   :members:
 camelot.core.Cell
 =================
 .. autoclass:: camelot.core.Cell
   :members:
 camelot.core.Table
 ==================
 .. autoclass:: camelot.core.Table
   :members:
 camelot.core.TableList
 ======================
 .. autoclass:: camelot.core.TableList
   :members:
--- a/docs/index.rst
+++ b/docs/index.rst
@ -3,11 +3,11 @@
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
-==================================
+=====================================
-Camelot: pdf parsing made simpler!
+Camelot: PDF Table Parsing for Humans
-==================================
+=====================================
-Camelot is a Python 2.7 library and command-line tool for getting tables out of pdf files.
+Camelot is a Python 2.7 library and command-line tool for extracting tabular data from PDF files.
 Why another pdf table parsing library?
 ======================================
@ -32,12 +32,22 @@ Usage
 ::
-    >>> from camelot.pdf import Pdf
+    >>> import camelot
-    >>> from camelot.lattice import Lattice
+    >>> tables = camelot.read_pdf("foo.pdf")
-
+    >>> tables
-    >>> manager = Pdf(Lattice(), 'us-030.pdf')
+    <TableList n=2>
-    >>> tables = manager.extract()
+    >>> tables.export("foo.csv", f="csv", compress=True) # json, excel, html
-    >>> print tables['page-1']['table-1']['data']
+    >>> tables[0]
    <Table shape=(3,4)>
    >>> tables[0].to_csv("foo.csv") # to_json, to_excel, to_html
    >>> tables[0].parsing_report
    {
        "accuracy": 96,
        "whitespace": 80,
        "order": 1,
        "page": 1
    }
    >>> df = tables[0].df
 .. csv-table::
   :header: "Cycle Name","KI (1/km)","Distance (mi)","Percent Fuel Savings","","",""
@ -49,45 +59,6 @@ Usage
   "2032_2","0.17","57.8","21.7%","0.3%","2.7%","1.2%"
   "4171_1","0.07","173.9","58.1%","1.6%","2.1%","0.5%"
 Camelot comes with a CLI where you can specify page numbers, output format, output directory etc. By default, the output files are placed in the same directory as the PDF.
 ::
    Camelot: PDF parsing made simpler!
    usage:
     camelot [options] <method> [<args>...]
    options:
     -h, --help                Show this screen.
     -v, --version             Show version.
     -V, --verbose             Verbose.
     -p, --pages <pageno>      Comma-separated list of page numbers.
                               Example: -p 1,3-6,10  [default: 1]
     -P, --parallel            Parallelize the parsing process.
     -f, --format <format>     Output format. (csv,tsv,html,json,xlsx) [default: csv]
     -l, --log                 Log to file.
     -o, --output <directory>  Output directory.
     -M, --cmargin <cmargin>   Char margin. Chars closer than cmargin are
                               grouped together to form a word. [default: 1.0]
     -L, --lmargin <lmargin>   Line margin. Lines closer than lmargin are
                               grouped together to form a textbox. [default: 0.5]
     -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                               if distance between words is greater than word
                               margin. [default: 0.1]
     -J, --split_text          Split text lines if they span across multiple cells.
     -K, --flag_size           Flag substring if its size differs from the whole string.
                               Useful for super and subscripts.
     -X, --print-stats         List stats on the parsing process.
     -Y, --save-stats          Save stats to a file.
     -Z, --plot <dist>         Plot distributions. (page,all,rc)
    camelot methods:
     lattice  Looks for lines between data.
     stream   Looks for spaces between data.
    See 'camelot <method> -h' for more information on a specific method.
 Installation
 ============
@ -95,42 +66,41 @@ Make sure you have the most updated versions for `pip` and `setuptools`. You can
    pip install -U pip setuptools
-The required dependencies include `numpy`_, `OpenCV`_ and `ImageMagick`_.
+The dependencies include `tk`_ and `ghostscript`_.
-.. _numpy: http://www.numpy.org/
+.. _tk: https://wiki.tcl.tk/3743
-.. _OpenCV: http://opencv.org/
+.. _ghostscript: https://www.ghostscript.com/
 .. _ImageMagick: http://www.imagemagick.org/script/index.php
 Installing dependencies
 -----------------------
-numpy can be install using `pip`. OpenCV and imagemagick can be installed using your system's default package manager.
+tk and ghostscript can be installed using your system's default package manager.
 Linux
 ^^^^^
 * Arch Linux
 ::
    sudo pacman -S opencv imagemagick
 * Ubuntu
 ::
-    sudo apt-get install libopencv-dev python-opencv imagemagick
+    sudo apt-get install python-opencv python-tk ghostscript
 * Arch Linux
 ::
    sudo pacman -S opencv tk ghostscript
 OS X
 ^^^^
 ::
-    brew install homebrew/science/opencv imagemagick
+    brew install homebrew/science/opencv ghostscript
 Finally, `cd` into the project directory and install by::
-    make install
+    python setup.py install
 API Reference
 =============
@ -150,14 +120,14 @@ You can check the latest sources with the command::
 Contributing
 ------------
-See :doc:`Contributing doc <contributing>`.
+See :doc:`Contributing guidelines <contributing>`.
 Testing
 -------
 ::
-    make test
+    python setup.py test
 License
 =======
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -0,0 +1,11 @@
 click==6.7
 matplotlib==2.2.3
 numpy==1.13.3
 opencv-python==3.4.2.17
 pandas==0.23.4
 pdfminer==20140328
 Pillow==5.2.0
 PyPDF2==1.26.0
 pytest==3.8.0
 pytest-runner==4.2
 Sphinx==1.8.0b1
--- a/requirements.txt
+++ b/requirements.txt
@ -1,8 +1,8 @@
-docopt==0.6.2
+click==6.7
 matplotlib==2.2.3
-nose==1.3.7
+numpy==1.13.3
 opencv-python==3.4.2.17
 pandas==0.23.4
 pdfminer==20140328
 pyexcel-xlsx==0.5.6
 Pillow==5.2.0
-PyPDF2==1.26.0
+PyPDF2==1.26.0
 Sphinx==1.8.0b1
--- a/setup.py
+++ b/setup.py
@ -4,12 +4,12 @@ import camelot
 NAME = 'camelot'
 VERSION = camelot.__version__
-DESCRIPTION = 'camelot parses tables from PDFs!'
+DESCRIPTION = 'PDF Table Parsing for Humans'
 with open('README.md') as f:
    LONG_DESCRIPTION = f.read()
 URL = 'https://github.com/socialcopsdev/camelot'
 AUTHOR = 'Vinayak Mehta'
-AUTHOR_EMAIL = 'vinayak@socialcops.com'
+AUTHOR_EMAIL = 'vmehta94@gmail.com'
 LICENSE = 'BSD License'
 opencv_min_version = '2.4.8'
@ -58,18 +58,14 @@ def setup_package():
    opencv_status = get_opencv_status()
    opencv_req_str = "camelot requires OpenCV >= {0}.\n".format(opencv_min_version)
    instructions = ("Installation instructions are available in the README at "
                    "https://github.com/socialcopsdev/camelot")
    if opencv_status['up_to_date'] is False:
        if opencv_status['version']:
-            raise ImportError("Your installation of OpenCV "
+            raise ImportError("Your installation of OpenCV {} is out-of-date.\n{}"
-                              "{0} is out-of-date.\n{1}{2}"
+                              .format(opencv_status['version'], opencv_req_str))
                              .format(opencv_status['version'],
                                      opencv_req_str, instructions))
        else:
-            raise ImportError("OpenCV is not installed.\n{0}{1}"
+            raise ImportError("OpenCV is not installed.\n{}"
-                              .format(opencv_req_str, instructions))
+                              .format(opencv_req_str))
    setup(**metadata)