diff --git a/README.md b/README.md index 790b2b2..78485aa 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Camelot: PDF Table Parsing for Humans +# Camelot: PDF Table Extraction for Humans ![license](https://img.shields.io/badge/license-MIT-lightgrey.svg) ![python-version](https://img.shields.io/badge/python-2.7-blue.svg) @@ -38,7 +38,7 @@ | 2032_2 | 0.17 | 57.8 | 21.7% | 0.3% | 2.7% | 1.2% | | 4171_1 | 0.07 | 173.9 | 58.1% | 1.6% | 2.1% | 0.5% | -There's a [command-line interface]() too! +There's a [command-line interface](http://camelot-py.readthedocs.io/en/master/user/cli.html) too! ## Why Camelot? @@ -46,13 +46,12 @@ There's a [command-line interface]() too! - **Metrics**: *Bad* tables can be discarded based on metrics like accuracy and whitespace, without ever having to manually look at each table. - Each table is a **pandas DataFrame**, which enables seamless integration into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873). - **Export** to multiple formats, including json, excel and html. -- Simple and Elegant API, written in **Python**! -See [comparison with other PDF parsing libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Parsing-libraries-and-tools). +See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools). ## Installation -After [installing the dependencies](), you can simply use pip to install Camelot: +After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html), [tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/), you can simply use pip to install Camelot:
 $ pip install camelot-py
@@ -60,7 +59,7 @@ $ pip install camelot-py
 
 ### Alternatively
 
-You can install the dependencies [tk](https://packages.ubuntu.com/trusty/python-tk) and [ghostscript](https://www.ghostscript.com/) using your system's package manager. After that, clone the repo using:
+After [installing the dependencies](http://camelot-py.readthedocs.io/en/master/user/install.html), clone the repo using:
 
 
 $ git clone https://www.github.com/socialcopsdev/camelot
@@ -77,7 +76,7 @@ Note: Use a [virtualenv](https://virtualenv.pypa.io/en/stable/) if you don't wan
 
 ## Documentation
 
-Great documentation is available at [insert link]().
+Great documentation is available at [camelot-py.readthedocs.io](http://camelot-py.readthedocs.io/).
 
 ## Development
 
diff --git a/camelot/__init__.py b/camelot/__init__.py
index b762cea..72f362e 100644
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 from .__version__ import __version__
 
 from .io import read_pdf
\ No newline at end of file
diff --git a/camelot/__version__.py b/camelot/__version__.py
index 7341562..485c17e 100644
--- a/camelot/__version__.py
+++ b/camelot/__version__.py
@@ -1,3 +1,11 @@
+# -*- coding: utf-8 -*-
+
 VERSION = (0, 1, 0)
 
+__title__ = 'camelot-py'
+__description__ = 'PDF Table Extraction for Humans.'
+__url__ = 'http://camelot-py.readthedocs.io/'
 __version__ = '.'.join(map(str, VERSION))
+__author__ = 'Vinayak Mehta'
+__author_email__ = 'vmehta94@gmail.com'
+__license__ = 'MIT License'
\ No newline at end of file
diff --git a/camelot/cli.py b/camelot/cli.py
index 02e37d3..af09b24 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+
 from pprint import pprint
 
 import click
@@ -20,23 +21,22 @@ pass_config = click.make_pass_decorator(Config)
 
 @click.group()
 @click.version_option(version=__version__)
-@click.option('-p', '--pages', default='1', help='Comma-separated page numbers'
-              ' to parse. Example: 1,3,4 or 1,4-end')
-@click.option('-o', '--output', help='Output filepath.')
+@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
+              ' Example: 1,3,4 or 1,4-end.')
+@click.option('-o', '--output', help='Output file path.')
 @click.option('-f', '--format',
               type=click.Choice(['csv', 'json', 'excel', 'html']),
               help='Output file format.')
-@click.option('-z', '--zip', is_flag=True, help='Whether or not to create a ZIP'
-              ' archive.')
-@click.option('-split', '--split_text', is_flag=True, help='Whether or not to'
-              ' split text if it spans across multiple cells.')
-@click.option('-flag', '--flag_size', is_flag=True, help='(inactive) Whether or'
-              ' not to flag text which has uncommon size. (Useful to detect'
-              ' super/subscripts)')
+@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
+@click.option('-split', '--split_text', is_flag=True,
+              help='Split text that spans across multiple cells.')
+@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on'
+              ' font size. Useful to detect super/subscripts.')
 @click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1),
-              help='char_margin, line_margin, word_margin for PDFMiner.')
+              help='PDFMiner char_margin, line_margin and word_margin.')
 @click.pass_context
 def cli(ctx, *args, **kwargs):
+    """Camelot: PDF Table Extraction for Humans"""
     ctx.obj = Config()
     for key, value in kwargs.iteritems():
         ctx.obj.set_config(key, value)
@@ -44,45 +44,42 @@ def cli(ctx, *args, **kwargs):
 
 @cli.command('lattice')
 @click.option('-T', '--table_area', default=[], multiple=True,
-              help='Table areas (x1,y1,x2,y2) to process.\n'
-              ' x1, y1 -> left-top and x2, y2 -> right-bottom')
+              help='Table areas to process. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-back', '--process_background', is_flag=True,
-              help='Whether or not to process lines that are in'
-              ' background.')
+              help='Process background lines.')
 @click.option('-scale', '--line_size_scaling', default=15,
-              help='Factor by which the page dimensions will be'
-              ' divided to get smallest length of detected lines.')
+              help='Line size scaling factor. The larger the value,'
+              ' the smaller the detected lines.')
 @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
-              multiple=True, help='Specify direction'
-              ' in which text will be copied over in a spanning cell.')
+              multiple=True, help='Direction in which text in a spanning cell'
+              ' will be copied over.')
 @click.option('-shift', '--shift_text', default=['l', 't'],
               type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True,
-              help='Specify direction in which text in a spanning'
-              ' cell should flow.')
+              help='Direction in which text in a spanning cell will flow.')
 @click.option('-l', '--line_close_tol', default=2,
               help='Tolerance parameter used to merge close vertical'
-              ' lines and close horizontal lines.')
+              ' and horizontal lines.')
 @click.option('-j', '--joint_close_tol', default=2,
               help='Tolerance parameter used to decide whether'
               ' the detected lines and points lie close to each other.')
 @click.option('-block', '--threshold_blocksize', default=15,
               help='For adaptive thresholding, size of a pixel'
               ' neighborhood that is used to calculate a threshold value for'
-              ' the pixel: 3, 5, 7, and so on.')
+              ' the pixel. Example: 3, 5, 7, and so on.')
 @click.option('-const', '--threshold_constant', default=-2,
               help='For adaptive thresholding, constant subtracted'
-              ' from the mean or weighted mean.\nNormally, it is positive but'
+              ' from the mean or weighted mean. Normally, it is positive but'
               ' may be zero or negative as well.')
 @click.option('-I', '--iterations', default=0,
-              help='Number of times for erosion/dilation is'
-              ' applied.')
+              help='Number of times erosion/dilation will be applied.')
 @click.option('-plot', '--plot_type',
               type=click.Choice(['text', 'table', 'contour', 'joint', 'line']),
-              help='Plot geometry found on PDF page for debugging.')
+              help='Plot geometry found on PDF page, for debugging.')
 @click.argument('filepath', type=click.Path(exists=True))
 @pass_config
 def lattice(c, *args, **kwargs):
-    """Use lines between text to parse table."""
+    """Use lines between text to parse the table."""
     conf = c.config
     pages = conf.pop('pages')
     output = conf.pop('output')
@@ -105,29 +102,29 @@ def lattice(c, *args, **kwargs):
             table.plot(plot_type)
     else:
         if output is None:
-            raise click.UsageError('Please specify output filepath using --output')
+            raise click.UsageError('Please specify output file path using --output')
         if f is None:
-            raise click.UsageError('Please specify output format using --format')
+            raise click.UsageError('Please specify output file format using --format')
         tables.export(output, f=f, compress=compress)
 
 
 @cli.command('stream')
 @click.option('-T', '--table_area', default=[], multiple=True,
-              help='Table areas (x1,y1,x2,y2) to process.\n'
-              ' x1, y1 -> left-top and x2, y2 -> right-bottom')
+              help='Table areas to process. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-C', '--columns', default=[], multiple=True,
-              help='x-coordinates of column separators.')
-@click.option('-r', '--row_close_tol', default=2, help='Rows will be'
-              ' formed by combining text vertically within this tolerance.')
-@click.option('-c', '--col_close_tol', default=0, help='Columns will'
-              ' be formed by combining text horizontally within this tolerance.')
+              help='X coordinates of column separators.')
+@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter'
+              ' used to combine text vertically, to generate rows.')
+@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter'
+              ' used to combine text horizontally, to generate columns.')
 @click.option('-plot', '--plot_type',
               type=click.Choice(['text', 'table']),
               help='Plot geometry found on PDF page for debugging.')
 @click.argument('filepath', type=click.Path(exists=True))
 @pass_config
 def stream(c, *args, **kwargs):
-    """Use spaces between text to parse table."""
+    """Use spaces between text to parse the table."""
     conf = c.config
     pages = conf.pop('pages')
     output = conf.pop('output')
@@ -149,7 +146,7 @@ def stream(c, *args, **kwargs):
             table.plot(plot_type)
     else:
         if output is None:
-            raise click.UsageError('Please specify output filepath using --output')
+            raise click.UsageError('Please specify output file path using --output')
         if f is None:
-            raise click.UsageError('Please specify output format using --format')
+            raise click.UsageError('Please specify output file format using --format')
         tables.export(output, f=f, compress=compress)
\ No newline at end of file
diff --git a/camelot/core.py b/camelot/core.py
index a7f8d78..0658236 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import os
 import json
 import zipfile
@@ -11,7 +13,7 @@ from .plotting import *
 
 class Cell(object):
     """Defines a cell in a table with coordinates relative to a
-    left-bottom origin. (pdf coordinate space)
+    left-bottom origin. (PDF coordinate space)
 
     Parameters
     ----------
@@ -89,7 +91,7 @@ class Cell(object):
 
 class Table(object):
     """Defines a table with coordinates relative to a left-bottom
-    origin. (pdf coordinate space)
+    origin. (PDF coordinate space)
 
     Parameters
     ----------
@@ -110,9 +112,9 @@ class Table(object):
     whitespace : float
         Percentage of whitespace in the table.
     order : int
-        Table number on pdf page.
+        Table number on PDF page.
     page : int
-        Pdf page number.
+        PDF page number.
 
     """
     def __init__(self, cols, rows):
diff --git a/camelot/handlers.py b/camelot/handlers.py
index 0ea9785..40f4074 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import os
 
 from PyPDF2 import PdfFileReader, PdfFileWriter
@@ -10,16 +12,16 @@ from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
 
 class PDFHandler(object):
     """Handles all operations like temp directory creation, splitting
-    file into single page pdfs, parsing each pdf and then removing the
+    file into single page PDFs, parsing each PDF and then removing the
     temp directory.
 
     Parameters
     ----------
     filename : str
-        Path to pdf file.
+        Path to PDF file.
     pages : str, optional (default: '1')
-        Comma-separated page numbers to parse.
-        Example: 1,3,4 or 1,4-end
+        Comma-separated page numbers.
+        Example: 1,3,4 or 1,4-end.
 
     """
     def __init__(self, filename, pages='1'):
@@ -34,10 +36,10 @@ class PDFHandler(object):
         Parameters
         ----------
         filename : str
-            Path to pdf file.
+            Path to PDF file.
         pages : str, optional (default: '1')
-            Comma-separated page numbers to parse.
-            Example: 1,3,4 or 1,4-end
+            Comma-separated page numbers.
+            Example: 1,3,4 or 1,4-end.
 
         Returns
         -------
@@ -67,16 +69,16 @@ class PDFHandler(object):
         return sorted(set(P))
 
     def _save_page(self, filename, page, temp):
-        """Saves specified page from pdf into a temporary directory.
+        """Saves specified page from PDF into a temporary directory.
 
         Parameters
         ----------
         filename : str
-            Path to pdf file.
+            Path to PDF file.
         page : int
-            Page number
+            Page number.
         temp : str
-            Tmp directory
+            Tmp directory.
 
         """
         with open(filename, 'rb') as fileobj:
@@ -91,7 +93,7 @@ class PDFHandler(object):
             with open(fpath, 'wb') as f:
                 outfile.write(f)
             layout, dim = get_page_layout(fpath)
-            # fix rotated pdf
+            # fix rotated PDF
             lttextlh = get_text_objects(layout, ltype="lh")
             lttextlv = get_text_objects(layout, ltype="lv")
             ltchar = get_text_objects(layout, ltype="char")
@@ -114,7 +116,7 @@ class PDFHandler(object):
 
     def parse(self, flavor='lattice', **kwargs):
         """Extracts tables by calling parser.get_tables on all single
-        page pdfs.
+        page PDFs.
 
         Parameters
         ----------
@@ -127,10 +129,10 @@ class PDFHandler(object):
         Returns
         -------
         tables : camelot.core.TableList
-            List of tables found in pdf.
+            List of tables found in PDF.
         geometry : camelot.core.GeometryList
-            List of geometry objects (contours, lines, joints)
-            found in pdf.
+            List of geometry objects (contours, lines, joints) found
+            in PDF.
 
         """
         tables = []
diff --git a/camelot/image_processing.py b/camelot/image_processing.py
index 23923b2..d3ae8ef 100644
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 from __future__ import division
 from itertools import groupby
 from operator import itemgetter
diff --git a/camelot/io.py b/camelot/io.py
index 3fdac0d..bdbcc69 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -1,9 +1,11 @@
+# -*- coding: utf-8 -*-
+
 from .handlers import PDFHandler
 from .utils import validate_input, remove_extra
 
 
 def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
-    """Read PDF and return parsed data tables.
+    """Read PDF and return extracted tables.
 
     Note: kwargs annotated with ^ can only be used with flavor='stream'
     and kwargs annotated with * can only be used with flavor='lattice'.
@@ -11,53 +13,47 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
     Parameters
     ----------
     filepath : str
-        Path to pdf file.
+        Path to PDF file.
     pages : str, optional (default: '1')
-        Comma-separated page numbers to parse.
-        Example: 1,3,4 or 1,4-end
+        Comma-separated page numbers.
+        Example: 1,3,4 or 1,4-end.
     flavor : str (default: 'lattice')
         The parsing method to use ('lattice' or 'stream').
         Lattice is used by default.
     table_area : list, optional (default: None)
-        List of table areas to process as strings of the form
-        x1,y1,x2,y2 where (x1, y1) -> left-top and
-        (x2, y2) -> right-bottom in pdf coordinate space.
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
     columns^ : list, optional (default: None)
-        List of column x-coordinates as strings where the coordinates
+        List of column x-coordinate strings where the coordinates
         are comma-separated.
     split_text : bool, optional (default: False)
-        Whether or not to split a text line if it spans across
-        multiple cells.
+        Split text that spans across multiple cells.
     flag_size : bool, optional (default: False)
-        Whether or not to highlight a substring using <s></s>
-        if its size is different from rest of the string. (Useful for
-        super and subscripts)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
     row_close_tol^ : int, optional (default: 2)
-        Rows will be formed by combining text vertically
-        within this tolerance.
+        Tolerance parameter used to combine text vertically,
+        to generate rows.
     col_close_tol^ : int, optional (default: 0)
-        Columns will be formed by combining text horizontally
-        within this tolerance.
+        Tolerance parameter used to combine text horizontally,
+        to generate columns.
     process_background* : bool, optional (default: False)
-        Whether or not to process lines that are in background.
+        Process background lines.
     line_size_scaling* : int, optional (default: 15)
-        Factor by which the page dimensions will be divided to get
-        smallest length of lines that should be detected.
-
-        The larger this value, smaller the detected lines. Making it
-        too large will lead to text being detected as lines.
+        Line size scaling factor. The larger the value the smaller
+        the detected lines. Making it very large will lead to text
+        being detected as lines.
     copy_text* : list, optional (default: None)
         {'h', 'v'}
-        Select one or more strings from above and pass them as a list
-        to specify the direction in which text should be copied over
-        when a cell spans multiple rows or columns.
+        Direction in which text in a spanning cell will be copied
+        over.
     shift_text* : list, optional (default: ['l', 't'])
         {'l', 'r', 't', 'b'}
-        Select one or more strings from above and pass them as a list
-        to specify where the text in a spanning cell should flow.
+        Direction in which text in a spanning cell will flow.
     line_close_tol* : int, optional (default: 2)
-        Tolerance parameter used to merge vertical and horizontal
-        detected lines which lie close to each other.
+        Tolerance parameter used to merge close vertical and horizontal
+        lines.
     joint_close_tol* : int, optional (default: 2)
         Tolerance parameter used to decide whether the detected lines
         and points lie close to each other.
@@ -76,7 +72,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', **kwargs):
 
         For more information, refer `OpenCV's dilate `_.
     margins : tuple
-        PDFMiner margins. (char_margin, line_margin, word_margin)
+        PDFMiner char_margin, line_margin and word_margin.
 
         For more information, refer `PDFMiner docs `_.
 
diff --git a/camelot/parsers/__init__.py b/camelot/parsers/__init__.py
index e046b46..9366b78 100644
--- a/camelot/parsers/__init__.py
+++ b/camelot/parsers/__init__.py
@@ -1,2 +1,4 @@
+# -*- coding: utf-8 -*-
+
 from .stream import Stream
 from .lattice import Lattice
\ No newline at end of file
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index 5035966..bd3de99 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import os
 
 from ..utils import get_page_layout, get_text_objects
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index c4649e8..5219bc8 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 from __future__ import division
 import os
 import copy
@@ -21,41 +23,35 @@ logger = setup_logging(__name__)
 
 class Lattice(BaseParser):
     """Lattice method of parsing looks for lines between text
-    to parse table.
+    to parse the table.
 
     Parameters
     ----------
     table_area : list, optional (default: None)
-        List of table areas to analyze as strings of the form
-        x1,y1,x2,y2 where (x1, y1) -> left-top and
-        (x2, y2) -> right-bottom in pdf coordinate space.
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
     process_background : bool, optional (default: False)
-        Whether or not to process lines that are in background.
+        Process background lines.
     line_size_scaling : int, optional (default: 15)
-        Factor by which the page dimensions will be divided to get
-        smallest length of lines that should be detected.
-
-        The larger this value, smaller the detected lines. Making it
-        too large will lead to text being detected as lines.
+        Line size scaling factor. The larger the value the smaller
+        the detected lines. Making it very large will lead to text
+        being detected as lines.
     copy_text : list, optional (default: None)
         {'h', 'v'}
-        Select one or more strings from above and pass them as a list
-        to specify the direction in which text should be copied over
-        when a cell spans multiple rows or columns.
+        Direction in which text in a spanning cell will be copied
+        over.
     shift_text : list, optional (default: ['l', 't'])
         {'l', 'r', 't', 'b'}
-        Select one or more strings from above and pass them as a list
-        to specify where the text in a spanning cell should flow.
+        Direction in which text in a spanning cell will flow.
     split_text : bool, optional (default: False)
-        Whether or not to split a text line if it spans across
-        multiple cells.
+        Split text that spans across multiple cells.
     flag_size : bool, optional (default: False)
-        Whether or not to highlight a substring using <s></s>
-        if its size is different from rest of the string. (Useful for
-        super and subscripts)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
     line_close_tol : int, optional (default: 2)
-        Tolerance parameter used to merge vertical and horizontal
-        detected lines which lie close to each other.
+        Tolerance parameter used to merge close vertical and horizontal
+        lines.
     joint_close_tol : int, optional (default: 2)
         Tolerance parameter used to decide whether the detected lines
         and points lie close to each other.
@@ -74,7 +70,7 @@ class Lattice(BaseParser):
 
         For more information, refer `OpenCV's dilate `_.
     margins : tuple
-        PDFMiner margins. (char_margin, line_margin, word_margin)
+        PDFMiner char_margin, line_margin and word_margin.
 
         For more information, refer `PDFMiner docs `_.
 
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 5a05dba..aa3c461 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 from __future__ import division
 import os
 import logging
@@ -16,7 +18,7 @@ logger = setup_logging(__name__)
 
 class Stream(BaseParser):
     """Stream method of parsing looks for spaces between text
-    to parse table.
+    to parse the table.
 
     If you want to specify columns when specifying multiple table
     areas, make sure that the length of both lists are equal.
@@ -24,27 +26,25 @@ class Stream(BaseParser):
     Parameters
     ----------
     table_area : list, optional (default: None)
-        List of table areas to analyze as strings of the form
-        x1,y1,x2,y2 where (x1, y1) -> left-top and
-        (x2, y2) -> right-bottom in pdf coordinate space.
+        List of table area strings of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
     columns : list, optional (default: None)
-        List of column x-coordinates as strings where the coordinates
+        List of column x-coordinate strings where the coordinates
         are comma-separated.
     split_text : bool, optional (default: False)
-        Whether or not to split a text line if it spans across
-        multiple cells.
+        Split text that spans across multiple cells.
     flag_size : bool, optional (default: False)
-        Whether or not to highlight a substring using <s></s>
-        if its size is different from rest of the string. (Useful for
-        super and subscripts)
+        Flag text based on font size. Useful to detect
+        super/subscripts. Adds <s></s> around flagged text.
     row_close_tol : int, optional (default: 2)
-        Rows will be formed by combining text vertically
-        within this tolerance.
+        Tolerance parameter used to combine text vertically,
+        to generate rows.
     col_close_tol : int, optional (default: 0)
-        Columns will be formed by combining text horizontally
-        within this tolerance.
+        Tolerance parameter used to combine text horizontally,
+        to generate columns.
     margins : tuple, optional (default: (1.0, 0.5, 0.1))
-        PDFMiner margins. (char_margin, line_margin, word_margin)
+        PDFMiner char_margin, line_margin and word_margin.
 
         For more information, refer `PDFMiner docs `_.
 
diff --git a/setup.py b/setup.py
index 00d6e8f..439e402 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import os
 from setuptools import find_packages
 from pkg_resources import parse_version
@@ -8,16 +10,8 @@ about = {}
 with open(os.path.join(here, 'camelot', '__version__.py'), 'r') as f:
     exec(f.read(), about)
 
-# TODO: Move these to __version__.py
-NAME = 'camelot-py'
-VERSION = about['__version__']
-DESCRIPTION = 'PDF Table Parsing for Humans'
-with open('README.md') as f:
-    LONG_DESCRIPTION = f.read()
-URL = 'https://github.com/socialcopsdev/camelot'
-AUTHOR = 'Vinayak Mehta'
-AUTHOR_EMAIL = 'vmehta94@gmail.com'
-LICENSE = 'MIT License'
+with open('README.md', 'r') as f:
+    readme = f.read()
 
 
 def setup_package():
@@ -31,14 +25,14 @@ def setup_package():
         for line in f:
             dev_reqs.append(line.strip())
 
-    metadata = dict(name=NAME,
-                    version=VERSION,
-                    description=DESCRIPTION,
-                    long_description=LONG_DESCRIPTION,
-                    url=URL,
-                    author=AUTHOR,
-                    author_email=AUTHOR_EMAIL,
-                    license=LICENSE,
+    metadata = dict(name=about['__title__'],
+                    version=about['__version__'],
+                    description=about['__description__'],
+                    long_description=readme,
+                    url=about['__url__'],
+                    author=about['__author__'],
+                    author_email=about['__author_email__'],
+                    license=about['__license__'],
                     packages=find_packages(exclude=('tests',)),
                     install_requires=reqs,
                     extras_require={
diff --git a/tests/test_common.py b/tests/test_common.py
index 065a9e2..10b852c 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import os
 
 import pandas as pd