Merge branch 'master' of github.com:socialcopsdev/camelot into replace-gs-c-api

2019-01-05 11:22:38 +05:30 · 2019-01-05 11:22:38 +05:30 · ab5391c76f
parent cd6db09248 73498a9d67
commit ab5391c76f
21 changed files with 375 additions and 112 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -6,6 +6,18 @@ master
 **Improvements**
 * [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
    * You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
    * Kwarg `line_size_scaling` is now called `line_scale`.
 * [#212](https://github.com/socialcopsdev/camelot/issues/212) Add support to export as sqlite database. [#244](https://github.com/socialcopsdev/camelot/pull/244) by Vinayak Mehta.
 * [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
 0.6.0 (2018-12-24)
 ------------------
 **Improvements**
 * [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support to read from url. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
 * [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
    * `strip_text`: To define characters that should be stripped from each string.
    * `edge_tol`: Tolerance parameter for extending textedges vertically.
--- a/README.md
+++ b/README.md
@ -21,7 +21,7 @@
 >>> tables = camelot.read_pdf('foo.pdf')
 >>> tables
 &lt;TableList n=1&gt;
->>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html
+>>> tables.export('foo.csv', f='csv', compress=True) # json, excel, html, sqlite
 >>> tables[0]
 &lt;Table shape=(7, 7)&gt;
 >>> tables[0].parsing_report
@ -31,7 +31,7 @@
    'order': 1,
    'page': 1
 }
->>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
+>>> tables[0].to_csv('foo.csv') # to_json, to_excel, to_html, to_sqlite
 >>> tables[0].df # get a pandas DataFrame!
 </pre>
@ -53,7 +53,7 @@ There's a [command-line interface](https://camelot-py.readthedocs.io/en/master/u
 - **You are in control.**: Unlike other libraries and tools which either give a nice output or fail miserably (with no in-between), Camelot gives you the power to tweak table extraction. (This is important since everything in the real world, including PDF table extraction, is fuzzy.)
 - *Bad* tables can be discarded based on **metrics** like accuracy and whitespace, without ever having to manually look at each table.
 - Each table is a **pandas DataFrame**, which seamlessly integrates into [ETL and data analysis workflows](https://gist.github.com/vinayak-mehta/e5949f7c2410a0e12f25d3682dc9e873).
- **Export** to multiple formats, including JSON, Excel and HTML.
+- **Export** to multiple formats, including JSON, Excel, HTML and SQLite.
 See [comparison with other PDF table extraction libraries and tools](https://github.com/socialcopsdev/camelot/wiki/Comparison-with-other-PDF-Table-Extraction-libraries-and-tools).
--- a/camelot/version.py
+++ b/camelot/version.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-VERSION = (0, 5, 0)
+VERSION = (0, 6, 0)
 PRERELEASE = None # alpha, beta or rc
 REVISION = None
--- a/camelot/cli.py
+++ b/camelot/cli.py
@ -32,11 +32,11 @@ pass_config = click.make_pass_decorator(Config)
@click.version_option(version=__version__)
@click.option('-q', '--quiet', is_flag=False, help='Suppress logs and warnings.')
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
-              ' Example: 1,3,4 or 1,4-end.')
+              ' Example: 1,3,4 or 1,4-end or all.')
@click.option('-pw', '--password', help='Password for decryption.')
@click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format',
-              type=click.Choice(['csv', 'json', 'excel', 'html']),
+              type=click.Choice(['csv', 'json', 'excel', 'html', 'sqlite']),
              help='Output file format.')
@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.')
@click.option('-split', '--split_text', is_flag=True,
@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):
@cli.command('lattice')
@click.option('-R', '--table_regions', default=[], multiple=True,
              help='Page regions to analyze. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
              help='Process background lines.')
-@click.option('-scale', '--line_size_scaling', default=15,
+@click.option('-scale', '--line_scale', default=15,
              help='Line size scaling factor. The larger the value,'
              ' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
    filepath = kwargs.pop('filepath')
    kwargs.update(conf)
    table_regions = list(kwargs['table_regions'])
    kwargs['table_regions'] = None if not table_regions else table_regions
    table_areas = list(kwargs['table_areas'])
    kwargs['table_areas'] = None if not table_areas else table_areas
    copy_text = list(kwargs['copy_text'])
@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):
@cli.command('stream')
@click.option('-R', '--table_regions', default=[], multiple=True,
              help='Page regions to analyze. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
              help='Table areas to process. Example: x1,y1,x2,y2'
              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
    filepath = kwargs.pop('filepath')
    kwargs.update(conf)
    table_regions = list(kwargs['table_regions'])
    kwargs['table_regions'] = None if not table_regions else table_regions
    table_areas = list(kwargs['table_areas'])
    kwargs['table_areas'] = None if not table_areas else table_areas
    columns = list(kwargs['columns'])
--- a/camelot/core.py
+++ b/camelot/core.py
@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import os
 import sqlite3
 import zipfile
 import tempfile
 from itertools import chain
@ -592,6 +593,28 @@ class Table(object):
        with open(path, 'w') as f:
            f.write(html_string)
    def to_sqlite(self, path, **kwargs):
        """Writes Table to sqlite database.

        The table is stored under the name ``page-<page>-table-<order>``.
        By default an existing table of that name is replaced and the
        DataFrame index is not written; both defaults can be overridden
        through kwargs.

        For kwargs, check :meth:`pandas.DataFrame.to_sql`.

        Parameters
        ----------
        path : str
            Output filepath.

        """
        kw = {
            'if_exists': 'replace',
            'index': False
        }
        kw.update(kwargs)
        table_name = 'page-{}-table-{}'.format(self.page, self.order)
        conn = sqlite3.connect(path)
        try:
            self.df.to_sql(table_name, conn, **kw)
            conn.commit()
        finally:
            # Close the connection even when the write fails so the
            # database file handle is never leaked.
            conn.close()
 class TableList(object):
    """Defines a list of camelot.core.Table objects. Each table can
@ -656,7 +679,7 @@ class TableList(object):
        path : str
            Output filepath.
        f : str
-            File format. Can be csv, json, excel and html.
+            File format. Can be csv, json, excel, html and sqlite.
        compress : bool
            Whether or not to add files to a ZIP archive.
@ -689,3 +712,11 @@ class TableList(object):
                zipname = os.path.join(os.path.dirname(path), root) + '.zip'
                with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
                    z.write(filepath, os.path.basename(filepath))
        elif f == 'sqlite':
            filepath = os.path.join(dirname, basename)
            for table in self._tables:
                table.to_sqlite(filepath)
            if compress:
                zipname = os.path.join(os.path.dirname(path), root) + '.zip'
                with zipfile.ZipFile(zipname, 'w', allowZip64=True) as z:
                    z.write(filepath, os.path.basename(filepath))
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
-                    get_rotation)
+                    get_rotation, is_url, download_url)
 class PDFHandler(object):
@ -18,20 +18,22 @@ class PDFHandler(object):
    Parameters
    ----------
-    filename : str
+    filepath : str
-        Path to PDF file.
+        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
-        Example: '1,3,4' or '1,4-end'.
+        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    """
-    def __init__(self, filename, pages='1', password=None):
+    def __init__(self, filepath, pages='1', password=None):
-        self.filename = filename
+        if is_url(filepath):
-        if not filename.lower().endswith('.pdf'):
+            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith('.pdf'):
            raise NotImplementedError("File format not supported")
-        self.pages = self._get_pages(self.filename, pages)
+        self.pages = self._get_pages(self.filepath, pages)
        if password is None:
            self.password = ''
        else:
@ -39,16 +41,16 @@ class PDFHandler(object):
            if sys.version_info[0] < 3:
                self.password = self.password.encode('ascii')
-    def _get_pages(self, filename, pages):
+    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.
        Parameters
        ----------
-        filename : str
+        filepath : str
-            Path to PDF file.
+            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
-            Example: 1,3,4 or 1,4-end.
+            Example: '1,3,4' or '1,4-end' or 'all'.
        Returns
        -------
@ -60,7 +62,7 @@ class PDFHandler(object):
        if pages == '1':
            page_numbers.append({'start': 1, 'end': 1})
        else:
-            infile = PdfFileReader(open(filename, 'rb'), strict=False)
+            infile = PdfFileReader(open(filepath, 'rb'), strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            if pages == 'all':
@ -79,20 +81,20 @@ class PDFHandler(object):
            P.extend(range(p['start'], p['end'] + 1))
        return sorted(set(P))
-    def _save_page(self, filename, page, temp):
+    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.
        Parameters
        ----------
-        filename : str
+        filepath : str
-            Path to PDF file.
+            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.
        """
-        with open(filename, 'rb') as fileobj:
+        with open(filepath, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
@ -105,10 +107,10 @@ class PDFHandler(object):
                outfile.write(f)
            layout, dim = get_page_layout(fpath)
            # fix rotated PDF
-            lttextlh = get_text_objects(layout, ltype="lh")
+            chars = get_text_objects(layout, ltype="char")
-            lttextlv = get_text_objects(layout, ltype="lv")
+            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
-            ltchar = get_text_objects(layout, ltype="char")
+            vertical_text = get_text_objects(layout, ltype="vertical_text")
-            rotation = get_rotation(lttextlh, lttextlv, ltchar)
+            rotation = get_rotation(chars, horizontal_text, vertical_text)
            if rotation != '':
                fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
                os.rename(fpath, fpath_new)
@ -150,7 +152,7 @@ class PDFHandler(object):
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
-                self._save_page(self.filename, p, tempdir)
+                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
                     for p in self.pages]
            parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -48,7 +48,8 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    return img, threshold
-def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
+def find_lines(threshold, regions=None, direction='horizontal',
               line_scale=15, iterations=0):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.
@ -56,9 +57,13 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
    regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in image coordinate space.
    direction : string, optional (default: 'horizontal')
        Specifies whether to find vertical or horizontal lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    lines = []
    if direction == 'vertical':
-        size = threshold.shape[0] // line_size_scaling
+        size = threshold.shape[0] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    elif direction == 'horizontal':
-        size = threshold.shape[1] // line_size_scaling
+        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
        raise ValueError("Specify direction as either 'vertical' or"
                         " 'horizontal'")
    if regions is not None:
        region_mask = np.zeros(threshold.shape)
        for region in regions:
            x, y, w, h = region
            region_mask[y : y + h, x : x + w] = 1
        threshold = np.multiply(threshold, region_mask)
    threshold = cv2.erode(threshold, el)
    threshold = cv2.dilate(threshold, el)
    dmask = cv2.dilate(threshold, el, iterations=iterations)
    try:
        _, contours, _ = cv2.findContours(
-            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # for opencv backward compatibility
        contours, _ = cv2.findContours(
-            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
    return dmask, lines
-def find_table_contours(vertical, horizontal):
+def find_contours(vertical, horizontal):
    """Finds table boundaries using OpenCV's findContours.
    Parameters
@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):
    try:
        __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # for opencv backward compatibility
        contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # sort in reverse based on contour area and use first 10 contours
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
    cont = []
@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
    return cont
-def find_table_joints(contours, vertical, horizontal):
+def find_joints(contours, vertical, horizontal):
    """Finds joints/intersections present inside each table boundary.
    Parameters
@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
        and (x2, y2) -> rt in image coordinate space.
    """
-    joints = np.bitwise_and(vertical, horizontal)
+    joints = np.multiply(vertical, horizontal)
    tables = {}
    for c in contours:
        x, y, w, h = c
        roi = joints[y : y + h, x : x + w]
        try:
            __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        except ValueError:
            # for opencv backward compatibility
            jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(jc) <= 4:  # remove contours with less than 4 joints
            continue
        joint_coords = []
--- a/camelot/io.py
+++ b/camelot/io.py
@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 import warnings
 from .handlers import PDFHandler
@ -15,10 +16,10 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
    Parameters
    ----------
    filepath : str
-        Path to PDF file.
+        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
-        Example: '1,3,4' or '1,4-end'.
+        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    flavor : str (default: 'lattice')
@ -51,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
        to generate columns.
    process_background* : bool, optional (default: False)
        Process background lines.
-    line_size_scaling* : int, optional (default: 15)
+    line_scale* : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -13,7 +13,8 @@ class BaseParser(object):
        self.layout_kwargs = layout_kwargs
        self.layout, self.dimensions = get_page_layout(
            filename, **layout_kwargs)
-        self.horizontal_text = get_text_objects(self.layout, ltype="lh")
+        self.images = get_text_objects(self.layout, ltype='image')
-        self.vertical_text = get_text_objects(self.layout, ltype="lv")
+        self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
        self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname, __ = os.path.splitext(self.filename)
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -19,7 +19,7 @@ from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox,
                     merge_close_lines, get_table_index, compute_accuracy,
                     compute_whitespace)
 from ..image_processing import (adaptive_threshold, find_lines,
-                                find_table_contours, find_table_joints)
+                                find_contours, find_joints)
 logger = logging.getLogger('camelot')
@ -31,13 +31,17 @@ class Lattice(BaseParser):
    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    process_background : bool, optional (default: False)
        Process background lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
@ -80,14 +84,15 @@ class Lattice(BaseParser):
        Resolution used for PDF to PNG conversion.
    """
-    def __init__(self, table_areas=None, process_background=False,
+    def __init__(self, table_regions=None, table_areas=None, process_background=False,
-                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
+                 line_scale=15, copy_text=None, shift_text=['l', 't'],
                 split_text=False, flag_size=False, strip_text='', line_tol=2,
                 joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
                 iterations=0, resolution=300, **kwargs):
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.process_background = process_background
-        self.line_size_scaling = line_size_scaling
+        self.line_scale = line_scale
        self.copy_text = copy_text
        self.shift_text = shift_text
        self.split_text = split_text
@ -189,9 +194,22 @@ class Lattice(BaseParser):
        null.close()
    def _generate_table_bbox(self):
        def scale_areas(areas):
            scaled_areas = []
            for area in areas:
                x1, y1, x2, y2 = area.split(",")
                x1 = float(x1)
                y1 = float(y1)
                x2 = float(x2)
                y2 = float(y2)
                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            return scaled_areas
        self.image, self.threshold = adaptive_threshold(
            self.imagename, process_background=self.process_background,
            blocksize=self.threshold_blocksize, c=self.threshold_constant)
        image_width = self.image.shape[1]
        image_height = self.image.shape[0]
        image_width_scaler = image_width / float(self.pdf_width)
@ -201,27 +219,30 @@ class Lattice(BaseParser):
        image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
        pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
-        vertical_mask, vertical_segments = find_lines(
+        if self.table_areas is None:
-            self.threshold, direction='vertical',
+            regions = None
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
+            if self.table_regions is not None:
-        horizontal_mask, horizontal_segments = find_lines(
+                regions = scale_areas(self.table_regions)
            self.threshold, direction='horizontal',
            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
-        if self.table_areas is not None:
+            vertical_mask, vertical_segments = find_lines(
-            areas = []
+                self.threshold, regions=regions, direction='vertical',
-            for area in self.table_areas:
+                line_scale=self.line_scale, iterations=self.iterations)
-                x1, y1, x2, y2 = area.split(",")
+            horizontal_mask, horizontal_segments = find_lines(
-                x1 = float(x1)
+                self.threshold, regions=regions, direction='horizontal',
-                y1 = float(y1)
+                line_scale=self.line_scale, iterations=self.iterations)
-                x2 = float(x2)
+
-                y2 = float(y2)
+            contours = find_contours(vertical_mask, horizontal_mask)
-                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
+            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
        else:
-            contours = find_table_contours(vertical_mask, horizontal_mask)
+            vertical_mask, vertical_segments = find_lines(
-            table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
+                self.threshold, direction='vertical', line_scale=self.line_scale,
                iterations=self.iterations)
            horizontal_mask, horizontal_segments = find_lines(
                self.threshold, direction='horizontal', line_scale=self.line_scale,
                iterations=self.iterations)
            areas = scale_areas(self.table_areas)
            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
        self.table_bbox_unscaled = copy.deepcopy(table_bbox)
@ -318,8 +339,12 @@ class Lattice(BaseParser):
            logger.info('Processing {}'.format(os.path.basename(self.rootname)))
        if not self.horizontal_text:
-            warnings.warn("No tables found on {}".format(
+            if self.images:
-                os.path.basename(self.rootname)))
+                warnings.warn('{} is image-based, camelot only works on'
                              ' text-based pages.'.format(os.path.basename(self.rootname)))
            else:
                warnings.warn('No tables found on {}'.format(
                    os.path.basename(self.rootname)))
            return []
        self._generate_image()
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -26,6 +26,10 @@ class Stream(BaseParser):
    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@ -51,9 +55,10 @@ class Stream(BaseParser):
        to generate columns.
    """
-    def __init__(self, table_areas=None, columns=None, split_text=False,
+    def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
                 flag_size=False, strip_text='', edge_tol=50, row_tol=2,
                 column_tol=0, **kwargs):
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.columns = columns
        self._validate_columns()
@ -275,7 +280,18 @@ class Stream(BaseParser):
    def _generate_table_bbox(self):
        self.textedges = []
-        if self.table_areas is not None:
+        if self.table_areas is None:
            hor_text = self.horizontal_text
            if self.table_regions is not None:
                # filter horizontal text
                hor_text = []
                for region in self.table_regions:
                    x1, y1, x2, y2 = region
                    region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
                    hor_text.extend(region_text)
            # find tables based on nurminen's detection algorithm
            table_bbox = self._nurminen_table_detection(hor_text)
        else:
            table_bbox = {}
            for area in self.table_areas:
                x1, y1, x2, y2 = area.split(",")
@ -284,9 +300,6 @@ class Stream(BaseParser):
                x2 = float(x2)
                y2 = float(y2)
                table_bbox[(x1, y2, x2, y1)] = None
        else:
            # find tables based on nurminen's detection algorithm
            table_bbox = self._nurminen_table_detection(self.horizontal_text)
        self.table_bbox = table_bbox
    def _generate_columns_and_rows(self, table_idx, tk):
@ -395,8 +408,12 @@ class Stream(BaseParser):
            logger.info('Processing {}'.format(os.path.basename(self.rootname)))
        if not self.horizontal_text:
-            warnings.warn("No tables found on {}".format(
+            if self.images:
-                os.path.basename(self.rootname)))
+                warnings.warn('{} is image-based, camelot only works on'
                              ' text-based pages.'.format(os.path.basename(self.rootname)))
            else:
                warnings.warn('No tables found on {}'.format(
                    os.path.basename(self.rootname)))
            return []
        self._generate_table_bbox()
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1,12 +1,17 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 import os
 import sys
 import random
 import shutil
 import string
 import tempfile
 import warnings
 from itertools import groupby
 from operator import itemgetter
 import numpy as np
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@ -15,7 +20,78 @@ from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
-                             LTTextLineVertical)
+                             LTTextLineVertical, LTImage)
 PY3 = sys.version_info[0] >= 3
 if PY3:
    from urllib.request import urlopen
    from urllib.parse import urlparse as parse_url
    from urllib.parse import uses_relative, uses_netloc, uses_params
 else:
    from urllib2 import urlopen
    from urlparse import urlparse as parse_url
    from urlparse import uses_relative, uses_netloc, uses_params
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
 # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
 def is_url(url):
    """Tell whether the given string carries a recognised URL scheme.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        True when the parsed scheme is one of the valid protocols,
        False otherwise (including when parsing fails).

    """
    try:
        scheme = parse_url(url).scheme
        return scheme in _VALID_URLS
    except Exception:
        # Anything that cannot be parsed is simply not a URL.
        return False
 def random_string(length):
    """Return a random string of ASCII digits and letters.

    Parameters
    ----------
    length : int
        Number of characters to generate. Zero or a negative value
        yields an empty string.

    Returns
    -------
    ret : str
        String of ``length`` characters drawn from digits, lowercase
        and uppercase ASCII letters.

    """
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
    # range() is empty for non-positive lengths, so a negative length can
    # no longer make the generation loop run forever.
    return ''.join(random.choice(alphabet) for _ in range(length))
 def download_url(url):
    """Download the PDF at the specified URL into a temporary file.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    filepath : str or unicode
        Temporary filepath ending in ``.pdf``. The file is not removed
        automatically.

    Raises
    ------
    NotImplementedError
        If the content type of the response is not ``application/pdf``.

    """
    response = urlopen(url)
    try:
        if PY3:
            content_type = response.info().get_content_type()
        else:
            # The raw Python 2 header may carry parameters such as
            # "; charset=..." which must not take part in the comparison.
            header = response.info().getheader('Content-Type') or ''
            content_type = header.split(';')[0].strip().lower()
        if content_type != 'application/pdf':
            raise NotImplementedError("File format not supported")
        content = response.read()
    finally:
        # Always release the connection, including when the file is rejected.
        response.close()
    # Create the file only after validation so that a rejected or failed
    # download leaves nothing behind on disk.
    with tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False) as f:
        f.write(content)
    return f.name
 stream_kwargs = [
@ -25,7 +101,7 @@ stream_kwargs = [
 ]
 lattice_kwargs = [
    'process_background',
-    'line_size_scaling',
+    'line_scale',
    'copy_text',
    'shift_text',
    'line_tol',
@ -194,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
    return tables_new, v_segments_new, h_segments_new
-def get_rotation(lttextlh, lttextlv, ltchar):
+def get_rotation(chars, horizontal_text, vertical_text):
    """Detects if text in table is rotated or not using the current
    transformation matrix (CTM) and returns its orientation.
    Parameters
    ----------
-    lttextlh : list
+    horizontal_text : list
        List of PDFMiner LTTextLineHorizontal objects.
-    lttextlv : list
+    vertical_text : list
        List of PDFMiner LTTextLineVertical objects.
    chars : list
        List of PDFMiner LTChar objects.
@ -216,11 +292,11 @@ def get_rotation(lttextlh, lttextlv, ltchar):
    """
    rotation = ''
-    hlen = len([t for t in lttextlh if t.get_text().strip()])
+    hlen = len([t for t in horizontal_text if t.get_text().strip()])
-    vlen = len([t for t in lttextlv if t.get_text().strip()])
+    vlen = len([t for t in vertical_text if t.get_text().strip()])
    if hlen < vlen:
-        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
+        clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in chars)
-        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
+        anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in chars)
        rotation = 'anticlockwise' if clockwise < anticlockwise else 'clockwise'
    return rotation
@ -263,7 +339,7 @@ def text_in_bbox(bbox, text):
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
-        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.
@ -637,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
        List of PDFMiner text objects.
    """
-    if ltype == "char":
+    if ltype == 'char':
        LTObject = LTChar
-    elif ltype == "lh":
+    elif ltype == 'image':
        LTObject = LTImage
    elif ltype == 'horizontal_text':
        LTObject = LTTextLineHorizontal
-    elif ltype == "lv":
+    elif ltype == 'vertical_text':
        LTObject = LTTextLineVertical
    if t is None:
        t = []
--- a/docs/_static/csv/table_regions.csv
+++ b/docs/_static/csv/table_regions.csv
@ -0,0 +1,4 @@
 "Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione"
 "18-75","1,00%"
 "76-80","0,50%"
 "81 in poi","0,10%"
--- a/docs/_static/pdf/table_regions.pdf
+++ b/docs/_static/pdf/table_regions.pdf
--- a/docs/user/advanced.rst
+++ b/docs/user/advanced.rst
@ -206,12 +206,10 @@ You can also visualize the textedges found on a page by specifying ``kind='texte
 Specify table areas
 -------------------
-In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
+In cases such as `these <../_static/pdf/table_areas.pdf>`__, it can be useful to specify exact table boundaries. You can plot the text on this page and note the top left and bottom right coordinates of the table.
 Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.
 .. _for now: https://github.com/socialcopsdev/camelot/issues/102
 ::
    >>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
@ -226,6 +224,27 @@ Table areas that you want Camelot to analyze can be passed as a list of comma-se
 .. csv-table::
  :file: ../_static/csv/table_areas.csv
 Specify table regions
 ---------------------
 However, there may be cases like `[1] <../_static/pdf/table_regions.pdf>`__ and `[2] <https://github.com/socialcopsdev/camelot/blob/master/tests/files/tableception.pdf>`__, where the table does not lie at the same exact coordinates every time but somewhere within an approximate region.
 You can use the ``table_regions`` keyword argument to :meth:`read_pdf() <camelot.read_pdf>` to solve for such cases. When ``table_regions`` is specified, Camelot will only analyze the specified regions to look for tables.
 ::
    >>> tables = camelot.read_pdf('table_regions.pdf', table_regions=['170,370,560,270'])
    >>> tables[0].df
 .. tip::
    Here's how you can do the same with the :ref:`command-line interface <cli>`.
    ::
        $ camelot lattice -R 170,370,560,270 table_regions.pdf
 .. csv-table::
  :file: ../_static/csv/table_regions.csv
 Specify column separators
 -------------------------
@ -434,11 +453,11 @@ You can pass ``row_tol=<+int>`` to group the rows closer together, as shown belo
 Detect short lines
 ------------------
-There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions with a scaling factor called ``line_size_scaling``. By default, its value is 15.
+There might be cases while using :ref:`Lattice <lattice>` when smaller lines don't get detected. The size of the smallest line that gets detected is calculated by dividing the PDF page's dimensions by a scaling factor called ``line_scale``. By default, its value is 15.
-As you can guess, the larger the ``line_size_scaling``, the smaller the size of lines getting detected.
+As you can guess, the larger the ``line_scale``, the smaller the size of lines getting detected.
-.. warning:: Making ``line_size_scaling`` very large (>150) will lead to text getting detected as lines.
+.. warning:: Making ``line_scale`` very large (>150) will lead to text getting detected as lines.
 Here's a `PDF <../_static/pdf/short_lines.pdf>`__ where small lines separating the headers don't get detected with the default value of 15.
@ -458,11 +477,11 @@ Let's plot the table for this PDF.
    :alt: A plot of the PDF table with short lines
    :align: left
-Clearly, the smaller lines separating the headers, couldn't be detected. Let's try with ``line_size_scaling=40``, and plot the table again.
+Clearly, the smaller lines separating the headers couldn't be detected. Let's try with ``line_scale=40``, and plot the table again.
 ::
-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40)
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40)
    >>> camelot.plot(tables[0], kind='grid')
    >>> plt.show()
@ -511,7 +530,7 @@ We'll use the `PDF <../_static/pdf/short_lines.pdf>`__ from the previous example
 ::
-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=[''])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=[''])
    >>> tables[0].df
 .. csv-table::
@ -532,7 +551,7 @@ No surprises there — it did remain in place (observe the strings "2400" and "A
 ::
-    >>> tables = camelot.read_pdf('short_lines.pdf', line_size_scaling=40, shift_text=['r', 'b'])
+    >>> tables = camelot.read_pdf('short_lines.pdf', line_scale=40, shift_text=['r', 'b'])
    >>> tables[0].df
 .. tip::
--- a/docs/user/quickstart.rst
+++ b/docs/user/quickstart.rst
@ -14,7 +14,7 @@ Begin by importing the Camelot module::
    >>> import camelot
-Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here. To do that, we will set the ``mesh`` keyword argument to ``True``.
+Now, let's try to read a PDF. (You can check out the PDF used in this example `here`_.) Since the PDF has a table with clearly demarcated lines, we will use the :ref:`Lattice <lattice>` method here.
 .. note:: :ref:`Lattice <lattice>` is used by default. You can use :ref:`Stream <stream>` with ``flavor='stream'``.
@ -56,7 +56,7 @@ Woah! The accuracy is top-notch and there is less whitespace, which means the ta
 .. csv-table::
  :file: ../_static/csv/foo.csv
-Looks good! You can now export the table as a CSV file using its :meth:`to_csv() <camelot.core.Table.to_csv>` method. Alternatively you can use :meth:`to_json() <camelot.core.Table.to_json>`, :meth:`to_excel() <camelot.core.Table.to_excel>` or :meth:`to_html() <camelot.core.Table.to_html>` methods to export the table as JSON, Excel and HTML files respectively.
+Looks good! You can now export the table as a CSV file using its :meth:`to_csv() <camelot.core.Table.to_csv>` method. Alternatively you can use :meth:`to_json() <camelot.core.Table.to_json>`, :meth:`to_excel() <camelot.core.Table.to_excel>`, :meth:`to_html() <camelot.core.Table.to_html>` or :meth:`to_sqlite() <camelot.core.Table.to_sqlite>` methods to export the table as JSON, Excel, HTML files or a sqlite database respectively.
 ::
@ -76,7 +76,7 @@ You can also export all tables at once, using the :class:`tables <camelot.core.T
        $ camelot --format csv --output foo.csv lattice foo.pdf
-This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'`` or ``f='html'``.
+This will export all tables as CSV files at the path specified. Alternatively, you can use ``f='json'``, ``f='excel'``, ``f='html'`` or ``f='sqlite'``.
 .. note:: The :meth:`export() <camelot.core.TableList.export>` method exports files with a ``page-*-table-*`` suffix. In the example above, the single table in the list will be exported to ``foo-page-1-table-1.csv``. If the list contains multiple tables, multiple CSV files will be created. To avoid filling up your path with multiple files, you can use ``compress=True``, which will create a single ZIP file at your path with all the CSV files.
--- a/tests/data.py
+++ b/tests/data.py
@ -427,6 +427,13 @@ data_lattice_two_tables_2 = [
    ["Pooled", "23889", "47.7", "1.5", "9.9", "19.9", "17.8", "3.3"]
 ]
 data_lattice_table_regions = [
    ['Età dell’Assicurato \nall’epoca del decesso', 'Misura % di \nmaggiorazione'],
    ['18-75', '1,00%'],
    ['76-80', '0,50%'],
    ['81 in poi', '0,10%']
 ]
 data_lattice_table_areas = [
    ["", "", "", "", "", "", "", "", ""],
    ["State", "n", "Literacy Status", "", "", "", "", "", ""],
--- a/tests/files/image.pdf
+++ b/tests/files/image.pdf
--- a/tests/files/table_region.pdf
+++ b/tests/files/table_region.pdf
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -159,6 +159,14 @@ def test_lattice_two_tables():
    assert df2.equals(tables[1].df)
 def test_lattice_table_regions():
    """Lattice restricted to one table region yields the expected frame."""
    pdf_path = os.path.join(testdir, "table_region.pdf")
    expected = pd.DataFrame(data_lattice_table_regions)
    extracted = camelot.read_pdf(pdf_path, table_regions=["170,370,560,270"])
    assert expected.equals(extracted[0].df)
 def test_lattice_table_areas():
    df = pd.DataFrame(data_lattice_table_areas)
@ -179,7 +187,7 @@ def test_lattice_copy_text():
    df = pd.DataFrame(data_lattice_copy_text)
    filename = os.path.join(testdir, "row_span_1.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
+    tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
    assert df.equals(tables[0].df)
@ -189,13 +197,13 @@ def test_lattice_shift_text():
    df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
    filename = os.path.join(testdir, "column_span_2.pdf")
-    tables = camelot.read_pdf(filename, line_size_scaling=40)
+    tables = camelot.read_pdf(filename, line_scale=40)
    assert df_lt.equals(tables[0].df)
-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=[''])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[''])
    assert df_disable.equals(tables[0].df)
-    tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
+    tables = camelot.read_pdf(filename, line_scale=40, shift_text=['r', 'b'])
    assert df_rb.equals(tables[0].df)
@ -207,6 +215,32 @@ def test_repr():
    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
 def test_pages():
    """The default, '1-end' and 'all' page selections give the same table."""
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    # First pass omits ``pages`` entirely, then the two explicit spellings.
    for page_kwargs in ({}, {'pages': '1-end'}, {'pages': 'all'}):
        tables = camelot.read_pdf(url, **page_kwargs)
        assert repr(tables) == "<TableList n=1>"
        assert repr(tables[0]) == "<Table shape=(7, 7)>"
        assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
 def test_url():
    """Reading a PDF directly from a URL yields the expected single table."""
    tables = camelot.read_pdf(
        "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf")
    assert repr(tables) == "<TableList n=1>"
    first_table = tables[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
 def test_arabic():
    df = pd.DataFrame(data_arabic)
--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@ -41,6 +41,15 @@ def test_stream_equal_length():
            table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
 def test_image_warning():
    """Reading the image.pdf fixture must emit the image-based-page warning."""
    filename = os.path.join(testdir, 'image.pdf')
    with warnings.catch_warnings():
        # Promote warnings to exceptions so pytest.raises can capture them.
        warnings.simplefilter('error')
        with pytest.raises(UserWarning) as e:
            # The return value is irrelevant; only the raised warning matters.
            camelot.read_pdf(filename)
        assert str(e.value) == 'page-1 is image-based, camelot only works on text-based pages.'
 def test_no_tables_found():
    filename = os.path.join(testdir, 'blank.pdf')
    with warnings.catch_warnings():