Merge pull request #236 from socialcopsdev/read_url

[MRG] Add support to read from url
2018-12-24 13:29:41 +05:30 · 2018-12-24 13:29:41 +05:30 · 0b85c77425
parent 0198f5527c 62ed4753cd
commit 0b85c77425
5 changed files with 105 additions and 18 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -6,6 +6,7 @@ master
 **Improvements**
 * [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support to read from url. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
 * [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
    * `strip_text`: To define characters that should be stripped from each string.
    * `edge_tol`: Tolerance parameter for extending textedges vertically.
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
-                    get_rotation)
+                    get_rotation, is_url, download_url)
 class PDFHandler(object):
@ -18,8 +18,8 @@ class PDFHandler(object):
    Parameters
    ----------
-    filename : str
+    filepath : str
-        Path to PDF file.
+        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end'.
@ -27,11 +27,13 @@ class PDFHandler(object):
        Password for decryption.
    """
-    def __init__(self, filename, pages='1', password=None):
+    def __init__(self, filepath, pages='1', password=None):
-        self.filename = filename
+        if is_url(filepath):
-        if not filename.lower().endswith('.pdf'):
+            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith('.pdf'):
            raise NotImplementedError("File format not supported")
-        self.pages = self._get_pages(self.filename, pages)
+        self.pages = self._get_pages(self.filepath, pages)
        if password is None:
            self.password = ''
        else:
@ -39,13 +41,13 @@ class PDFHandler(object):
            if sys.version_info[0] < 3:
                self.password = self.password.encode('ascii')
-    def _get_pages(self, filename, pages):
+    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.
        Parameters
        ----------
-        filename : str
+        filepath : str
-            Path to PDF file.
+            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: 1,3,4 or 1,4-end.
@ -60,7 +62,7 @@ class PDFHandler(object):
        if pages == '1':
            page_numbers.append({'start': 1, 'end': 1})
        else:
-            infile = PdfFileReader(open(filename, 'rb'), strict=False)
+            infile = PdfFileReader(open(filepath, 'rb'), strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            if pages == 'all':
@ -79,20 +81,20 @@ class PDFHandler(object):
            P.extend(range(p['start'], p['end'] + 1))
        return sorted(set(P))
-    def _save_page(self, filename, page, temp):
+    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.
        Parameters
        ----------
-        filename : str
+        filepath : str
-            Path to PDF file.
+            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.
        """
-        with open(filename, 'rb') as fileobj:
+        with open(filepath, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
@ -150,7 +152,7 @@ class PDFHandler(object):
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
-                self._save_page(self.filename, p, tempdir)
+                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
                     for p in self.pages]
            parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
--- a/camelot/io.py
+++ b/camelot/io.py
@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
    Parameters
    ----------
    filepath : str
-        Path to PDF file.
+        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end'.
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1,12 +1,17 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 import os
 import sys
 import random
 import shutil
 import string
 import tempfile
 import warnings
 from itertools import groupby
 from operator import itemgetter
 import numpy as np
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@ -18,6 +23,77 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
                             LTTextLineVertical)
 PY3 = sys.version_info[0] >= 3
 # The URL helpers live in different modules on Python 2 and Python 3;
 # bind the same names on either major version.
 if not PY3:
    from urllib2 import urlopen
    from urlparse import urlparse as parse_url
    from urlparse import uses_relative, uses_netloc, uses_params
 else:
    from urllib.request import urlopen
    from urllib.parse import urlparse as parse_url
    from urllib.parse import uses_relative, uses_netloc, uses_params
 # Every scheme listed in the uses_* tables, minus the empty-string entry.
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params) - {''}
 # is_url is adapted from pandas: https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
 def is_url(url):
    """Tell whether a string carries a recognised URL scheme.
    Parameters
    ----------
    url : str or unicode
        Candidate location, e.g. a filesystem path or a web address.
    Returns
    -------
    isurl : bool
        True when the parsed scheme is one of the known URL schemes,
        False otherwise (including when the value cannot be parsed).
    """
    try:
        scheme = parse_url(url).scheme
        recognised = scheme in _VALID_URLS
    except Exception:
        # Best-effort probe: anything that cannot be parsed is not a URL.
        recognised = False
    return recognised
 def random_string(length):
    """Generate a random alphanumeric string.
    Parameters
    ----------
    length : int
        Number of characters to generate. Zero or a negative value
        yields an empty string.
    Returns
    -------
    ret : str
        ``length`` characters drawn from ASCII digits and lowercase and
        uppercase letters. Uses the ``random`` module, so the result is
        not suitable as a security token.
    """
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
    # range() is empty for non-positive lengths, so a negative value can
    # no longer loop forever the way a `while length:` countdown does.
    return ''.join(random.choice(alphabet) for _ in range(length))
 def download_url(url):
    """Download a PDF from the specified URL into a temporary file.
    Parameters
    ----------
    url : str or unicode
        Location of the PDF; anything ``urlopen`` can open.
    Returns
    -------
    filepath : str or unicode
        Path of the downloaded copy, ending in ``.pdf``. The file is
        created with ``delete=False`` and is not removed here.
    Raises
    ------
    NotImplementedError
        If the response's content type is not ``application/pdf``.
    """
    response = urlopen(url)
    try:
        headers = response.info()
        if hasattr(headers, 'get_content_type'):
            # Python 3 message API: lowercased, parameters already removed.
            content_type = headers.get_content_type()
        else:
            # Python 2 message API returns the raw header, which may be
            # missing or carry parameters such as "; charset=...".
            raw_type = headers.getheader('Content-Type') or ''
            content_type = raw_type.split(';')[0].strip().lower()
        # Validate before creating anything on disk so that a rejected
        # download leaves no stray temporary file behind.
        if content_type != 'application/pdf':
            raise NotImplementedError("File format not supported")
        # suffix= yields a unique name that already ends in '.pdf', so no
        # rename to a short guessable name (which could collide with or
        # overwrite another file in the temp directory) is needed.
        f = tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False)
        try:
            with f:
                # Stream in chunks instead of holding the whole PDF in memory.
                shutil.copyfileobj(response, f)
        except Exception:
            # Do not leave a partial download behind.
            os.remove(f.name)
            raise
    finally:
        # Always release the connection / file handle held by urlopen.
        response.close()
    return f.name
 stream_kwargs = [
    'columns',
    'row_tol',
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -207,6 +207,14 @@ def test_repr():
    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
 def test_url():
    """Reading a PDF straight from a URL yields the expected table."""
    remote_pdf = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    tables = camelot.read_pdf(remote_pdf)
    assert repr(tables) == "<TableList n=1>"
    first_table = tables[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
 def test_arabic():
    df = pd.DataFrame(data_arabic)