Merge pull request #236 from socialcopsdev/read_url

[MRG] Add support to read from url
2018-12-24 13:29:41 +05:30
parent 0198f5527c 62ed4753cd
commit 0b85c77425
5 changed files with 105 additions and 18 deletions
@@ -6,6 +6,7 @@ master

 **Improvements**

+* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support for reading PDFs from a URL. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
 * [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
    * `strip_text`: To define characters that should be stripped from each string.
    * `edge_tol`: Tolerance parameter for extending textedges vertically.
@@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
-                    get_rotation)
+                    get_rotation, is_url, download_url)


 class PDFHandler(object):
@@ -18,8 +18,8 @@ class PDFHandler(object):

    Parameters
    ----------
-    filename : str
-        Path to PDF file.
+    filepath : str
+        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end'.
@@ -27,11 +27,13 @@ class PDFHandler(object):
        Password for decryption.

    """
-    def __init__(self, filename, pages='1', password=None):
-        self.filename = filename
-        if not filename.lower().endswith('.pdf'):
+    def __init__(self, filepath, pages='1', password=None):
+        if is_url(filepath):
+            filepath = download_url(filepath)
+        self.filepath = filepath
+        if not filepath.lower().endswith('.pdf'):
            raise NotImplementedError("File format not supported")
-        self.pages = self._get_pages(self.filename, pages)
+        self.pages = self._get_pages(self.filepath, pages)
        if password is None:
            self.password = ''
        else:
@@ -39,13 +41,13 @@ class PDFHandler(object):
            if sys.version_info[0] < 3:
                self.password = self.password.encode('ascii')

-    def _get_pages(self, filename, pages):
+    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
-        filename : str
-            Path to PDF file.
+        filepath : str
+            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: 1,3,4 or 1,4-end.
@@ -60,7 +62,7 @@ class PDFHandler(object):
        if pages == '1':
            page_numbers.append({'start': 1, 'end': 1})
        else:
-            infile = PdfFileReader(open(filename, 'rb'), strict=False)
+            infile = PdfFileReader(open(filepath, 'rb'), strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
            if pages == 'all':
@@ -79,20 +81,20 @@ class PDFHandler(object):
            P.extend(range(p['start'], p['end'] + 1))
        return sorted(set(P))

-    def _save_page(self, filename, page, temp):
+    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        Parameters
        ----------
-        filename : str
-            Path to PDF file.
+        filepath : str
+            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
-        with open(filename, 'rb') as fileobj:
+        with open(filepath, 'rb') as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
@@ -150,7 +152,7 @@ class PDFHandler(object):
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
-                self._save_page(self.filename, p, tempdir)
+                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
                     for p in self.pages]
            parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
@@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
    Parameters
    ----------
    filepath : str
-        Path to PDF file.
+        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end'.
@@ -1,12 +1,17 @@
+# -*- coding: utf-8 -*-
 from __future__ import division
+
+import os
+import sys
+import random
 import shutil
+import string
 import tempfile
 import warnings
 from itertools import groupby
 from operator import itemgetter

 import numpy as np
-
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -18,6 +23,77 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
                             LTTextLineVertical)


+# True on Python 3 and later; decides which stdlib URL modules get imported.
+PY3 = sys.version_info[0] >= 3
+if not PY3:
+    # Python 2 module layout.
+    from urllib2 import urlopen
+    from urlparse import urlparse as parse_url
+    from urlparse import uses_relative, uses_netloc, uses_params
+else:
+    from urllib.request import urlopen
+    from urllib.parse import urlparse as parse_url
+    from urllib.parse import uses_relative, uses_netloc, uses_params


+# Union of the schemes listed in uses_relative, uses_netloc and uses_params,
+# without the empty string (which stands for "no scheme").
+_VALID_URLS = set(uses_relative + uses_netloc + uses_params) - {''}
+
+
+# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
+def is_url(url):
+    """Tell whether the scheme of `url` is one of the known URL schemes.

+    Parameters
+    ----------
+    url : str or unicode
+        Candidate string; anything that cannot be parsed counts as "not a URL".

+    Returns
+    -------
+    isurl : bool
+        True if the parsed scheme is in ``_VALID_URLS``, False otherwise
+        (including when parsing raises).

+    """
+    try:
+        scheme = parse_url(url).scheme
+    except Exception:
+        # Deliberately broad: any parsing failure means "treat as a local path".
+        return False
+    return scheme in _VALID_URLS
+
+
+def random_string(length):
+    """Generate a random string of ASCII digits and letters.

+    Parameters
+    ----------
+    length : int
+        Number of characters to generate. Zero or a negative value
+        yields an empty string.

+    Returns
+    -------
+    ret : str
+        String of `length` characters drawn with ``random.choice``.
+        Not suitable for security-sensitive tokens.

+    """
+    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
+    # range() is empty for length <= 0, so a negative length can no longer
+    # spin forever the way a ``while length:`` countdown would.
+    return ''.join(random.choice(alphabet) for _ in range(length))
+
+
+def download_url(url):
+    """Download a PDF from the specified URL into a temporary file.

+    Parameters
+    ----------
+    url : str or unicode
+        Location of the PDF file.

+    Returns
+    -------
+    filepath : str or unicode
+        Path of the temporary file, ending in ``.pdf``. The file is not
+        deleted automatically; removing it is up to the caller.

+    Raises
+    ------
+    NotImplementedError
+        If the response's content type is not ``application/pdf``.

+    """
+    obj = urlopen(url)
+    try:
+        if PY3:
+            content_type = obj.info().get_content_type()
+        else:
+            # The raw header may carry parameters ("application/pdf; charset=...")
+            # or be missing altogether; keep only the lower-cased media type.
+            header = obj.info().getheader('Content-Type') or ''
+            content_type = header.split(';')[0].strip().lower()
+        # Validate before creating the temp file so a rejected response
+        # leaves nothing behind on disk.
+        if content_type != 'application/pdf':
+            raise NotImplementedError("File format not supported")
+        # suffix='.pdf' gives a unique, collision-free name that already has
+        # the extension PDFHandler checks for, so no rename is needed.
+        f = tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False)
+        try:
+            with f:
+                shutil.copyfileobj(obj, f)
+        except Exception:
+            # Do not leave a partially written file behind.
+            os.remove(f.name)
+            raise
+    finally:
+        obj.close()
+    return f.name
+
+
 stream_kwargs = [
    'columns',
    'row_tol',
@@ -207,6 +207,14 @@ def test_repr():
    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"


+def test_url():
+    """Read a PDF from a URL and check the table list, first table and first cell."""
+    tables = camelot.read_pdf(
+        "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf")
+    assert repr(tables) == "<TableList n=1>"
+    first_table = tables[0]
+    assert repr(first_table) == "<Table shape=(7, 7)>"
+    assert repr(first_table.cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
+
+
 def test_arabic():
    df = pd.DataFrame(data_arabic)