diff --git a/camelot/handlers.py b/camelot/handlers.py
index 35708ee..e0dd8c7 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
-                    get_rotation)
+                    get_rotation, is_url, download_url)
 
 
 class PDFHandler(object):
@@ -18,8 +18,8 @@ class PDFHandler(object):
 
     Parameters
     ----------
-    filename : str
-        Path to PDF file.
+    filepath : str
+        Filepath or URL of the PDF file.
     pages : str, optional (default: '1')
         Comma-separated page numbers.
         Example: '1,3,4' or '1,4-end'.
@@ -27,11 +27,13 @@ class PDFHandler(object):
         Password for decryption.
 
     """
-    def __init__(self, filename, pages='1', password=None):
-        self.filename = filename
-        if not filename.lower().endswith('.pdf'):
+    def __init__(self, filepath, pages='1', password=None):
+        if is_url(filepath):
+            filepath = download_url(filepath)
+        self.filepath = filepath
+        if not filepath.lower().endswith('.pdf'):
             raise NotImplementedError("File format not supported")
-        self.pages = self._get_pages(self.filename, pages)
+        self.pages = self._get_pages(self.filepath, pages)
         if password is None:
             self.password = ''
         else:
@@ -39,13 +41,13 @@ class PDFHandler(object):
             if sys.version_info[0] < 3:
                 self.password = self.password.encode('ascii')
 
-    def _get_pages(self, filename, pages):
+    def _get_pages(self, filepath, pages):
         """Converts pages string to list of ints.
 
         Parameters
         ----------
-        filename : str
-            Path to PDF file.
+        filepath : str
+            Filepath of the PDF file.
         pages : str, optional (default: '1')
             Comma-separated page numbers.
             Example: 1,3,4 or 1,4-end.
@@ -60,7 +62,7 @@ class PDFHandler(object):
         if pages == '1':
             page_numbers.append({'start': 1, 'end': 1})
         else:
-            infile = PdfFileReader(open(filename, 'rb'), strict=False)
+            infile = PdfFileReader(open(filepath, 'rb'), strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
             if pages == 'all':
@@ -79,20 +81,20 @@ class PDFHandler(object):
             P.extend(range(p['start'], p['end'] + 1))
         return sorted(set(P))
 
-    def _save_page(self, filename, page, temp):
+    def _save_page(self, filepath, page, temp):
         """Saves specified page from PDF into a temporary directory.
 
         Parameters
         ----------
-        filename : str
-            Path to PDF file.
+        filepath : str
+            Filepath of the PDF file.
         page : int
             Page number.
         temp : str
             Tmp directory.
 
         """
-        with open(filename, 'rb') as fileobj:
+        with open(filepath, 'rb') as fileobj:
             infile = PdfFileReader(fileobj, strict=False)
             if infile.isEncrypted:
                 infile.decrypt(self.password)
@@ -150,7 +152,7 @@ class PDFHandler(object):
         tables = []
         with TemporaryDirectory() as tempdir:
             for p in self.pages:
-                self._save_page(self.filename, p, tempdir)
+                self._save_page(self.filepath, p, tempdir)
             pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
                      for p in self.pages]
             parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
diff --git a/camelot/io.py b/camelot/io.py
index 96ffa27..bff5974 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
     Parameters
     ----------
     filepath : str
-        Path to PDF file.
+        Filepath or URL of the PDF file.
     pages : str, optional (default: '1')
         Comma-separated page numbers.
         Example: '1,3,4' or '1,4-end'.
diff --git a/camelot/utils.py b/camelot/utils.py
index 88564f7..82aa493 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -1,12 +1,17 @@
+# -*- coding: utf-8 -*-
 from __future__ import division
+
+import os
+import sys
+import random
 import shutil
+import string
 import tempfile
 import warnings
 from itertools import groupby
 from operator import itemgetter
 
 import numpy as np
-
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
@@ -18,6 +23,104 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
                              LTTextLineVertical)
 
 
+PY3 = sys.version_info[0] >= 3
+if PY3:
+    from urllib.request import urlopen
+    from urllib.parse import urlparse as parse_url
+    from urllib.parse import uses_relative, uses_netloc, uses_params
+else:
+    from urllib2 import urlopen
+    from urlparse import urlparse as parse_url
+    from urlparse import uses_relative, uses_netloc, uses_params
+
+
+_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
+_VALID_URLS.discard('')
+
+
+# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
+def is_url(url):
+    """Check to see if a URL has a valid protocol.
+
+    Parameters
+    ----------
+    url : str or unicode
+
+    Returns
+    -------
+    isurl : bool
+        If url has a valid protocol return True otherwise False.
+
+    """
+    try:
+        return parse_url(url).scheme in _VALID_URLS
+    except Exception:
+        return False
+
+
+def random_string(length):
+    """Generate a random alphanumeric string.
+
+    Parameters
+    ----------
+    length : int
+        Number of characters to generate.
+
+    Returns
+    -------
+    ret : str
+        String of ``length`` characters drawn from digits and ASCII
+        letters; empty when ``length`` is zero or negative.
+
+    """
+    chars = string.digits + string.ascii_lowercase + string.ascii_uppercase
+    # range() is empty for a non-positive length, so a negative length
+    # returns '' instead of never terminating.
+    return ''.join(random.choice(chars) for _ in range(length))
+
+
+def download_url(url):
+    """Download a PDF from the specified URL into a temporary file.
+
+    Parameters
+    ----------
+    url : str or unicode
+
+    Returns
+    -------
+    filepath : str or unicode
+        Path of the temporary ``.pdf`` file. It is created with
+        ``delete=False``, so removing it is left to the caller.
+
+    Raises
+    ------
+    NotImplementedError
+        If the response Content-Type is not ``application/pdf``.
+
+    """
+    obj = urlopen(url)
+    try:
+        headers = obj.info()
+        # urllib2 (Python 2) returns a mimetools.Message, which offers
+        # gettype() rather than get_content_type().
+        if PY3:
+            content_type = headers.get_content_type()
+        else:
+            content_type = headers.gettype()
+        # Validate before creating the temporary file so a rejected
+        # response leaves nothing behind on disk.
+        if content_type != 'application/pdf':
+            raise NotImplementedError("File format not supported")
+        # The '.pdf' suffix keeps the unique temporary name compatible
+        # with the extension check in PDFHandler.
+        with tempfile.NamedTemporaryFile(
+                'wb', suffix='.pdf', delete=False) as f:
+            shutil.copyfileobj(obj, f)
+    finally:
+        obj.close()
+    return f.name
+
+
 stream_kwargs = [
     'columns',
     'row_tol',
diff --git a/tests/test_common.py b/tests/test_common.py
index 83c436b..0d95a02 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -207,6 +207,14 @@ def test_repr():
     assert repr(tables[0].cells[0][0]) == ""
 
 
+def test_url():
+    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
+    tables = camelot.read_pdf(url)
+    assert repr(tables) == ""
+    assert repr(tables[0]) == ""
+    assert repr(tables[0].cells[0][0]) == ""
+
+
 def test_arabic():
     df = pd.DataFrame(data_arabic)
 