Add support for reading PDFs from a URL
parent
0198f5527c
commit
2b3461deab
|
|
@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
|
|||
from .core import TableList
|
||||
from .parsers import Stream, Lattice
|
||||
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||
get_rotation)
|
||||
get_rotation, is_url, download_url)
|
||||
|
||||
|
||||
class PDFHandler(object):
|
||||
|
|
@ -18,8 +18,8 @@ class PDFHandler(object):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to PDF file.
|
||||
filepath : str
|
||||
Filepath or URL of the PDF file.
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers.
|
||||
Example: '1,3,4' or '1,4-end'.
|
||||
|
|
@ -27,11 +27,13 @@ class PDFHandler(object):
|
|||
Password for decryption.
|
||||
|
||||
"""
|
||||
def __init__(self, filename, pages='1', password=None):
|
||||
self.filename = filename
|
||||
if not filename.lower().endswith('.pdf'):
|
||||
def __init__(self, filepath, pages='1', password=None):
|
||||
if is_url(filepath):
|
||||
filepath = download_url(filepath)
|
||||
self.filepath = filepath
|
||||
if not filepath.lower().endswith('.pdf'):
|
||||
raise NotImplementedError("File format not supported")
|
||||
self.pages = self._get_pages(self.filename, pages)
|
||||
self.pages = self._get_pages(self.filepath, pages)
|
||||
if password is None:
|
||||
self.password = ''
|
||||
else:
|
||||
|
|
@ -39,13 +41,13 @@ class PDFHandler(object):
|
|||
if sys.version_info[0] < 3:
|
||||
self.password = self.password.encode('ascii')
|
||||
|
||||
def _get_pages(self, filename, pages):
|
||||
def _get_pages(self, filepath, pages):
|
||||
"""Converts pages string to list of ints.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to PDF file.
|
||||
filepath : str
|
||||
Filepath or URL of the PDF file.
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers.
|
||||
Example: 1,3,4 or 1,4-end.
|
||||
|
|
@ -60,7 +62,7 @@ class PDFHandler(object):
|
|||
if pages == '1':
|
||||
page_numbers.append({'start': 1, 'end': 1})
|
||||
else:
|
||||
infile = PdfFileReader(open(filename, 'rb'), strict=False)
|
||||
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt(self.password)
|
||||
if pages == 'all':
|
||||
|
|
@ -79,20 +81,20 @@ class PDFHandler(object):
|
|||
P.extend(range(p['start'], p['end'] + 1))
|
||||
return sorted(set(P))
|
||||
|
||||
def _save_page(self, filename, page, temp):
|
||||
def _save_page(self, filepath, page, temp):
|
||||
"""Saves specified page from PDF into a temporary directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str
|
||||
Path to PDF file.
|
||||
filepath : str
|
||||
Filepath or URL of the PDF file.
|
||||
page : int
|
||||
Page number.
|
||||
temp : str
|
||||
Tmp directory.
|
||||
|
||||
"""
|
||||
with open(filename, 'rb') as fileobj:
|
||||
with open(filepath, 'rb') as fileobj:
|
||||
infile = PdfFileReader(fileobj, strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt(self.password)
|
||||
|
|
@ -150,7 +152,7 @@ class PDFHandler(object):
|
|||
tables = []
|
||||
with TemporaryDirectory() as tempdir:
|
||||
for p in self.pages:
|
||||
self._save_page(self.filename, p, tempdir)
|
||||
self._save_page(self.filepath, p, tempdir)
|
||||
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
||||
for p in self.pages]
|
||||
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
|||
Parameters
|
||||
----------
|
||||
filepath : str
|
||||
Path to PDF file.
|
||||
Filepath or URL of the PDF file.
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers.
|
||||
Example: '1,3,4' or '1,4-end'.
|
||||
|
|
|
|||
|
|
@ -1,12 +1,17 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import division
|
||||
|
||||
import os
|
||||
import sys
|
||||
import random
|
||||
import shutil
|
||||
import string
|
||||
import tempfile
|
||||
import warnings
|
||||
from itertools import groupby
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
|
|
@ -18,6 +23,73 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
|||
LTTextLineVertical)
|
||||
|
||||
|
||||
PY3 = sys.version_info[0] >= 3
|
||||
if PY3:
|
||||
from urllib.request import urlopen
|
||||
from urllib.parse import urlparse as parse_url
|
||||
from urllib.parse import uses_relative, uses_netloc, uses_params
|
||||
else:
|
||||
from urllib2 import urlopen
|
||||
from urlparse import urlparse as parse_url
|
||||
from urlparse import uses_relative, uses_netloc, uses_params
|
||||
|
||||
|
||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||
_VALID_URLS.discard('')
|
||||
|
||||
|
||||
# Adapted from pandas: https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
||||
def is_url(url):
    """Tell whether ``url`` starts with a recognised URL scheme.

    Parameters
    ----------
    url : str or unicode
        Candidate string, e.g. a local path or a web address.

    Returns
    -------
    isurl : bool
        True when the parsed scheme is one of ``_VALID_URLS``; False
        otherwise, including when ``url`` cannot be parsed at all.

    """
    try:
        parsed = parse_url(url)
        has_known_scheme = parsed.scheme in _VALID_URLS
    except Exception:
        # Best effort: anything that cannot be parsed is not a URL.
        has_known_scheme = False
    return has_known_scheme
|
||||
|
||||
|
||||
def random_string(length):
    """Generate a random alphanumeric string.

    Characters are drawn with ``random.choice`` from the ASCII digits,
    lowercase letters and uppercase letters, so the result is suitable
    for throwaway names but not for anything security-sensitive.

    Parameters
    ----------
    length : int
        Number of characters to generate.

    Returns
    -------
    ret : str
        String of ``length`` characters; empty when ``length`` is zero
        or negative.

    """
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
    # range() is empty for non-positive lengths, so a negative value can no
    # longer spin forever the way a ``while length:`` countdown would.
    return ''.join(random.choice(alphabet) for _ in range(length))
|
||||
|
||||
|
||||
def download_url(url):
    """Download the PDF at the specified URL into a temporary file.

    Parameters
    ----------
    url : str or unicode
        Address of the PDF file.

    Returns
    -------
    filepath : str or unicode
        Path of a newly created temporary file ending in ``.pdf`` that
        holds the downloaded bytes. The file is not deleted
        automatically; the caller owns it.

    Raises
    ------
    NotImplementedError
        If the response content type is not ``application/pdf``. No
        temporary file is left behind in that case.

    """
    from contextlib import closing

    # closing() rather than a bare ``with``: the response object returned by
    # Python 2's urllib2 is not a context manager.
    with closing(urlopen(url)) as response:
        headers = response.info()
        # Python 3 exposes email.message.Message.get_content_type();
        # Python 2's mimetools.Message only offers gettype().
        if hasattr(headers, 'get_content_type'):
            content_type = headers.get_content_type()
        else:
            content_type = headers.gettype()
        # Validate before creating the temp file so a rejected download
        # leaves nothing on disk.
        if content_type != 'application/pdf':
            raise NotImplementedError("File format not supported")

        # Let tempfile choose a unique name with the required extension
        # instead of renaming to a short random name that could collide
        # with (and overwrite) an existing file in the temp directory.
        f = tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False)
        try:
            with f:
                # Stream in chunks rather than holding the whole PDF in memory.
                shutil.copyfileobj(response, f)
        except BaseException:
            # Remove the partial file on any failure, then re-raise.
            os.remove(f.name)
            raise
    return f.name
|
||||
|
||||
|
||||
stream_kwargs = [
|
||||
'columns',
|
||||
'row_tol',
|
||||
|
|
|
|||
|
|
@ -207,6 +207,14 @@ def test_repr():
|
|||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||
|
||||
|
||||
def test_url():
    """Reading a PDF by URL yields the same table as the local foo.pdf."""
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    table_list = camelot.read_pdf(pdf_url)

    assert repr(table_list) == "<TableList n=1>"

    first_table = table_list[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"

    top_left_cell = first_table.cells[0][0]
    assert repr(top_left_cell) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||
|
||||
|
||||
def test_arabic():
|
||||
df = pd.DataFrame(data_arabic)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue