Add support to read from url
parent
0198f5527c
commit
2b3461deab
|
|
@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
from .core import TableList
|
from .core import TableList
|
||||||
from .parsers import Stream, Lattice
|
from .parsers import Stream, Lattice
|
||||||
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
|
||||||
get_rotation)
|
get_rotation, is_url, download_url)
|
||||||
|
|
||||||
|
|
||||||
class PDFHandler(object):
|
class PDFHandler(object):
|
||||||
|
|
@ -18,8 +18,8 @@ class PDFHandler(object):
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filepath : str
|
||||||
Path to PDF file.
|
Filepath or URL of the PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: '1,3,4' or '1,4-end'.
|
Example: '1,3,4' or '1,4-end'.
|
||||||
|
|
@ -27,11 +27,13 @@ class PDFHandler(object):
|
||||||
Password for decryption.
|
Password for decryption.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, pages='1', password=None):
|
def __init__(self, filepath, pages='1', password=None):
|
||||||
self.filename = filename
|
if is_url(filepath):
|
||||||
if not filename.lower().endswith('.pdf'):
|
filepath = download_url(filepath)
|
||||||
|
self.filepath = filepath
|
||||||
|
if not filepath.lower().endswith('.pdf'):
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
self.pages = self._get_pages(self.filename, pages)
|
self.pages = self._get_pages(self.filepath, pages)
|
||||||
if password is None:
|
if password is None:
|
||||||
self.password = ''
|
self.password = ''
|
||||||
else:
|
else:
|
||||||
|
|
@ -39,13 +41,13 @@ class PDFHandler(object):
|
||||||
if sys.version_info[0] < 3:
|
if sys.version_info[0] < 3:
|
||||||
self.password = self.password.encode('ascii')
|
self.password = self.password.encode('ascii')
|
||||||
|
|
||||||
def _get_pages(self, filename, pages):
|
def _get_pages(self, filepath, pages):
|
||||||
"""Converts pages string to list of ints.
|
"""Converts pages string to list of ints.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filepath : str
|
||||||
Path to PDF file.
|
Path to a local PDF file (a URL must already have been downloaded).
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: 1,3,4 or 1,4-end.
|
Example: 1,3,4 or 1,4-end.
|
||||||
|
|
@ -60,7 +62,7 @@ class PDFHandler(object):
|
||||||
if pages == '1':
|
if pages == '1':
|
||||||
page_numbers.append({'start': 1, 'end': 1})
|
page_numbers.append({'start': 1, 'end': 1})
|
||||||
else:
|
else:
|
||||||
infile = PdfFileReader(open(filename, 'rb'), strict=False)
|
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
if pages == 'all':
|
if pages == 'all':
|
||||||
|
|
@ -79,20 +81,20 @@ class PDFHandler(object):
|
||||||
P.extend(range(p['start'], p['end'] + 1))
|
P.extend(range(p['start'], p['end'] + 1))
|
||||||
return sorted(set(P))
|
return sorted(set(P))
|
||||||
|
|
||||||
def _save_page(self, filename, page, temp):
|
def _save_page(self, filepath, page, temp):
|
||||||
"""Saves specified page from PDF into a temporary directory.
|
"""Saves specified page from PDF into a temporary directory.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filename : str
|
filepath : str
|
||||||
Path to PDF file.
|
Path to a local PDF file (a URL must already have been downloaded).
|
||||||
page : int
|
page : int
|
||||||
Page number.
|
Page number.
|
||||||
temp : str
|
temp : str
|
||||||
Tmp directory.
|
Tmp directory.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(filename, 'rb') as fileobj:
|
with open(filepath, 'rb') as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt(self.password)
|
infile.decrypt(self.password)
|
||||||
|
|
@ -150,7 +152,7 @@ class PDFHandler(object):
|
||||||
tables = []
|
tables = []
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
for p in self.pages:
|
for p in self.pages:
|
||||||
self._save_page(self.filename, p, tempdir)
|
self._save_page(self.filepath, p, tempdir)
|
||||||
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
|
||||||
for p in self.pages]
|
for p in self.pages]
|
||||||
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
filepath : str
|
filepath : str
|
||||||
Path to PDF file.
|
Filepath or URL of the PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: '1,3,4' or '1,4-end'.
|
Example: '1,3,4' or '1,4-end'.
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,17 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
|
import string
|
||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdfpage import PDFPage
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
|
@ -18,6 +23,73 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
|
||||||
LTTextLineVertical)
|
LTTextLineVertical)
|
||||||
|
|
||||||
|
|
||||||
|
# True when running on Python 3 or newer; decides which stdlib modules
# provide the URL helpers imported below.
PY3 = sys.version_info[0] >= 3

if not PY3:
    # Python 2 module layout.
    from urllib2 import urlopen
    from urlparse import (urlparse as parse_url, uses_relative, uses_netloc,
                          uses_params)
else:
    # Python 3 module layout.
    from urllib.request import urlopen
    from urllib.parse import (urlparse as parse_url, uses_relative,
                              uses_netloc, uses_params)


# Union of the scheme lists exposed by the stdlib URL parser, with the
# empty-string entry removed.
_VALID_URLS = set(uses_relative) | set(uses_netloc) | set(uses_params)
_VALID_URLS.discard('')
|
||||||
|
|
||||||
|
|
||||||
|
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
||||||
|
def is_url(url):
    """Tell whether ``url`` starts with a recognised URL scheme.

    Parameters
    ----------
    url : str or unicode
        Candidate string to inspect.

    Returns
    -------
    isurl : bool
        True when the scheme parsed out of ``url`` is a member of
        ``_VALID_URLS``; False otherwise, including when parsing raises.

    """
    try:
        scheme = parse_url(url).scheme
        recognised = scheme in _VALID_URLS
    except Exception:
        # Best-effort check: anything that cannot be parsed is not a URL.
        recognised = False
    return recognised
|
||||||
|
|
||||||
|
|
||||||
|
def random_string(length):
    """Return a random alphanumeric string.

    Parameters
    ----------
    length : int
        Number of characters to generate. Zero or a negative value
        yields an empty string.

    Returns
    -------
    ret : str
        ``length`` characters drawn from the digits and the ASCII
        lowercase and uppercase letters.

    """
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
    # range() is empty for length <= 0, so a negative length can no longer
    # spin forever the way a ``while length:`` countdown does.
    return ''.join(random.choice(alphabet) for _ in range(length))
|
||||||
|
|
||||||
|
|
||||||
|
def download_url(url):
    """Download a PDF from the specified URL into a temporary file.

    The file is created with ``delete=False`` and is not removed by this
    function on success, so the caller owns the returned path.

    Parameters
    ----------
    url : str or unicode
        Location to fetch; passed directly to ``urlopen``.

    Returns
    -------
    filepath : str or unicode
        Path of a newly created temporary file ending in ``.pdf`` that
        holds the downloaded bytes.

    Raises
    ------
    NotImplementedError
        If the response's content type is not ``application/pdf``.

    """
    response = urlopen(url)
    try:
        headers = response.info()
        # Python 3 headers are an email.message.Message; Python 2's
        # mimetools.Message only offers gettype().
        if hasattr(headers, 'get_content_type'):
            content_type = headers.get_content_type()
        else:
            content_type = headers.gettype()
        # Validate before creating any file so a rejected download leaves
        # nothing behind on disk.
        if content_type != 'application/pdf':
            raise NotImplementedError("File format not supported")

        # Let tempfile pick a unique name with the required suffix instead
        # of renaming a still-open file to a short random name.
        f = tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False)
        try:
            with f:
                # Stream in chunks rather than holding the whole body in memory.
                shutil.copyfileobj(response, f)
        except Exception:
            # Do not leave a truncated file behind when the transfer fails.
            os.remove(f.name)
            raise
        return f.name
    finally:
        response.close()
|
||||||
|
|
||||||
|
|
||||||
stream_kwargs = [
|
stream_kwargs = [
|
||||||
'columns',
|
'columns',
|
||||||
'row_tol',
|
'row_tol',
|
||||||
|
|
|
||||||
|
|
@ -207,6 +207,14 @@ def test_repr():
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
|
||||||
|
def test_url():
    """Read a PDF from a remote URL and check the reprs of the result."""
    pdf_url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    tables = camelot.read_pdf(pdf_url)
    assert repr(tables) == "<TableList n=1>"

    first_table = tables[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    assert repr(first_table.cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||||
|
|
||||||
|
|
||||||
def test_arabic():
|
def test_arabic():
|
||||||
df = pd.DataFrame(data_arabic)
|
df = pd.DataFrame(data_arabic)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue