Merge pull request #236 from socialcopsdev/read_url

[MRG] Add support to read from url
pull/2/head
Vinayak Mehta 2018-12-24 13:29:41 +05:30 committed by GitHub
commit 0b85c77425
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 105 additions and 18 deletions

View File

@ -6,6 +6,7 @@ master
**Improvements** **Improvements**
* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support for reading a PDF from a URL. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
* [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta. * [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
* `strip_text`: To define characters that should be stripped from each string. * `strip_text`: To define characters that should be stripped from each string.
* `edge_tol`: Tolerance parameter for extending textedges vertically. * `edge_tol`: Tolerance parameter for extending textedges vertically.

View File

@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList from .core import TableList
from .parsers import Stream, Lattice from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects, from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation) get_rotation, is_url, download_url)
class PDFHandler(object): class PDFHandler(object):
@ -18,8 +18,8 @@ class PDFHandler(object):
Parameters Parameters
---------- ----------
filename : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'. Example: '1,3,4' or '1,4-end'.
@ -27,11 +27,13 @@ class PDFHandler(object):
Password for decryption. Password for decryption.
""" """
def __init__(self, filename, pages='1', password=None): def __init__(self, filepath, pages='1', password=None):
self.filename = filename if is_url(filepath):
if not filename.lower().endswith('.pdf'): filepath = download_url(filepath)
self.filepath = filepath
if not filepath.lower().endswith('.pdf'):
raise NotImplementedError("File format not supported") raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages) self.pages = self._get_pages(self.filepath, pages)
if password is None: if password is None:
self.password = '' self.password = ''
else: else:
@ -39,13 +41,13 @@ class PDFHandler(object):
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
self.password = self.password.encode('ascii') self.password = self.password.encode('ascii')
def _get_pages(self, filename, pages): def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints. """Converts pages string to list of ints.
Parameters Parameters
---------- ----------
filename : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: 1,3,4 or 1,4-end. Example: 1,3,4 or 1,4-end.
@ -60,7 +62,7 @@ class PDFHandler(object):
if pages == '1': if pages == '1':
page_numbers.append({'start': 1, 'end': 1}) page_numbers.append({'start': 1, 'end': 1})
else: else:
infile = PdfFileReader(open(filename, 'rb'), strict=False) infile = PdfFileReader(open(filepath, 'rb'), strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
if pages == 'all': if pages == 'all':
@ -79,20 +81,20 @@ class PDFHandler(object):
P.extend(range(p['start'], p['end'] + 1)) P.extend(range(p['start'], p['end'] + 1))
return sorted(set(P)) return sorted(set(P))
def _save_page(self, filename, page, temp): def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory. """Saves specified page from PDF into a temporary directory.
Parameters Parameters
---------- ----------
filename : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
page : int page : int
Page number. Page number.
temp : str temp : str
Tmp directory. Tmp directory.
""" """
with open(filename, 'rb') as fileobj: with open(filepath, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False) infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
@ -150,7 +152,7 @@ class PDFHandler(object):
tables = [] tables = []
with TemporaryDirectory() as tempdir: with TemporaryDirectory() as tempdir:
for p in self.pages: for p in self.pages:
self._save_page(self.filename, p, tempdir) self._save_page(self.filepath, p, tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p)) pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages] for p in self.pages]
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)

View File

@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Parameters Parameters
---------- ----------
filepath : str filepath : str
Path to PDF file. Filepath or URL of the PDF file.
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'. Example: '1,3,4' or '1,4-end'.

View File

@ -1,12 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
import os
import sys
import random
import shutil import shutil
import string
import tempfile import tempfile
import warnings import warnings
from itertools import groupby from itertools import groupby
from operator import itemgetter from operator import itemgetter
import numpy as np import numpy as np
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@ -18,6 +23,77 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical) LTTextLineVertical)
# True when running on Python 3 or newer; used to select between the
# Python 2 and Python 3 spellings of the URL helpers below.
PY3 = sys.version_info[0] >= 3
if PY3:
    from urllib.request import urlopen
    from urllib.parse import urlparse as parse_url
    from urllib.parse import uses_relative, uses_netloc, uses_params
else:
    from urllib2 import urlopen
    from urlparse import urlparse as parse_url
    from urlparse import uses_relative, uses_netloc, uses_params
# Schemes that is_url() treats as valid: the union of the standard
# library's uses_relative, uses_netloc and uses_params scheme lists.
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
# Remove the empty-string entry so an empty scheme is never accepted.
_VALID_URLS.discard('')
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
    """Tell whether ``url`` carries a scheme listed in ``_VALID_URLS``.

    Parameters
    ----------
    url : str or unicode
        Candidate string, e.g. a local filepath or a web address.

    Returns
    -------
    isurl : bool
        True if the parsed scheme is one of the valid protocols,
        False otherwise (including when parsing raises).

    """
    try:
        scheme = parse_url(url).scheme
        has_valid_scheme = scheme in _VALID_URLS
    except Exception:
        # Anything that cannot be parsed is simply "not a URL".
        return False
    return has_valid_scheme
def random_string(length):
    """Generate a random alphanumeric string.

    Parameters
    ----------
    length : int
        Number of characters to generate. Zero or a negative value
        yields an empty string.

    Returns
    -------
    ret : str
        String of ``length`` characters drawn from the ASCII digits and
        the lowercase and uppercase ASCII letters.

    """
    alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase
    # range() is empty for non-positive lengths, so a negative length can
    # no longer spin forever the way a count-down-to-zero loop would.
    return ''.join(random.choice(alphabet) for _ in range(length))
def download_url(url):
    """Download the PDF at the specified URL into a temporary file.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    filepath : str or unicode
        Path of the temporary ``.pdf`` file holding the downloaded
        content. The file is not removed automatically.

    Raises
    ------
    NotImplementedError
        If the response's content type is not ``application/pdf``.

    """
    from contextlib import closing

    # closing() releases the connection on every exit path; it is used
    # because the Python 2 response object is not a context manager.
    with closing(urlopen(url)) as obj:
        if PY3:
            content_type = obj.info().get_content_type()
        else:
            # The raw header may be missing or carry parameters
            # (e.g. "application/pdf; charset=binary").
            header = obj.info().getheader('Content-Type') or ''
            content_type = header.split(';')[0].strip().lower()
        # Validate before creating the temporary file so that a rejected
        # download leaves nothing behind on disk.
        if content_type != 'application/pdf':
            raise NotImplementedError("File format not supported")
        # Let tempfile pick a unique name with the required extension
        # instead of renaming to a short random name that could collide.
        f = tempfile.NamedTemporaryFile('wb', suffix='.pdf', delete=False)
        try:
            with f:
                # Stream in chunks rather than holding the whole body
                # in memory.
                shutil.copyfileobj(obj, f)
        except BaseException:
            # Do not leave a partially written file behind.
            os.remove(f.name)
            raise
    return f.name
stream_kwargs = [ stream_kwargs = [
'columns', 'columns',
'row_tol', 'row_tol',

View File

@ -207,6 +207,14 @@ def test_repr():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_url():
    # Passes an https URL (instead of a local path) to camelot.read_pdf
    # and checks the reprs of the table list, first table and first cell.
    # NOTE(review): fetching this URL presumably needs network access and
    # the hosted foo.pdf to stay available -- confirm for CI.
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
    tables = camelot.read_pdf(url)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_arabic(): def test_arabic():
df = pd.DataFrame(data_arabic) df = pd.DataFrame(data_arabic)