Merge pull request #236 from socialcopsdev/read_url

[MRG] Add support to read from url
pull/2/head
Vinayak Mehta 2018-12-24 13:29:41 +05:30 committed by GitHub
commit 0b85c77425
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 105 additions and 18 deletions

View File

@ -6,6 +6,7 @@ master
**Improvements**
* [#91](https://github.com/socialcopsdev/camelot/issues/91) Add support to read from url. [#236](https://github.com/socialcopsdev/camelot/pull/236) by Vinayak Mehta.
* [#229](https://github.com/socialcopsdev/camelot/issues/229), [#230](https://github.com/socialcopsdev/camelot/issues/230) and [#233](https://github.com/socialcopsdev/camelot/issues/233) New configuration parameters. [#234](https://github.com/socialcopsdev/camelot/pull/234) by Vinayak Mehta.
* `strip_text`: To define characters that should be stripped from each string.
* `edge_tol`: Tolerance parameter for extending textedges vertically.

View File

@ -8,7 +8,7 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice
from .utils import (TemporaryDirectory, get_page_layout, get_text_objects,
get_rotation)
get_rotation, is_url, download_url)
class PDFHandler(object):
@ -18,8 +18,8 @@ class PDFHandler(object):
Parameters
----------
filename : str
Path to PDF file.
filepath : str
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'.
@ -27,11 +27,13 @@ class PDFHandler(object):
Password for decryption.
"""
def __init__(self, filename, pages='1', password=None):
self.filename = filename
if not filename.lower().endswith('.pdf'):
def __init__(self, filepath, pages='1', password=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath = filepath
if not filepath.lower().endswith('.pdf'):
raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages)
self.pages = self._get_pages(self.filepath, pages)
if password is None:
self.password = ''
else:
@ -39,13 +41,13 @@ class PDFHandler(object):
if sys.version_info[0] < 3:
self.password = self.password.encode('ascii')
def _get_pages(self, filename, pages):
def _get_pages(self, filepath, pages):
"""Converts pages string to list of ints.
Parameters
----------
filename : str
Path to PDF file.
filepath : str
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: 1,3,4 or 1,4-end.
@ -60,7 +62,7 @@ class PDFHandler(object):
if pages == '1':
page_numbers.append({'start': 1, 'end': 1})
else:
infile = PdfFileReader(open(filename, 'rb'), strict=False)
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
if pages == 'all':
@ -79,20 +81,20 @@ class PDFHandler(object):
P.extend(range(p['start'], p['end'] + 1))
return sorted(set(P))
def _save_page(self, filename, page, temp):
def _save_page(self, filepath, page, temp):
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filename : str
Path to PDF file.
filepath : str
Filepath or URL of the PDF file.
page : int
Page number.
temp : str
Tmp directory.
"""
with open(filename, 'rb') as fileobj:
with open(filepath, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
@ -150,7 +152,7 @@ class PDFHandler(object):
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filename, p, tempdir)
self._save_page(self.filepath, p, tempdir)
pages = [os.path.join(tempdir, 'page-{0}.pdf'.format(p))
for p in self.pages]
parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs)

View File

@ -15,7 +15,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
Parameters
----------
filepath : str
Path to PDF file.
Filepath or URL of the PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'.

View File

@ -1,12 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import division
import os
import sys
import random
import shutil
import string
import tempfile
import warnings
from itertools import groupby
from operator import itemgetter
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
@ -18,6 +23,77 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical)
PY3 = sys.version_info[0] >= 3
if PY3:
from urllib.request import urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_relative, uses_netloc, uses_params
else:
from urllib2 import urlopen
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
"""Check to see if a URL has a valid protocol.
Parameters
----------
url : str or unicode
Returns
-------
isurl : bool
If url has a valid protocol return True otherwise False.
"""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def random_string(length):
ret = ''
while length:
ret += random.choice(string.digits + string.ascii_lowercase + string.ascii_uppercase)
length -= 1
return ret
def download_url(url):
"""Download file from specified URL.
Parameters
----------
url : str or unicode
Returns
-------
filepath : str or unicode
Temporary filepath.
"""
filename = '{}.pdf'.format(random_string(6))
with tempfile.NamedTemporaryFile('wb', delete=False) as f:
obj = urlopen(url)
if PY3:
content_type = obj.info().get_content_type()
else:
content_type = obj.info().getheader('Content-Type')
if content_type != 'application/pdf':
raise NotImplementedError("File format not supported")
f.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename)
shutil.move(f.name, filepath)
return filepath
stream_kwargs = [
'columns',
'row_tol',

View File

@ -207,6 +207,14 @@ def test_repr():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_url():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_arabic():
df = pd.DataFrame(data_arabic)