add support for file_bytes argument with managed_file_context()
parent
644bbe7c6d
commit
3d27547477
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from contextlib import contextmanager
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
|
@ -8,12 +10,13 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
|
|||
from .core import TableList
|
||||
from .parsers import Stream, Lattice
|
||||
from .utils import (
|
||||
InvalidArguments,
|
||||
TemporaryDirectory,
|
||||
get_page_layout,
|
||||
get_text_objects,
|
||||
get_rotation,
|
||||
is_url,
|
||||
download_url,
|
||||
get_url_bytes,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -24,19 +27,33 @@ class PDFHandler(object):
|
|||
|
||||
Parameters
|
||||
----------
|
||||
filepath : str
|
||||
Filepath or URL of the PDF file.
|
||||
filepath : str | pathlib.Path, optional (default: None)
|
||||
Filepath or URL of the PDF file. Required if file_bytes is not given
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers.
|
||||
Example: '1,3,4' or '1,4-end' or 'all'.
|
||||
password : str, optional (default: None)
|
||||
Password for decryption.
|
||||
file_bytes : io.IOBase, optional (default: None)
|
||||
A file-like stream. Required if filepath is not given
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, filepath, pages="1", password=None):
|
||||
def __init__(self, filepath=None, pages="1", password=None, file_bytes=None):
|
||||
if is_url(filepath):
|
||||
filepath = download_url(filepath)
|
||||
file_bytes = get_url_bytes(filepath)
|
||||
|
||||
if not filepath and not file_bytes:
|
||||
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
|
||||
if not filepath:
|
||||
# filepath must either be passed, or taken from the name attribute
|
||||
filepath = getattr(file_bytes, 'name')
|
||||
if not filepath:
|
||||
msg = ('Either pass a `filepath`, or give the '
|
||||
'`file_bytes` argument a name attribute')
|
||||
raise InvalidArguments(msg)
|
||||
self.file_bytes = file_bytes # ok to be None
|
||||
|
||||
self.filepath = filepath
|
||||
if not filepath.lower().endswith(".pdf"):
|
||||
raise NotImplementedError("File format not supported")
|
||||
|
|
@ -49,6 +66,28 @@ class PDFHandler(object):
|
|||
self.password = self.password.encode("ascii")
|
||||
self.pages = self._get_pages(pages)
|
||||
|
||||
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this handler's PDF.

    Reads from the `file_bytes` attribute when it is set, otherwise
    opens `filepath`. A file opened here from `filepath` is closed on
    exit or error; a caller-supplied `file_bytes` stream is left open.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at the start.

    """
    if not self.file_bytes:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes
        return

    # Every io.IOBase subclass exposes a ``seek`` attribute, even when the
    # underlying stream (pipe, socket, HTTP response) cannot seek, so ask
    # ``seekable()`` when it is available and only fall back to the
    # attribute check for duck-typed objects that lack it.
    seekable = getattr(self.file_bytes, "seekable", None)
    if callable(seekable):
        can_seek = seekable()
    else:
        can_seek = hasattr(self.file_bytes, "seek")

    if not can_seek:
        # Buffer the whole stream in memory so it can be rewound and
        # re-read on later calls.
        self.file_bytes = io.BytesIO(self.file_bytes.read())

    # Always rewind: callers may hand over a stream that has just been
    # written to or partially consumed.
    self.file_bytes.seek(0)
    yield self.file_bytes
|
||||
|
||||
def _get_pages(self, pages):
|
||||
"""Converts pages string to list of ints.
|
||||
|
||||
|
|
@ -71,7 +110,7 @@ class PDFHandler(object):
|
|||
if pages == "1":
|
||||
page_numbers.append({"start": 1, "end": 1})
|
||||
else:
|
||||
with open(self.filepath, "rb") as f:
|
||||
with self.managed_file_context() as f:
|
||||
infile = PdfFileReader(f, strict=False)
|
||||
|
||||
if infile.isEncrypted:
|
||||
|
|
@ -107,7 +146,7 @@ class PDFHandler(object):
|
|||
Tmp directory.
|
||||
|
||||
"""
|
||||
with open(filepath, "rb") as fileobj:
|
||||
with self.managed_file_context() as fileobj:
|
||||
infile = PdfFileReader(fileobj, strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt(self.password)
|
||||
|
|
|
|||
|
|
@ -3,16 +3,21 @@
|
|||
import warnings
|
||||
|
||||
from .handlers import PDFHandler
|
||||
from .utils import validate_input, remove_extra
|
||||
from .utils import (
|
||||
InvalidArguments,
|
||||
validate_input,
|
||||
remove_extra,
|
||||
)
|
||||
|
||||
|
||||
def read_pdf(
|
||||
filepath,
|
||||
filepath=None,
|
||||
pages="1",
|
||||
password=None,
|
||||
flavor="lattice",
|
||||
suppress_stdout=False,
|
||||
layout_kwargs={},
|
||||
file_bytes=None,
|
||||
**kwargs
|
||||
):
|
||||
"""Read PDF and return extracted tables.
|
||||
|
|
@ -22,8 +27,8 @@ def read_pdf(
|
|||
|
||||
Parameters
|
||||
----------
|
||||
filepath : str
|
||||
Filepath or URL of the PDF file.
|
||||
filepath : str | pathlib.Path, optional (default: None)
|
||||
Filepath or URL of the PDF file. Required if file_bytes is not given
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers.
|
||||
Example: '1,3,4' or '1,4-end' or 'all'.
|
||||
|
|
@ -34,6 +39,8 @@ def read_pdf(
|
|||
Lattice is used by default.
|
||||
suppress_stdout : bool, optional (default: True)
|
||||
Print all logs and warnings.
|
||||
file_bytes : io.IOBase, optional (default: None)
|
||||
A file-like stream. Required if filepath is not given
|
||||
layout_kwargs : dict, optional (default: {})
|
||||
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
|
||||
table_areas : list, optional (default: None)
|
||||
|
|
@ -103,12 +110,15 @@ def read_pdf(
|
|||
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
|
||||
)
|
||||
|
||||
if not filepath and not file_bytes:
|
||||
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
|
||||
|
||||
with warnings.catch_warnings():
|
||||
if suppress_stdout:
|
||||
warnings.simplefilter("ignore")
|
||||
|
||||
validate_input(kwargs, flavor=flavor)
|
||||
p = PDFHandler(filepath, pages=pages, password=password)
|
||||
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
|
||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||
tables = p.parse(
|
||||
flavor=flavor,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import random
|
||||
import shutil
|
||||
|
|
@ -36,6 +36,10 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
|||
_VALID_URLS.discard("")
|
||||
|
||||
|
||||
class InvalidArguments(Exception):
    """Signals that the arguments passed by the caller are missing or invalid."""
|
||||
|
||||
|
||||
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
|
||||
def is_url(url):
|
||||
"""Check to see if a URL has a valid protocol.
|
||||
|
|
@ -66,8 +70,8 @@ def random_string(length):
|
|||
return ret
|
||||
|
||||
|
||||
def download_url(url):
|
||||
"""Download file from specified URL.
|
||||
def get_url_bytes(url):
|
||||
"""Get a stream of bytes for url
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -75,22 +79,21 @@ def download_url(url):
|
|||
|
||||
Returns
|
||||
-------
|
||||
filepath : str or unicode
|
||||
Temporary filepath.
|
||||
file_bytes : io.BytesIO
|
||||
a file-like object that can be read
|
||||
|
||||
"""
|
||||
filename = f"{random_string(6)}.pdf"
|
||||
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
|
||||
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
request = Request(url, None, headers)
|
||||
obj = urlopen(request)
|
||||
content_type = obj.info().get_content_type()
|
||||
if content_type != "application/pdf":
|
||||
raise NotImplementedError("File format not supported")
|
||||
f.write(obj.read())
|
||||
filepath = os.path.join(os.path.dirname(f.name), filename)
|
||||
shutil.move(f.name, filepath)
|
||||
return filepath
|
||||
file_bytes = io.BytesIO()
|
||||
file_bytes.name = url
|
||||
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
request = Request(url, data=None, headers=headers)
|
||||
obj = urlopen(request)
|
||||
content_type = obj.info().get_content_type()
|
||||
if content_type != "application/pdf":
|
||||
raise NotImplementedError("File format not supported")
|
||||
file_bytes.write(obj.read())
|
||||
file_bytes.seek(0)
|
||||
return file_bytes
|
||||
|
||||
|
||||
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
|
@ -172,3 +173,20 @@ def test_handler_pages_generator():
|
|||
|
||||
handler = PDFHandler(filename)
|
||||
assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10]
|
||||
|
||||
|
||||
def test_from_open():
    """read_pdf accepts an already-open binary file handle via `file_bytes`."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    with open(pdf_path, "rb") as handle:
        extracted = camelot.read_pdf(file_bytes=handle)
        # One table is expected on the default first page.
        assert repr(extracted) == "<TableList n=1>"
        first_table = extracted[0]
        assert repr(first_table) == "<Table shape=(7, 7)>"
|
||||
|
||||
def test_from_bytes():
    """read_pdf accepts an in-memory byte stream via `file_bytes`."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    with open(pdf_path, "rb") as source:
        raw = source.read()

    buffer = io.BytesIO()
    # Deliberately leave the cursor at the end of the buffer: rewinding is
    # expected to be done by PDFHandler, not by the caller.
    buffer.write(raw)

    extracted = camelot.read_pdf(file_bytes=buffer)
    assert repr(extracted) == "<TableList n=1>"
    first_table = extracted[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
|
||||
|
|
|
|||
Loading…
Reference in New Issue