add support for file_bytes argument with managed_file_context()

2021-10-08 00:13:00 -07:00 · 2021-10-08 00:13:00 -07:00 · 3d27547477
parent 644bbe7c6d
commit 3d27547477
4 changed files with 99 additions and 29 deletions
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-

+from contextlib import contextmanager
+import io
 import os
 import sys

@ -8,12 +10,13 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
 from .core import TableList
 from .parsers import Stream, Lattice
 from .utils import (
+    InvalidArguments,
    TemporaryDirectory,
    get_page_layout,
    get_text_objects,
    get_rotation,
    is_url,
-    download_url,
+    get_url_bytes,
 )


@ -24,19 +27,33 @@ class PDFHandler(object):

    Parameters
    ----------
-    filepath : str
-        Filepath or URL of the PDF file.
+    filepath : str | pathlib.Path, optional (default: None)
+        Filepath or URL of the PDF file. Required if file_bytes is not given
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
+    file_bytes : io.IOBase, optional (default: None)
+        A file-like stream. Required if filepath is not given

    """

-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath=None, pages="1", password=None, file_bytes=None):
        if is_url(filepath):
-            filepath = download_url(filepath)
+            file_bytes = get_url_bytes(filepath)
+
+        if not filepath and not file_bytes:
+            raise InvalidArguments('Either `filepath` or `file_bytes` is required')
+        if not filepath:
+            # filepath must either be passed, or taken from the name attribute
+            filepath = getattr(file_bytes, 'name')
+            if not filepath:
+                msg = ('Either pass a `filepath`, or give the '
+                       '`file_bytes` argument a name attribute')
+                raise InvalidArguments(msg)
+        self.file_bytes = file_bytes  # ok to be None
+
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")
@ -49,6 +66,28 @@ class PDFHandler(object):
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(pages)

+    @contextmanager
+    def managed_file_context(self):
+        """Reads from either the `filepath` or `file_bytes`
+        attribute of this instance, to return a file-like object.
+        Only a file opened here from `filepath` is closed on exit or error.
+
+        Yields
+        ------
+        file_bytes : io.IOBase
+            A readable, seekable, file-like object
+        """
+        if self.file_bytes:
+            # if we can't seek, write to a BytesIO object that can,
+            # then seek to the beginning before yielding
+            if not hasattr(self.file_bytes, 'seek'):
+                self.file_bytes = io.BytesIO(self.file_bytes.read())
+            self.file_bytes.seek(0)
+            yield self.file_bytes
+        else:
+            with open(self.filepath, "rb") as file_bytes:
+                yield file_bytes
+
    def _get_pages(self, pages):
        """Converts pages string to list of ints.

@ -71,7 +110,7 @@ class PDFHandler(object):
        if pages == "1":
            page_numbers.append({"start": 1, "end": 1})
        else:
-            with open(self.filepath, "rb") as f:
+            with self.managed_file_context() as f:
                infile = PdfFileReader(f, strict=False)

                if infile.isEncrypted:
@ -107,7 +146,7 @@ class PDFHandler(object):
            Tmp directory.

        """
-        with open(filepath, "rb") as fileobj:
+        with self.managed_file_context() as fileobj:
            infile = PdfFileReader(fileobj, strict=False)
            if infile.isEncrypted:
                infile.decrypt(self.password)
--- a/camelot/io.py
+++ b/camelot/io.py
@ -3,16 +3,21 @@
 import warnings

 from .handlers import PDFHandler
-from .utils import validate_input, remove_extra
+from .utils import (
+    InvalidArguments,
+    validate_input,
+    remove_extra,
+)


 def read_pdf(
-    filepath,
+    filepath=None,
    pages="1",
    password=None,
    flavor="lattice",
    suppress_stdout=False,
    layout_kwargs={},
+    file_bytes=None,
    **kwargs
 ):
    """Read PDF and return extracted tables.
@ -22,8 +27,8 @@ def read_pdf(

    Parameters
    ----------
-    filepath : str
-        Filepath or URL of the PDF file.
+    filepath : str | pathlib.Path, optional (default: None)
+        Filepath or URL of the PDF file. Required if file_bytes is not given
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
@ -34,6 +39,8 @@ def read_pdf(
        Lattice is used by default.
    suppress_stdout : bool, optional (default: True)
        Print all logs and warnings.
+    file_bytes : io.IOBase, optional (default: None)
+        A file-like stream. Required if filepath is not given
    layout_kwargs : dict, optional (default: {})
        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
    table_areas : list, optional (default: None)
@ -103,12 +110,15 @@ def read_pdf(
            "Unknown flavor specified." " Use either 'lattice' or 'stream'"
        )

+    if not filepath and not file_bytes:
+        raise InvalidArguments('Either `filepath` or `file_bytes` is required')
+
    with warnings.catch_warnings():
        if suppress_stdout:
            warnings.simplefilter("ignore")

        validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
        kwargs = remove_extra(kwargs, flavor=flavor)
        tables = p.parse(
            flavor=flavor,
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-import os
+import io
 import re
 import random
 import shutil
@ -36,6 +36,10 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard("")


+class InvalidArguments(Exception):
+    pass
+
+
 # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
 def is_url(url):
    """Check to see if a URL has a valid protocol.
@ -66,8 +70,8 @@ def random_string(length):
    return ret


-def download_url(url):
-    """Download file from specified URL.
+def get_url_bytes(url):
+    """Download the PDF at a URL into an in-memory byte stream.

    Parameters
    ----------
@ -75,22 +79,21 @@ def download_url(url):

    Returns
    -------
-    filepath : str or unicode
-        Temporary filepath.
+    file_bytes : io.BytesIO
+        A file-like object that can be read.

    """
-    filename = f"{random_string(6)}.pdf"
-    with tempfile.NamedTemporaryFile("wb", delete=False) as f:
-        headers = {"User-Agent": "Mozilla/5.0"}
-        request = Request(url, None, headers)
-        obj = urlopen(request)
-        content_type = obj.info().get_content_type()
-        if content_type != "application/pdf":
-            raise NotImplementedError("File format not supported")
-        f.write(obj.read())
-    filepath = os.path.join(os.path.dirname(f.name), filename)
-    shutil.move(f.name, filepath)
-    return filepath
+    file_bytes = io.BytesIO()
+    file_bytes.name = url
+    headers = {"User-Agent": "Mozilla/5.0"}
+    request = Request(url, data=None, headers=headers)
+    obj = urlopen(request)
+    content_type = obj.info().get_content_type()
+    if content_type != "application/pdf":
+        raise NotImplementedError("File format not supported")
+    file_bytes.write(obj.read())
+    file_bytes.seek(0)
+    return file_bytes


 stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-

+import io
 import os
 import sys

@ -172,3 +173,20 @@ def test_handler_pages_generator():

    handler = PDFHandler(filename)
    assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10]
+
+
+def test_from_open():
+    filename = os.path.join(testdir, "foo.pdf")
+    with open(filename, "rb") as file_bytes:
+        tables = camelot.read_pdf(file_bytes=file_bytes)
+        assert repr(tables) == "<TableList n=1>"
+        assert repr(tables[0]) == "<Table shape=(7, 7)>"
+
+def test_from_bytes():
+    filename = os.path.join(testdir, "foo.pdf")
+    file_bytes = io.BytesIO()
+    with open(filename, "rb") as f:
+        file_bytes.write(f.read())  # note that we didn't seek, done by PDFHandler
+    tables = camelot.read_pdf(file_bytes=file_bytes)
+    assert repr(tables) == "<TableList n=1>"
+    assert repr(tables[0]) == "<Table shape=(7, 7)>"