pull/270/merge
Chris Scanlin 2021-10-08 07:17:07 +00:00 committed by GitHub
commit b9d21f5abf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 99 additions and 29 deletions

View File

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from contextlib import contextmanager
import io
import os import os
import sys import sys
@ -8,12 +10,13 @@ from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList from .core import TableList
from .parsers import Stream, Lattice from .parsers import Stream, Lattice
from .utils import ( from .utils import (
InvalidArguments,
TemporaryDirectory, TemporaryDirectory,
get_page_layout, get_page_layout,
get_text_objects, get_text_objects,
get_rotation, get_rotation,
is_url, is_url,
download_url, get_url_bytes,
) )
@ -24,19 +27,33 @@ class PDFHandler(object):
Parameters Parameters
---------- ----------
filepath : str filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'. Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None) password : str, optional (default: None)
Password for decryption. Password for decryption.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
""" """
def __init__(self, filepath, pages="1", password=None): def __init__(self, filepath=None, pages="1", password=None, file_bytes=None):
if is_url(filepath): if is_url(filepath):
filepath = download_url(filepath) file_bytes = get_url_bytes(filepath)
if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
if not filepath:
# filepath must either be passed, or taken from the name attribute
filepath = getattr(file_bytes, 'name')
if not filepath:
msg = ('Either pass a `filepath`, or give the '
'`file_bytes` argument a name attribute')
raise InvalidArguments(msg)
self.file_bytes = file_bytes # ok to be None
self.filepath = filepath self.filepath = filepath
if not filepath.lower().endswith(".pdf"): if not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported") raise NotImplementedError("File format not supported")
@ -49,6 +66,28 @@ class PDFHandler(object):
self.password = self.password.encode("ascii") self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages) self.pages = self._get_pages(pages)
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this handler's PDF.

    Reads from the `file_bytes` attribute when it is set, otherwise
    opens `filepath` in binary mode.

    A file opened here from `filepath` is closed on exit or error.
    A caller-supplied `file_bytes` stream is never closed here; it is
    only rewound to the beginning before being yielded.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0.

    """
    if self.file_bytes:
        # Every io.IOBase object has a `seek` attribute, even streams
        # that cannot seek (pipes, sockets, HTTP responses), so ask
        # `seekable()` when it exists and only fall back to attribute
        # detection for duck-typed objects that lack it.
        seekable = getattr(self.file_bytes, "seekable", None)
        if callable(seekable):
            can_seek = seekable()
        else:
            can_seek = hasattr(self.file_bytes, "seek")
        if not can_seek:
            # Buffer the whole stream in memory so it can be rewound
            # and re-read on later calls.
            self.file_bytes = io.BytesIO(self.file_bytes.read())
        self.file_bytes.seek(0)
        yield self.file_bytes
    else:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes
def _get_pages(self, pages): def _get_pages(self, pages):
"""Converts pages string to list of ints. """Converts pages string to list of ints.
@ -71,7 +110,7 @@ class PDFHandler(object):
if pages == "1": if pages == "1":
page_numbers.append({"start": 1, "end": 1}) page_numbers.append({"start": 1, "end": 1})
else: else:
with open(self.filepath, "rb") as f: with self.managed_file_context() as f:
infile = PdfFileReader(f, strict=False) infile = PdfFileReader(f, strict=False)
if infile.isEncrypted: if infile.isEncrypted:
@ -107,7 +146,7 @@ class PDFHandler(object):
Tmp directory. Tmp directory.
""" """
with open(filepath, "rb") as fileobj: with self.managed_file_context() as fileobj:
infile = PdfFileReader(fileobj, strict=False) infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)

View File

@ -3,16 +3,21 @@
import warnings import warnings
from .handlers import PDFHandler from .handlers import PDFHandler
from .utils import validate_input, remove_extra from .utils import (
InvalidArguments,
validate_input,
remove_extra,
)
def read_pdf( def read_pdf(
filepath, filepath=None,
pages="1", pages="1",
password=None, password=None,
flavor="lattice", flavor="lattice",
suppress_stdout=False, suppress_stdout=False,
layout_kwargs={}, layout_kwargs={},
file_bytes=None,
**kwargs **kwargs
): ):
"""Read PDF and return extracted tables. """Read PDF and return extracted tables.
@ -22,8 +27,8 @@ def read_pdf(
Parameters Parameters
---------- ----------
filepath : str filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1') pages : str, optional (default: '1')
Comma-separated page numbers. Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'. Example: '1,3,4' or '1,4-end' or 'all'.
@ -34,6 +39,8 @@ def read_pdf(
Lattice is used by default. Lattice is used by default.
suppress_stdout : bool, optional (default: False) suppress_stdout : bool, optional (default: False)
Suppress warnings instead of printing them. Suppress warnings instead of printing them.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
layout_kwargs : dict, optional (default: {}) layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
table_areas : list, optional (default: None) table_areas : list, optional (default: None)
@ -103,12 +110,15 @@ def read_pdf(
"Unknown flavor specified." " Use either 'lattice' or 'stream'" "Unknown flavor specified." " Use either 'lattice' or 'stream'"
) )
if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
with warnings.catch_warnings(): with warnings.catch_warnings():
if suppress_stdout: if suppress_stdout:
warnings.simplefilter("ignore") warnings.simplefilter("ignore")
validate_input(kwargs, flavor=flavor) validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password) p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
kwargs = remove_extra(kwargs, flavor=flavor) kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse( tables = p.parse(
flavor=flavor, flavor=flavor,

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import io
import re import re
import random import random
import shutil import shutil
@ -36,6 +36,10 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("") _VALID_URLS.discard("")
class InvalidArguments(Exception):
    """Raised when a required combination of arguments is missing.

    For example, when neither `filepath` nor `file_bytes` is supplied.
    """
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url): def is_url(url):
"""Check to see if a URL has a valid protocol. """Check to see if a URL has a valid protocol.
@ -66,8 +70,8 @@ def random_string(length):
return ret return ret
def download_url(url): def get_url_bytes(url):
"""Download file from specified URL. """Get a stream of bytes for url
Parameters Parameters
---------- ----------
@ -75,22 +79,21 @@ def download_url(url):
Returns Returns
------- -------
filepath : str or unicode file_bytes : io.BytesIO
Temporary filepath. a file-like object that can be read
""" """
filename = f"{random_string(6)}.pdf" file_bytes = io.BytesIO()
with tempfile.NamedTemporaryFile("wb", delete=False) as f: file_bytes.name = url
headers = {"User-Agent": "Mozilla/5.0"} headers = {"User-Agent": "Mozilla/5.0"}
request = Request(url, None, headers) request = Request(url, data=None, headers=headers)
obj = urlopen(request) obj = urlopen(request)
content_type = obj.info().get_content_type() content_type = obj.info().get_content_type()
if content_type != "application/pdf": if content_type != "application/pdf":
raise NotImplementedError("File format not supported") raise NotImplementedError("File format not supported")
f.write(obj.read()) file_bytes.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename) file_bytes.seek(0)
shutil.move(f.name, filepath) return file_bytes
return filepath
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import io
import os import os
import sys import sys
@ -172,3 +173,20 @@ def test_handler_pages_generator():
handler = PDFHandler(filename) handler = PDFHandler(filename)
assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10] assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10]
def test_from_open():
    """Read tables from an already-open binary file handle."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    with open(pdf_path, "rb") as handle:
        result = camelot.read_pdf(file_bytes=handle)
        assert repr(result) == "<TableList n=1>"
        assert repr(result[0]) == "<Table shape=(7, 7)>"
def test_from_bytes():
    """Read tables from an in-memory buffer that has not been rewound."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    buffer = io.BytesIO()
    with open(pdf_path, "rb") as src:
        # Deliberately leave the cursor at the end of the buffer:
        # seeking back to the start is done by PDFHandler.
        buffer.write(src.read())
    result = camelot.read_pdf(file_bytes=buffer)
    assert repr(result) == "<TableList n=1>"
    assert repr(result[0]) == "<Table shape=(7, 7)>"