[MRG + 1] Add basic support for encrypted PDF files (#180)
* [MRG] Add basic support for encrypted PDF files Update API and CLI to accept ASCII passwords to decrypt PDFs encrypted by algorithm code 1 or 2 (limited by support from PyPDF2). Update documentation and unit tests accordingly. Example document health_protected.pdf generated as follows: qpdf --encrypt userpass ownerpass 128 -- health.pdf health_protected.pdf Issue #162 * Support encrypted PDF files in python3 Issue #162 * Address review comments Explicitly check passwords for None rather than falsey. Correct read_pdf documentation for Owner/User password. Issue #162 * Correct API documentation changes for consistency Issue #162 * Move error tests from test_common to test_errors Issue #162 * Add qpdf example * Remove password is not None check * Fix merge conflict * Fix pages example pull/2/head
parent
4366313484
commit
429640feea
|
|
@ -27,6 +27,7 @@ pass_config = click.make_pass_decorator(Config)
|
|||
@click.version_option(version=__version__)
|
||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
||||
' Example: 1,3,4 or 1,4-end.')
|
||||
@click.option('-pw', '--password', help='Password for decryption.')
|
||||
@click.option('-o', '--output', help='Output file path.')
|
||||
@click.option('-f', '--format',
|
||||
type=click.Choice(['csv', 'json', 'excel', 'html']),
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||
|
||||
|
|
@ -21,14 +22,22 @@ class PDFHandler(object):
|
|||
Path to PDF file.
|
||||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers.
|
||||
Example: 1,3,4 or 1,4-end.
|
||||
Example: '1,3,4' or '1,4-end'.
|
||||
password : str, optional (default: None)
|
||||
Password for decryption.
|
||||
|
||||
"""
|
||||
def __init__(self, filename, pages='1'):
|
||||
def __init__(self, filename, pages='1', password=None):
|
||||
self.filename = filename
|
||||
if not filename.lower().endswith('.pdf'):
|
||||
raise NotImplementedError("File format not supported")
|
||||
self.pages = self._get_pages(self.filename, pages)
|
||||
if password is None:
|
||||
self.password = ''
|
||||
else:
|
||||
self.password = password
|
||||
if sys.version_info[0] < 3:
|
||||
self.password = self.password.encode('ascii')
|
||||
|
||||
def _get_pages(self, filename, pages):
|
||||
"""Converts pages string to list of ints.
|
||||
|
|
@ -52,6 +61,8 @@ class PDFHandler(object):
|
|||
page_numbers.append({'start': 1, 'end': 1})
|
||||
else:
|
||||
infile = PdfFileReader(open(filename, 'rb'), strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt(self.password)
|
||||
if pages == 'all':
|
||||
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
|
||||
else:
|
||||
|
|
@ -84,7 +95,7 @@ class PDFHandler(object):
|
|||
with open(filename, 'rb') as fileobj:
|
||||
infile = PdfFileReader(fileobj, strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt('')
|
||||
infile.decrypt(self.password)
|
||||
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
||||
froot, fext = os.path.splitext(fpath)
|
||||
p = infile.getPage(page - 1)
|
||||
|
|
@ -103,7 +114,7 @@ class PDFHandler(object):
|
|||
os.rename(fpath, fpath_new)
|
||||
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
||||
if infile.isEncrypted:
|
||||
infile.decrypt('')
|
||||
infile.decrypt(self.password)
|
||||
outfile = PdfFileWriter()
|
||||
p = infile.getPage(0)
|
||||
if rotation == 'anticlockwise':
|
||||
|
|
|
|||
|
|
@ -5,8 +5,8 @@ from .handlers import PDFHandler
|
|||
from .utils import validate_input, remove_extra
|
||||
|
||||
|
||||
def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
|
||||
**kwargs):
|
||||
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||
suppress_warnings=False, **kwargs):
|
||||
"""Read PDF and return extracted tables.
|
||||
|
||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||
|
|
@ -19,6 +19,8 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
|
|||
pages : str, optional (default: '1')
|
||||
Comma-separated page numbers.
|
||||
Example: '1,3,4' or '1,4-end'.
|
||||
password : str, optional (default: None)
|
||||
Password for decryption.
|
||||
flavor : str (default: 'lattice')
|
||||
The parsing method to use ('lattice' or 'stream').
|
||||
Lattice is used by default.
|
||||
|
|
@ -94,7 +96,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
|
|||
warnings.simplefilter("ignore")
|
||||
|
||||
validate_input(kwargs, flavor=flavor)
|
||||
p = PDFHandler(filepath, pages)
|
||||
p = PDFHandler(filepath, pages=pages, password=password)
|
||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||
tables = p.parse(flavor=flavor, **kwargs)
|
||||
return tables
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ You can print the help for the interface by typing ``camelot --help`` in your fa
|
|||
--version Show the version and exit.
|
||||
-p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
|
||||
or 1,4-end.
|
||||
-pw, --password TEXT Password for decryption.
|
||||
-o, --output TEXT Output file path.
|
||||
-f, --format [csv|json|excel|html]
|
||||
Output file format.
|
||||
|
|
@ -27,6 +28,7 @@ You can print the help for the interface by typing ``camelot --help`` in your fa
|
|||
-M, --margins <FLOAT FLOAT FLOAT>...
|
||||
PDFMiner char_margin, line_margin and
|
||||
word_margin.
|
||||
-q, --quiet Suppress warnings.
|
||||
--help Show this message and exit.
|
||||
|
||||
Commands:
|
||||
|
|
|
|||
|
|
@ -87,6 +87,28 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp
|
|||
|
||||
The ``pages`` keyword argument accepts pages as a comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``.
|
||||
|
||||
------------------------
|
||||
Reading encrypted PDFs
|
||||
----------------------
|
||||
|
||||
To extract tables from encrypted PDF files you must provide a password when calling :meth:`read_pdf() <camelot.read_pdf>`.
|
||||
|
||||
::
|
||||
|
||||
>>> tables = camelot.read_pdf('foo.pdf', password='userpass')
|
||||
>>> tables
|
||||
<TableList n=1>
|
||||
|
||||
Currently Camelot only supports PDFs encrypted with ASCII passwords and algorithm `code 1 or 2`_. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm.
|
||||
|
||||
Further encryption support may be added in the future. In the meantime, if your PDF files use an unsupported encryption algorithm, you are advised to remove encryption before calling :meth:`read_pdf() <camelot.read_pdf>`. This can be achieved with third-party tools such as `QPDF`_.
|
||||
|
||||
::
|
||||
|
||||
$ qpdf --password=<PASSWORD> --decrypt input.pdf output.pdf
|
||||
|
||||
.. _code 1 or 2: https://github.com/mstamy2/PyPDF2/issues/378
|
||||
.. _QPDF: https://www.github.com/qpdf/qpdf
|
||||
|
||||
----
|
||||
|
||||
Ready for more? Check out the :ref:`advanced <advanced>` section.
|
||||
Binary file not shown.
|
|
@ -52,6 +52,30 @@ def test_cli_stream():
|
|||
assert format_error in result.output
|
||||
|
||||
|
||||
def test_cli_password():
|
||||
with TemporaryDirectory() as tempdir:
|
||||
infile = os.path.join(testdir, 'health_protected.pdf')
|
||||
outfile = os.path.join(tempdir, 'health_protected.csv')
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ['--password', 'userpass',
|
||||
'--format', 'csv', '--output', outfile,
|
||||
'stream', infile])
|
||||
assert result.exit_code == 0
|
||||
assert result.output == 'Found 1 tables\n'
|
||||
|
||||
output_error = 'file has not been decrypted'
|
||||
# no password
|
||||
result = runner.invoke(cli, ['--format', 'csv', '--output', outfile,
|
||||
'stream', infile])
|
||||
assert output_error in str(result.exception)
|
||||
|
||||
# bad password
|
||||
result = runner.invoke(cli, ['--password', 'wrongpass',
|
||||
'--format', 'csv', '--output', outfile,
|
||||
'stream', infile])
|
||||
assert output_error in str(result.exception)
|
||||
|
||||
|
||||
def test_cli_output_format():
|
||||
with TemporaryDirectory() as tempdir:
|
||||
infile = os.path.join(testdir, 'health.pdf')
|
||||
|
|
@ -78,7 +102,7 @@ def test_cli_output_format():
|
|||
'stream', infile])
|
||||
assert result.exit_code == 0
|
||||
|
||||
def test_cli_quiet_flag():
|
||||
def test_cli_quiet():
|
||||
with TemporaryDirectory() as tempdir:
|
||||
infile = os.path.join(testdir, 'blank.pdf')
|
||||
outfile = os.path.join(tempdir, 'blank.csv')
|
||||
|
|
|
|||
|
|
@ -25,6 +25,17 @@ def test_parsing_report():
|
|||
assert tables[0].parsing_report == parsing_report
|
||||
|
||||
|
||||
def test_password():
|
||||
df = pd.DataFrame(data_stream)
|
||||
|
||||
filename = os.path.join(testdir, "health_protected.pdf")
|
||||
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
|
||||
assert df.equals(tables[0].df)
|
||||
|
||||
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
|
||||
assert df.equals(tables[0].df)
|
||||
|
||||
|
||||
def test_stream():
|
||||
df = pd.DataFrame(data_stream)
|
||||
|
||||
|
|
|
|||
|
|
@ -75,3 +75,17 @@ def test_ghostscript_not_found(monkeypatch):
|
|||
filename = os.path.join(testdir, 'foo.pdf')
|
||||
with pytest.raises(Exception, message=message):
|
||||
tables = camelot.read_pdf(filename)
|
||||
|
||||
|
||||
def test_no_password():
|
||||
filename = os.path.join(testdir, 'health_protected.pdf')
|
||||
message = 'file has not been decrypted'
|
||||
with pytest.raises(Exception, message=message):
|
||||
tables = camelot.read_pdf(filename)
|
||||
|
||||
|
||||
def test_bad_password():
|
||||
filename = os.path.join(testdir, 'health_protected.pdf')
|
||||
message = 'file has not been decrypted'
|
||||
with pytest.raises(Exception, message=message):
|
||||
tables = camelot.read_pdf(filename, password='wrongpass')
|
||||
|
|
|
|||
Loading…
Reference in New Issue