[MRG + 1] Add basic support for encrypted PDF files (#180)

* [MRG] Add basic support for encrypted PDF files

Update API and CLI to accept ASCII passwords to decrypt PDFs
encrypted by algorithm code 1 or 2 (limited by support from PyPDF2).
Update documentation and unit tests accordingly.

Example document health_protected.pdf generated as follows:
qpdf --encrypt userpass ownerpass 128 -- health.pdf health_protected.pdf

Issue #162

* Support encrypted PDF files in python3

Issue #162

* Address review comments

Explicitly check passwords for None rather than falsey.
Correct read_pdf documentation for Owner/User password.

Issue #162

* Correct API documentation changes for consistency

Issue #162

* Move error tests from test_common to test_errors

Issue #162

* Add qpdf example

* Remove password is not None check

* Fix merge conflict

* Fix pages example
pull/2/head
rbares 2018-10-28 16:31:10 +00:00 committed by Vinayak Mehta
parent 4366313484
commit 429640feea
9 changed files with 116 additions and 29 deletions

View File

@ -27,6 +27,7 @@ pass_config = click.make_pass_decorator(Config)
@click.version_option(version=__version__)
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
' Example: 1,3,4 or 1,4-end.')
@click.option('-pw', '--password', help='Password for decryption.')
@click.option('-o', '--output', help='Output file path.')
@click.option('-f', '--format',
type=click.Choice(['csv', 'json', 'excel', 'html']),

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import os
import sys
from PyPDF2 import PdfFileReader, PdfFileWriter
@ -21,14 +22,22 @@ class PDFHandler(object):
Path to PDF file.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: 1,3,4 or 1,4-end.
Example: '1,3,4' or '1,4-end'.
password : str, optional (default: None)
Password for decryption.
"""
def __init__(self, filename, pages='1'):
def __init__(self, filename, pages='1', password=None):
self.filename = filename
if not filename.lower().endswith('.pdf'):
raise NotImplementedError("File format not supported")
self.pages = self._get_pages(self.filename, pages)
if password is None:
self.password = ''
else:
self.password = password
if sys.version_info[0] < 3:
self.password = self.password.encode('ascii')
def _get_pages(self, filename, pages):
"""Converts pages string to list of ints.
@ -52,6 +61,8 @@ class PDFHandler(object):
page_numbers.append({'start': 1, 'end': 1})
else:
infile = PdfFileReader(open(filename, 'rb'), strict=False)
if infile.isEncrypted:
infile.decrypt(self.password)
if pages == 'all':
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
else:
@ -84,7 +95,7 @@ class PDFHandler(object):
with open(filename, 'rb') as fileobj:
infile = PdfFileReader(fileobj, strict=False)
if infile.isEncrypted:
infile.decrypt('')
infile.decrypt(self.password)
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
froot, fext = os.path.splitext(fpath)
p = infile.getPage(page - 1)
@ -103,7 +114,7 @@ class PDFHandler(object):
os.rename(fpath, fpath_new)
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
if infile.isEncrypted:
infile.decrypt('')
infile.decrypt(self.password)
outfile = PdfFileWriter()
p = infile.getPage(0)
if rotation == 'anticlockwise':

View File

@ -5,8 +5,8 @@ from .handlers import PDFHandler
from .utils import validate_input, remove_extra
def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
**kwargs):
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
suppress_warnings=False, **kwargs):
"""Read PDF and return extracted tables.
Note: kwargs annotated with ^ can only be used with flavor='stream'
@ -19,6 +19,8 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end'.
password : str, optional (default: None)
Password for decryption.
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
Lattice is used by default.
@ -94,7 +96,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
warnings.simplefilter("ignore")
validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages)
p = PDFHandler(filepath, pages=pages, password=password)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(flavor=flavor, **kwargs)
return tables

View File

@ -9,14 +9,15 @@ You can print the help for the interface by typing ``camelot --help`` in your fa
::
Usage: camelot [OPTIONS] COMMAND [ARGS]...
Usage: camelot [OPTIONS] COMMAND [ARGS]...
Camelot: PDF Table Extraction for Humans
Options:
Options:
--version Show the version and exit.
-p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
or 1,4-end.
-pw, --password TEXT Password for decryption.
-o, --output TEXT Output file path.
-f, --format [csv|json|excel|html]
Output file format.
@ -27,8 +28,9 @@ You can print the help for the interface by typing ``camelot --help`` in your fa
-M, --margins <FLOAT FLOAT FLOAT>...
PDFMiner char_margin, line_margin and
word_margin.
-q, --quiet Suppress warnings.
--help Show this message and exit.
Commands:
Commands:
lattice Use lines between text to parse the table.
stream Use spaces between text to parse the table.

View File

@ -87,6 +87,28 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp
The ``pages`` keyword argument accepts pages as comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``.
------------------------
Reading encrypted PDFs
----------------------
To extract tables from encrypted PDF files you must provide a password when calling :meth:`read_pdf() <camelot.read_pdf>`.
::
>>> tables = camelot.read_pdf('foo.pdf', password='userpass')
>>> tables
<TableList n=1>
Currently Camelot only supports PDFs encrypted with ASCII passwords and algorithm `code 1 or 2`_. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm.
Further encryption support may be added in future, however in the meantime if your PDF files are using unsupported encryption algorithms you are advised to remove encryption before calling :meth:`read_pdf() <camelot.read_pdf>`. This can been successfully achieved with third-party tools such as `QPDF`_.
::
$ qpdf --password=<PASSWORD> --decrypt input.pdf output.pdf
.. _code 1 or 2: https://github.com/mstamy2/PyPDF2/issues/378
.. _QPDF: https://www.github.com/qpdf/qpdf
----
Ready for more? Check out the :ref:`advanced <advanced>` section.

Binary file not shown.

View File

@ -52,6 +52,30 @@ def test_cli_stream():
assert format_error in result.output
def test_cli_password():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, 'health_protected.pdf')
outfile = os.path.join(tempdir, 'health_protected.csv')
runner = CliRunner()
result = runner.invoke(cli, ['--password', 'userpass',
'--format', 'csv', '--output', outfile,
'stream', infile])
assert result.exit_code == 0
assert result.output == 'Found 1 tables\n'
output_error = 'file has not been decrypted'
# no password
result = runner.invoke(cli, ['--format', 'csv', '--output', outfile,
'stream', infile])
assert output_error in str(result.exception)
# bad password
result = runner.invoke(cli, ['--password', 'wrongpass',
'--format', 'csv', '--output', outfile,
'stream', infile])
assert output_error in str(result.exception)
def test_cli_output_format():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, 'health.pdf')
@ -78,7 +102,7 @@ def test_cli_output_format():
'stream', infile])
assert result.exit_code == 0
def test_cli_quiet_flag():
def test_cli_quiet():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, 'blank.pdf')
outfile = os.path.join(tempdir, 'blank.csv')

View File

@ -25,6 +25,17 @@ def test_parsing_report():
assert tables[0].parsing_report == parsing_report
def test_password():
df = pd.DataFrame(data_stream)
filename = os.path.join(testdir, "health_protected.pdf")
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
assert df.equals(tables[0].df)
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
assert df.equals(tables[0].df)
def test_stream():
df = pd.DataFrame(data_stream)

View File

@ -75,3 +75,17 @@ def test_ghostscript_not_found(monkeypatch):
filename = os.path.join(testdir, 'foo.pdf')
with pytest.raises(Exception, message=message):
tables = camelot.read_pdf(filename)
def test_no_password():
filename = os.path.join(testdir, 'health_protected.pdf')
message = 'file has not been decrypted'
with pytest.raises(Exception, message=message):
tables = camelot.read_pdf(filename)
def test_bad_password():
filename = os.path.join(testdir, 'health_protected.pdf')
message = 'file has not been decrypted'
with pytest.raises(Exception, message=message):
tables = camelot.read_pdf(filename, password='wrongpass')