[MRG + 1] Add basic support for encrypted PDF files (#180)
* [MRG] Add basic support for encrypted PDF files Update API and CLI to accept ASCII passwords to decrypt PDFs encrypted by algorithm code 1 or 2 (limited by support from PyPDF2). Update documentation and unit tests accordingly. Example document health_protected.pdf generated as follows: qpdf --encrypt userpass ownerpass 128 -- health.pdf health_protected.pdf Issue #162 * Support encrypted PDF files in python3 Issue #162 * Address review comments Explicitly check passwords for None rather than falsey. Correct read_pdf documentation for Owner/User password. Issue #162 * Correct API documentation changes for consistency Issue #162 * Move error tests from test_common to test_errors Issue #162 * Add qpdf example * Remove password is not None check * Fix merge conflict * Fix pages example
pull/2/head
parent
4366313484
commit
429640feea
|
|
@ -27,6 +27,7 @@ pass_config = click.make_pass_decorator(Config)
|
||||||
@click.version_option(version=__version__)
|
@click.version_option(version=__version__)
|
||||||
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
|
||||||
' Example: 1,3,4 or 1,4-end.')
|
' Example: 1,3,4 or 1,4-end.')
|
||||||
|
@click.option('-pw', '--password', help='Password for decryption.')
|
||||||
@click.option('-o', '--output', help='Output file path.')
|
@click.option('-o', '--output', help='Output file path.')
|
||||||
@click.option('-f', '--format',
|
@click.option('-f', '--format',
|
||||||
type=click.Choice(['csv', 'json', 'excel', 'html']),
|
type=click.Choice(['csv', 'json', 'excel', 'html']),
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
|
|
@ -21,14 +22,22 @@ class PDFHandler(object):
|
||||||
Path to PDF file.
|
Path to PDF file.
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: 1,3,4 or 1,4-end.
|
Example: '1,3,4' or '1,4-end'.
|
||||||
|
password : str, optional (default: None)
|
||||||
|
Password for decryption.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, pages='1'):
|
def __init__(self, filename, pages='1', password=None):
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
if not filename.lower().endswith('.pdf'):
|
if not filename.lower().endswith('.pdf'):
|
||||||
raise NotImplementedError("File format not supported")
|
raise NotImplementedError("File format not supported")
|
||||||
self.pages = self._get_pages(self.filename, pages)
|
self.pages = self._get_pages(self.filename, pages)
|
||||||
|
if password is None:
|
||||||
|
self.password = ''
|
||||||
|
else:
|
||||||
|
self.password = password
|
||||||
|
if sys.version_info[0] < 3:
|
||||||
|
self.password = self.password.encode('ascii')
|
||||||
|
|
||||||
def _get_pages(self, filename, pages):
|
def _get_pages(self, filename, pages):
|
||||||
"""Converts pages string to list of ints.
|
"""Converts pages string to list of ints.
|
||||||
|
|
@ -52,6 +61,8 @@ class PDFHandler(object):
|
||||||
page_numbers.append({'start': 1, 'end': 1})
|
page_numbers.append({'start': 1, 'end': 1})
|
||||||
else:
|
else:
|
||||||
infile = PdfFileReader(open(filename, 'rb'), strict=False)
|
infile = PdfFileReader(open(filename, 'rb'), strict=False)
|
||||||
|
if infile.isEncrypted:
|
||||||
|
infile.decrypt(self.password)
|
||||||
if pages == 'all':
|
if pages == 'all':
|
||||||
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
|
page_numbers.append({'start': 1, 'end': infile.getNumPages()})
|
||||||
else:
|
else:
|
||||||
|
|
@ -84,7 +95,7 @@ class PDFHandler(object):
|
||||||
with open(filename, 'rb') as fileobj:
|
with open(filename, 'rb') as fileobj:
|
||||||
infile = PdfFileReader(fileobj, strict=False)
|
infile = PdfFileReader(fileobj, strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt('')
|
infile.decrypt(self.password)
|
||||||
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
|
||||||
froot, fext = os.path.splitext(fpath)
|
froot, fext = os.path.splitext(fpath)
|
||||||
p = infile.getPage(page - 1)
|
p = infile.getPage(page - 1)
|
||||||
|
|
@ -103,7 +114,7 @@ class PDFHandler(object):
|
||||||
os.rename(fpath, fpath_new)
|
os.rename(fpath, fpath_new)
|
||||||
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
infile = PdfFileReader(open(fpath_new, 'rb'), strict=False)
|
||||||
if infile.isEncrypted:
|
if infile.isEncrypted:
|
||||||
infile.decrypt('')
|
infile.decrypt(self.password)
|
||||||
outfile = PdfFileWriter()
|
outfile = PdfFileWriter()
|
||||||
p = infile.getPage(0)
|
p = infile.getPage(0)
|
||||||
if rotation == 'anticlockwise':
|
if rotation == 'anticlockwise':
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,8 @@ from .handlers import PDFHandler
|
||||||
from .utils import validate_input, remove_extra
|
from .utils import validate_input, remove_extra
|
||||||
|
|
||||||
|
|
||||||
def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
|
def read_pdf(filepath, pages='1', password=None, flavor='lattice',
|
||||||
**kwargs):
|
suppress_warnings=False, **kwargs):
|
||||||
"""Read PDF and return extracted tables.
|
"""Read PDF and return extracted tables.
|
||||||
|
|
||||||
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
Note: kwargs annotated with ^ can only be used with flavor='stream'
|
||||||
|
|
@ -19,6 +19,8 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
|
||||||
pages : str, optional (default: '1')
|
pages : str, optional (default: '1')
|
||||||
Comma-separated page numbers.
|
Comma-separated page numbers.
|
||||||
Example: '1,3,4' or '1,4-end'.
|
Example: '1,3,4' or '1,4-end'.
|
||||||
|
password : str, optional (default: None)
|
||||||
|
Password for decryption.
|
||||||
flavor : str (default: 'lattice')
|
flavor : str (default: 'lattice')
|
||||||
The parsing method to use ('lattice' or 'stream').
|
The parsing method to use ('lattice' or 'stream').
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
|
|
@ -94,7 +96,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
|
||||||
warnings.simplefilter("ignore")
|
warnings.simplefilter("ignore")
|
||||||
|
|
||||||
validate_input(kwargs, flavor=flavor)
|
validate_input(kwargs, flavor=flavor)
|
||||||
p = PDFHandler(filepath, pages)
|
p = PDFHandler(filepath, pages=pages, password=password)
|
||||||
kwargs = remove_extra(kwargs, flavor=flavor)
|
kwargs = remove_extra(kwargs, flavor=flavor)
|
||||||
tables = p.parse(flavor=flavor, **kwargs)
|
tables = p.parse(flavor=flavor, **kwargs)
|
||||||
return tables
|
return tables
|
||||||
|
|
|
||||||
|
|
@ -9,14 +9,15 @@ You can print the help for the interface by typing ``camelot --help`` in your fa
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
Usage: camelot [OPTIONS] COMMAND [ARGS]...
|
Usage: camelot [OPTIONS] COMMAND [ARGS]...
|
||||||
|
|
||||||
Camelot: PDF Table Extraction for Humans
|
Camelot: PDF Table Extraction for Humans
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--version Show the version and exit.
|
--version Show the version and exit.
|
||||||
-p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
|
-p, --pages TEXT Comma-separated page numbers. Example: 1,3,4
|
||||||
or 1,4-end.
|
or 1,4-end.
|
||||||
|
-pw, --password TEXT Password for decryption.
|
||||||
-o, --output TEXT Output file path.
|
-o, --output TEXT Output file path.
|
||||||
-f, --format [csv|json|excel|html]
|
-f, --format [csv|json|excel|html]
|
||||||
Output file format.
|
Output file format.
|
||||||
|
|
@ -27,8 +28,9 @@ You can print the help for the interface by typing ``camelot --help`` in your fa
|
||||||
-M, --margins <FLOAT FLOAT FLOAT>...
|
-M, --margins <FLOAT FLOAT FLOAT>...
|
||||||
PDFMiner char_margin, line_margin and
|
PDFMiner char_margin, line_margin and
|
||||||
word_margin.
|
word_margin.
|
||||||
|
-q, --quiet Suppress warnings.
|
||||||
--help Show this message and exit.
|
--help Show this message and exit.
|
||||||
|
|
||||||
Commands:
|
Commands:
|
||||||
lattice Use lines between text to parse the table.
|
lattice Use lines between text to parse the table.
|
||||||
stream Use spaces between text to parse the table.
|
stream Use spaces between text to parse the table.
|
||||||
|
|
@ -87,6 +87,28 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp
|
||||||
|
|
||||||
The ``pages`` keyword argument accepts pages as a comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``.
|
The ``pages`` keyword argument accepts pages as a comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``.
|
||||||
|
|
||||||
------------------------
|
Reading encrypted PDFs
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
To extract tables from encrypted PDF files you must provide a password when calling :meth:`read_pdf() <camelot.read_pdf>`.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
>>> tables = camelot.read_pdf('foo.pdf', password='userpass')
|
||||||
|
>>> tables
|
||||||
|
<TableList n=1>
|
||||||
|
|
||||||
|
Currently Camelot only supports PDFs encrypted with ASCII passwords and algorithm `code 1 or 2`_. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm.
|
||||||
|
|
||||||
|
Further encryption support may be added in the future. In the meantime, if your PDF files use unsupported encryption algorithms, you are advised to remove encryption before calling :meth:`read_pdf() <camelot.read_pdf>`. This can be achieved with third-party tools such as `QPDF`_.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
$ qpdf --password=<PASSWORD> --decrypt input.pdf output.pdf
|
||||||
|
|
||||||
|
.. _code 1 or 2: https://github.com/mstamy2/PyPDF2/issues/378
|
||||||
|
.. _QPDF: https://www.github.com/qpdf/qpdf
|
||||||
|
|
||||||
|
----
|
||||||
|
|
||||||
Ready for more? Check out the :ref:`advanced <advanced>` section.
|
Ready for more? Check out the :ref:`advanced <advanced>` section.
|
||||||
Binary file not shown.
|
|
@ -52,6 +52,30 @@ def test_cli_stream():
|
||||||
assert format_error in result.output
|
assert format_error in result.output
|
||||||
|
|
||||||
|
|
||||||
|
def test_cli_password():
|
||||||
|
with TemporaryDirectory() as tempdir:
|
||||||
|
infile = os.path.join(testdir, 'health_protected.pdf')
|
||||||
|
outfile = os.path.join(tempdir, 'health_protected.csv')
|
||||||
|
runner = CliRunner()
|
||||||
|
result = runner.invoke(cli, ['--password', 'userpass',
|
||||||
|
'--format', 'csv', '--output', outfile,
|
||||||
|
'stream', infile])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert result.output == 'Found 1 tables\n'
|
||||||
|
|
||||||
|
output_error = 'file has not been decrypted'
|
||||||
|
# no password
|
||||||
|
result = runner.invoke(cli, ['--format', 'csv', '--output', outfile,
|
||||||
|
'stream', infile])
|
||||||
|
assert output_error in str(result.exception)
|
||||||
|
|
||||||
|
# bad password
|
||||||
|
result = runner.invoke(cli, ['--password', 'wrongpass',
|
||||||
|
'--format', 'csv', '--output', outfile,
|
||||||
|
'stream', infile])
|
||||||
|
assert output_error in str(result.exception)
|
||||||
|
|
||||||
|
|
||||||
def test_cli_output_format():
|
def test_cli_output_format():
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
infile = os.path.join(testdir, 'health.pdf')
|
infile = os.path.join(testdir, 'health.pdf')
|
||||||
|
|
@ -78,7 +102,7 @@ def test_cli_output_format():
|
||||||
'stream', infile])
|
'stream', infile])
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
|
|
||||||
def test_cli_quiet_flag():
|
def test_cli_quiet():
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
infile = os.path.join(testdir, 'blank.pdf')
|
infile = os.path.join(testdir, 'blank.pdf')
|
||||||
outfile = os.path.join(tempdir, 'blank.csv')
|
outfile = os.path.join(tempdir, 'blank.csv')
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,17 @@ def test_parsing_report():
|
||||||
assert tables[0].parsing_report == parsing_report
|
assert tables[0].parsing_report == parsing_report
|
||||||
|
|
||||||
|
|
||||||
|
def test_password():
|
||||||
|
df = pd.DataFrame(data_stream)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "health_protected.pdf")
|
||||||
|
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
|
||||||
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
|
||||||
|
assert df.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_stream():
|
def test_stream():
|
||||||
df = pd.DataFrame(data_stream)
|
df = pd.DataFrame(data_stream)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -75,3 +75,17 @@ def test_ghostscript_not_found(monkeypatch):
|
||||||
filename = os.path.join(testdir, 'foo.pdf')
|
filename = os.path.join(testdir, 'foo.pdf')
|
||||||
with pytest.raises(Exception, message=message):
|
with pytest.raises(Exception, message=message):
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_password():
|
||||||
|
filename = os.path.join(testdir, 'health_protected.pdf')
|
||||||
|
message = 'file has not been decrypted'
|
||||||
|
with pytest.raises(Exception, message=message):
|
||||||
|
tables = camelot.read_pdf(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bad_password():
|
||||||
|
filename = os.path.join(testdir, 'health_protected.pdf')
|
||||||
|
message = 'file has not been decrypted'
|
||||||
|
with pytest.raises(Exception, message=message):
|
||||||
|
tables = camelot.read_pdf(filename, password='wrongpass')
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue