diff --git a/camelot/cli.py b/camelot/cli.py
index 8385450..e30b204 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -27,6 +27,7 @@ pass_config = click.make_pass_decorator(Config)
 @click.version_option(version=__version__)
 @click.option('-p', '--pages', default='1', help='Comma-separated page numbers.'
               ' Example: 1,3,4 or 1,4-end.')
+@click.option('-pw', '--password', help='Password for decryption.')
 @click.option('-o', '--output', help='Output file path.')
 @click.option('-f', '--format',
               type=click.Choice(['csv', 'json', 'excel', 'html']),
diff --git a/camelot/handlers.py b/camelot/handlers.py
index 6820cc7..b6dc65c 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import os
+import sys
 
 from PyPDF2 import PdfFileReader, PdfFileWriter
 
@@ -21,14 +22,22 @@ class PDFHandler(object):
         Path to PDF file.
     pages : str, optional (default: '1')
         Comma-separated page numbers.
-        Example: 1,3,4 or 1,4-end.
+        Example: '1,3,4' or '1,4-end'.
+    password : str, optional (default: None)
+        Password for decryption.
 
     """
-    def __init__(self, filename, pages='1'):
+    def __init__(self, filename, pages='1', password=None):
         self.filename = filename
         if not filename.lower().endswith('.pdf'):
             raise NotImplementedError("File format not supported")
+        if password is None:  # set before _get_pages() reads it
+            self.password = ''
+        else:
+            self.password = password
+            if sys.version_info[0] < 3:
+                self.password = self.password.encode('ascii')
         self.pages = self._get_pages(self.filename, pages)
 
     def _get_pages(self, filename, pages):
         """Converts pages string to list of ints.
@@ -52,6 +61,8 @@ class PDFHandler(object): page_numbers.append({'start': 1, 'end': 1}) else: infile = PdfFileReader(open(filename, 'rb'), strict=False) + if infile.isEncrypted: + infile.decrypt(self.password) if pages == 'all': page_numbers.append({'start': 1, 'end': infile.getNumPages()}) else: @@ -84,7 +95,7 @@ class PDFHandler(object): with open(filename, 'rb') as fileobj: infile = PdfFileReader(fileobj, strict=False) if infile.isEncrypted: - infile.decrypt('') + infile.decrypt(self.password) fpath = os.path.join(temp, 'page-{0}.pdf'.format(page)) froot, fext = os.path.splitext(fpath) p = infile.getPage(page - 1) @@ -103,7 +114,7 @@ class PDFHandler(object): os.rename(fpath, fpath_new) infile = PdfFileReader(open(fpath_new, 'rb'), strict=False) if infile.isEncrypted: - infile.decrypt('') + infile.decrypt(self.password) outfile = PdfFileWriter() p = infile.getPage(0) if rotation == 'anticlockwise': diff --git a/camelot/io.py b/camelot/io.py index 643a2b1..3766a7b 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -5,8 +5,8 @@ from .handlers import PDFHandler from .utils import validate_input, remove_extra -def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False, - **kwargs): +def read_pdf(filepath, pages='1', password=None, flavor='lattice', + suppress_warnings=False, **kwargs): """Read PDF and return extracted tables. Note: kwargs annotated with ^ can only be used with flavor='stream' @@ -19,6 +19,8 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False, pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end'. + password : str, optional (default: None) + Password for decryption. flavor : str (default: 'lattice') The parsing method to use ('lattice' or 'stream'). Lattice is used by default. 
@@ -94,7 +96,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False, warnings.simplefilter("ignore") validate_input(kwargs, flavor=flavor) - p = PDFHandler(filepath, pages) + p = PDFHandler(filepath, pages=pages, password=password) kwargs = remove_extra(kwargs, flavor=flavor) tables = p.parse(flavor=flavor, **kwargs) return tables diff --git a/docs/user/cli.rst b/docs/user/cli.rst index f96ceae..0dd677c 100644 --- a/docs/user/cli.rst +++ b/docs/user/cli.rst @@ -9,26 +9,28 @@ You can print the help for the interface by typing ``camelot --help`` in your fa :: - Usage: camelot [OPTIONS] COMMAND [ARGS]... +Usage: camelot [OPTIONS] COMMAND [ARGS]... Camelot: PDF Table Extraction for Humans - Options: - --version Show the version and exit. - -p, --pages TEXT Comma-separated page numbers. Example: 1,3,4 - or 1,4-end. - -o, --output TEXT Output file path. - -f, --format [csv|json|excel|html] - Output file format. - -z, --zip Create ZIP archive. - -split, --split_text Split text that spans across multiple cells. - -flag, --flag_size Flag text based on font size. Useful to - detect super/subscripts. - -M, --margins ... - PDFMiner char_margin, line_margin and - word_margin. - --help Show this message and exit. +Options: + --version Show the version and exit. + -p, --pages TEXT Comma-separated page numbers. Example: 1,3,4 + or 1,4-end. + -pw, --password TEXT Password for decryption. + -o, --output TEXT Output file path. + -f, --format [csv|json|excel|html] + Output file format. + -z, --zip Create ZIP archive. + -split, --split_text Split text that spans across multiple cells. + -flag, --flag_size Flag text based on font size. Useful to + detect super/subscripts. + -M, --margins ... + PDFMiner char_margin, line_margin and + word_margin. + -q, --quiet Suppress warnings. + --help Show this message and exit. - Commands: - lattice Use lines between text to parse the table. - stream Use spaces between text to parse the table. 
\ No newline at end of file +Commands: + lattice Use lines between text to parse the table. + stream Use spaces between text to parse the table. diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst index f7c2863..5fb5bc0 100644 --- a/docs/user/quickstart.rst +++ b/docs/user/quickstart.rst @@ -87,6 +87,28 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp The ``pages`` keyword argument accepts pages as comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``. ------------------------- +Reading encrypted PDFs +---------------------- -Ready for more? Check out the :ref:`advanced <advanced>` section. \ No newline at end of file +To extract tables from encrypted PDF files you must provide a password when calling :meth:`read_pdf() <camelot.read_pdf>`. + +:: + + >>> tables = camelot.read_pdf('foo.pdf', password='userpass') + >>> tables + + +Currently Camelot only supports PDFs encrypted with ASCII passwords and algorithm `code 1 or 2`_. An exception is thrown if the PDF cannot be read. This may be due to no password being provided, an incorrect password, or an unsupported encryption algorithm. + +Further encryption support may be added in the future; in the meantime, if your PDF files use an unsupported encryption algorithm, you are advised to remove encryption before calling :meth:`read_pdf() <camelot.read_pdf>`. This can be successfully achieved with third-party tools such as `QPDF`_. + +:: + + $ qpdf --password= --decrypt input.pdf output.pdf + +.. _code 1 or 2: https://github.com/mstamy2/PyPDF2/issues/378 +.. _QPDF: https://www.github.com/qpdf/qpdf + +---- + +Ready for more? Check out the :ref:`advanced <advanced>` section.
diff --git a/tests/files/health_protected.pdf b/tests/files/health_protected.pdf new file mode 100644 index 0000000..c5ce080 Binary files /dev/null and b/tests/files/health_protected.pdf differ diff --git a/tests/test_cli.py b/tests/test_cli.py index 3f51f8f..4292b6b 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -52,6 +52,30 @@ def test_cli_stream(): assert format_error in result.output +def test_cli_password(): + with TemporaryDirectory() as tempdir: + infile = os.path.join(testdir, 'health_protected.pdf') + outfile = os.path.join(tempdir, 'health_protected.csv') + runner = CliRunner() + result = runner.invoke(cli, ['--password', 'userpass', + '--format', 'csv', '--output', outfile, + 'stream', infile]) + assert result.exit_code == 0 + assert result.output == 'Found 1 tables\n' + + output_error = 'file has not been decrypted' + # no password + result = runner.invoke(cli, ['--format', 'csv', '--output', outfile, + 'stream', infile]) + assert output_error in str(result.exception) + + # bad password + result = runner.invoke(cli, ['--password', 'wrongpass', + '--format', 'csv', '--output', outfile, + 'stream', infile]) + assert output_error in str(result.exception) + + def test_cli_output_format(): with TemporaryDirectory() as tempdir: infile = os.path.join(testdir, 'health.pdf') @@ -78,7 +102,7 @@ def test_cli_output_format(): 'stream', infile]) assert result.exit_code == 0 -def test_cli_quiet_flag(): +def test_cli_quiet(): with TemporaryDirectory() as tempdir: infile = os.path.join(testdir, 'blank.pdf') outfile = os.path.join(tempdir, 'blank.csv') diff --git a/tests/test_common.py b/tests/test_common.py index 1f599fd..bfd1ea6 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -25,6 +25,17 @@ def test_parsing_report(): assert tables[0].parsing_report == parsing_report +def test_password(): + df = pd.DataFrame(data_stream) + + filename = os.path.join(testdir, "health_protected.pdf") + tables = camelot.read_pdf(filename, password="ownerpass", 
flavor="stream")
+    assert df.equals(tables[0].df)
+
+    tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+    assert df.equals(tables[0].df)
+
+
 def test_stream():
     df = pd.DataFrame(data_stream)
 
diff --git a/tests/test_errors.py b/tests/test_errors.py
index 89db31d..a52aae4 100755
--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@@ -75,3 +75,19 @@ def test_ghostscript_not_found(monkeypatch):
     filename = os.path.join(testdir, 'foo.pdf')
     with pytest.raises(Exception, message=message):
         tables = camelot.read_pdf(filename)
+
+
+def test_no_password():
+    """Reading an encrypted PDF without a password must fail."""
+    filename = os.path.join(testdir, 'health_protected.pdf')
+    message = 'file has not been decrypted'
+    with pytest.raises(Exception, match=message):
+        tables = camelot.read_pdf(filename)
+
+
+def test_bad_password():
+    """Reading an encrypted PDF with a wrong password must fail."""
+    filename = os.path.join(testdir, 'health_protected.pdf')
+    message = 'file has not been decrypted'
+    with pytest.raises(Exception, match=message):
+        tables = camelot.read_pdf(filename, password='wrongpass')