diff --git a/camelot/handlers.py b/camelot/handlers.py
index fb8d4b5..61585b6 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -47,9 +47,9 @@ class PDFHandler(object):
             self.password = password
             if sys.version_info[0] < 3:
                 self.password = self.password.encode("ascii")
-        self.pages = self._get_pages(self.filepath, pages)
+        self.pages = self._get_pages(pages)
 
-    def _get_pages(self, filepath, pages):
+    def _get_pages(self, pages):
         """Converts pages string to list of ints.
 
         Parameters
@@ -67,25 +67,28 @@ class PDFHandler(object):
 
         """
         page_numbers = []
+
         if pages == "1":
             page_numbers.append({"start": 1, "end": 1})
         else:
-            instream = open(filepath, "rb")
-            infile = PdfFileReader(instream, strict=False)
-            if infile.isEncrypted:
-                infile.decrypt(self.password)
-            if pages == "all":
-                page_numbers.append({"start": 1, "end": infile.getNumPages()})
-            else:
-                for r in pages.split(","):
-                    if "-" in r:
-                        a, b = r.split("-")
-                        if b == "end":
-                            b = infile.getNumPages()
-                        page_numbers.append({"start": int(a), "end": int(b)})
-                    else:
-                        page_numbers.append({"start": int(r), "end": int(r)})
-            instream.close()
+            with open(self.filepath, "rb") as f:
+                infile = PdfFileReader(f, strict=False)
+
+                if infile.isEncrypted:
+                    infile.decrypt(self.password)
+
+                if pages == "all":
+                    page_numbers.append({"start": 1, "end": infile.getNumPages()})
+                else:
+                    for r in pages.split(","):
+                        if "-" in r:
+                            a, b = r.split("-")
+                            if b == "end":
+                                b = infile.getNumPages()
+                            page_numbers.append({"start": int(a), "end": int(b)})
+                        else:
+                            page_numbers.append({"start": int(r), "end": int(r)})
+
         P = []
         for p in page_numbers:
             P.extend(range(p["start"], p["end"] + 1))
diff --git a/tests/test_common.py b/tests/test_common.py
index 9e07efa..5d0054b 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -8,6 +8,7 @@ import pandas as pd
 from pandas.testing import assert_frame_equal
 
 import camelot
+from camelot.io import PDFHandler
 from camelot.core import Table, TableList
 from camelot.__version__ import generate_version
 from camelot.backends import ImageConversionBackend
@@ -60,9 +61,7 @@ def test_password():
 
 def test_repr_poppler():
     filename = os.path.join(testdir, "foo.pdf")
-    tables = camelot.read_pdf(
-        filename, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
-    )
+    tables = camelot.read_pdf(filename, backend="poppler")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
@@ -71,10 +70,7 @@ def test_repr_poppler():
 @skip_on_windows
 def test_repr_ghostscript():
     filename = os.path.join(testdir, "foo.pdf")
-    tables = camelot.read_pdf(
-        filename,
-        backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
-    )
+    tables = camelot.read_pdf(filename, backend="ghostscript")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
@@ -82,9 +78,7 @@ def test_repr_ghostscript():
 
 def test_url_poppler():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(
-        url, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
-    )
+    tables = camelot.read_pdf(url, backend="poppler")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
@@ -93,9 +87,7 @@ def test_url_poppler():
 @skip_on_windows
 def test_url_ghostscript():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(
-        url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
-    )
+    tables = camelot.read_pdf(url, backend="ghostscript")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
@@ -103,27 +95,17 @@ def test_url_ghostscript():
 
 def test_pages_poppler():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(
-        url, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
-    )
+    tables = camelot.read_pdf(url, backend="poppler")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
 
-    tables = camelot.read_pdf(
-        url,
-        pages="1-end",
-        backend=ImageConversionBackend(backend="poppler", use_fallback=False),
-    )
+    tables = camelot.read_pdf(url, pages="1-end", backend="poppler")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
 
-    tables = camelot.read_pdf(
-        url,
-        pages="all",
-        backend=ImageConversionBackend(backend="poppler", use_fallback=False),
-    )
+    tables = camelot.read_pdf(url, pages="all", backend="poppler")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
@@ -132,27 +114,17 @@ def test_pages_poppler():
 @skip_on_windows
 def test_pages_ghostscript():
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(
-        url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
-    )
+    tables = camelot.read_pdf(url, backend="ghostscript")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
 
-    tables = camelot.read_pdf(
-        url,
-        pages="1-end",
-        backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
-    )
+    tables = camelot.read_pdf(url, pages="1-end", backend="ghostscript")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
 
-    tables = camelot.read_pdf(
-        url,
-        pages="all",
-        backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
-    )
+    tables = camelot.read_pdf(url, pages="all", backend="ghostscript")
     assert repr(tables) == "<TableList n=1>"
     assert repr(tables[0]) == "<Table shape=(7, 7)>"
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
@@ -181,3 +153,22 @@ def test_table_order():
         (1, 2),
         (1, 1),
     ]
+
+
+def test_handler_pages_generator():
+    filename = os.path.join(testdir, "foo.pdf")
+
+    handler = PDFHandler(filename)
+    assert handler._get_pages("1") == [1]
+
+    handler = PDFHandler(filename)
+    assert handler._get_pages("all") == [1]
+
+    handler = PDFHandler(filename)
+    assert handler._get_pages("1-end") == [1]
+
+    handler = PDFHandler(filename)
+    assert handler._get_pages("1,2,3,4") == [1, 2, 3, 4]
+
+    handler = PDFHandler(filename)
+    assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10]