Add test for page list generation and fix backend kwarg

pull/254/head
Vinayak Mehta 2021-07-11 22:47:01 +05:30
parent 8650f25331
commit cc820b9e5d
No known key found for this signature in database
GPG Key ID: 2DE013537A15A9A4
2 changed files with 51 additions and 57 deletions

View File

@ -47,9 +47,9 @@ class PDFHandler(object):
self.password = password self.password = password
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
self.password = self.password.encode("ascii") self.password = self.password.encode("ascii")
self.pages = self._get_pages(self.filepath, pages) self.pages = self._get_pages(pages)
def _get_pages(self, filepath, pages): def _get_pages(self, pages):
"""Converts pages string to list of ints. """Converts pages string to list of ints.
Parameters Parameters
@ -67,13 +67,16 @@ class PDFHandler(object):
""" """
page_numbers = [] page_numbers = []
if pages == "1": if pages == "1":
page_numbers.append({"start": 1, "end": 1}) page_numbers.append({"start": 1, "end": 1})
else: else:
instream = open(filepath, "rb") with open(self.filepath, "rb") as f:
infile = PdfFileReader(instream, strict=False) infile = PdfFileReader(f, strict=False)
if infile.isEncrypted: if infile.isEncrypted:
infile.decrypt(self.password) infile.decrypt(self.password)
if pages == "all": if pages == "all":
page_numbers.append({"start": 1, "end": infile.getNumPages()}) page_numbers.append({"start": 1, "end": infile.getNumPages()})
else: else:
@ -85,7 +88,7 @@ class PDFHandler(object):
page_numbers.append({"start": int(a), "end": int(b)}) page_numbers.append({"start": int(a), "end": int(b)})
else: else:
page_numbers.append({"start": int(r), "end": int(r)}) page_numbers.append({"start": int(r), "end": int(r)})
instream.close()
P = [] P = []
for p in page_numbers: for p in page_numbers:
P.extend(range(p["start"], p["end"] + 1)) P.extend(range(p["start"], p["end"] + 1))

View File

@ -8,6 +8,7 @@ import pandas as pd
from pandas.testing import assert_frame_equal from pandas.testing import assert_frame_equal
import camelot import camelot
from camelot.io import PDFHandler
from camelot.core import Table, TableList from camelot.core import Table, TableList
from camelot.__version__ import generate_version from camelot.__version__ import generate_version
from camelot.backends import ImageConversionBackend from camelot.backends import ImageConversionBackend
@ -60,9 +61,7 @@ def test_password():
def test_repr_poppler(): def test_repr_poppler():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(filename, backend="poppler")
filename, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
@ -71,10 +70,7 @@ def test_repr_poppler():
@skip_on_windows @skip_on_windows
def test_repr_ghostscript(): def test_repr_ghostscript():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf( tables = camelot.read_pdf(filename, backend="ghostscript")
filename,
backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
@ -82,9 +78,7 @@ def test_repr_ghostscript():
def test_url_poppler(): def test_url_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, backend="poppler")
url, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
@ -93,9 +87,7 @@ def test_url_poppler():
@skip_on_windows @skip_on_windows
def test_url_ghostscript(): def test_url_ghostscript():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, backend="ghostscript")
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
@ -103,27 +95,17 @@ def test_url_ghostscript():
def test_pages_poppler(): def test_pages_poppler():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, backend="poppler")
url, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, pages="1-end", backend="poppler")
url,
pages="1-end",
backend=ImageConversionBackend(backend="poppler", use_fallback=False),
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, pages="all", backend="poppler")
url,
pages="all",
backend=ImageConversionBackend(backend="poppler", use_fallback=False),
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
@ -132,27 +114,17 @@ def test_pages_poppler():
@skip_on_windows @skip_on_windows
def test_pages_ghostscript(): def test_pages_ghostscript():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, backend="ghostscript")
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, pages="1-end", backend="ghostscript")
url,
pages="1-end",
backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
tables = camelot.read_pdf( tables = camelot.read_pdf(url, pages="all", backend="ghostscript")
url,
pages="all",
backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
)
assert repr(tables) == "<TableList n=1>" assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>" assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>" assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
@ -181,3 +153,22 @@ def test_table_order():
(1, 2), (1, 2),
(1, 1), (1, 1),
] ]
def test_handler_pages_generator():
    """Check the page lists ``PDFHandler._get_pages`` builds from page specs."""
    filename = os.path.join(testdir, "foo.pdf")

    # Each pair is (pages spec, expected list of page numbers).
    # NOTE(review): "all" and "1-end" expecting [1] suggests foo.pdf has a
    # single page -- confirm against the fixture.
    cases = [
        ("1", [1]),
        ("all", [1]),
        ("1-end", [1]),
        ("1,2,3,4", [1, 2, 3, 4]),
        ("1,2,5-10", [1, 2, 5, 6, 7, 8, 9, 10]),
    ]
    for spec, expected in cases:
        # A fresh handler is constructed for each spec.
        handler = PDFHandler(filename)
        assert handler._get_pages(spec) == expected