Add test for page list generation and fix backend kwarg
parent
8650f25331
commit
cc820b9e5d
|
|
@ -47,9 +47,9 @@ class PDFHandler(object):
|
||||||
self.password = password
|
self.password = password
|
||||||
if sys.version_info[0] < 3:
|
if sys.version_info[0] < 3:
|
||||||
self.password = self.password.encode("ascii")
|
self.password = self.password.encode("ascii")
|
||||||
self.pages = self._get_pages(self.filepath, pages)
|
self.pages = self._get_pages(pages)
|
||||||
|
|
||||||
def _get_pages(self, filepath, pages):
|
def _get_pages(self, pages):
|
||||||
"""Converts pages string to list of ints.
|
"""Converts pages string to list of ints.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -67,25 +67,28 @@ class PDFHandler(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
page_numbers = []
|
page_numbers = []
|
||||||
|
|
||||||
if pages == "1":
|
if pages == "1":
|
||||||
page_numbers.append({"start": 1, "end": 1})
|
page_numbers.append({"start": 1, "end": 1})
|
||||||
else:
|
else:
|
||||||
instream = open(filepath, "rb")
|
with open(self.filepath, "rb") as f:
|
||||||
infile = PdfFileReader(instream, strict=False)
|
infile = PdfFileReader(f, strict=False)
|
||||||
if infile.isEncrypted:
|
|
||||||
infile.decrypt(self.password)
|
if infile.isEncrypted:
|
||||||
if pages == "all":
|
infile.decrypt(self.password)
|
||||||
page_numbers.append({"start": 1, "end": infile.getNumPages()})
|
|
||||||
else:
|
if pages == "all":
|
||||||
for r in pages.split(","):
|
page_numbers.append({"start": 1, "end": infile.getNumPages()})
|
||||||
if "-" in r:
|
else:
|
||||||
a, b = r.split("-")
|
for r in pages.split(","):
|
||||||
if b == "end":
|
if "-" in r:
|
||||||
b = infile.getNumPages()
|
a, b = r.split("-")
|
||||||
page_numbers.append({"start": int(a), "end": int(b)})
|
if b == "end":
|
||||||
else:
|
b = infile.getNumPages()
|
||||||
page_numbers.append({"start": int(r), "end": int(r)})
|
page_numbers.append({"start": int(a), "end": int(b)})
|
||||||
instream.close()
|
else:
|
||||||
|
page_numbers.append({"start": int(r), "end": int(r)})
|
||||||
|
|
||||||
P = []
|
P = []
|
||||||
for p in page_numbers:
|
for p in page_numbers:
|
||||||
P.extend(range(p["start"], p["end"] + 1))
|
P.extend(range(p["start"], p["end"] + 1))
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import pandas as pd
|
||||||
from pandas.testing import assert_frame_equal
|
from pandas.testing import assert_frame_equal
|
||||||
|
|
||||||
import camelot
|
import camelot
|
||||||
|
from camelot.io import PDFHandler
|
||||||
from camelot.core import Table, TableList
|
from camelot.core import Table, TableList
|
||||||
from camelot.__version__ import generate_version
|
from camelot.__version__ import generate_version
|
||||||
from camelot.backends import ImageConversionBackend
|
from camelot.backends import ImageConversionBackend
|
||||||
|
|
@ -60,9 +61,7 @@ def test_password():
|
||||||
|
|
||||||
def test_repr_poppler():
|
def test_repr_poppler():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(filename, backend="poppler")
|
||||||
filename, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
@ -71,10 +70,7 @@ def test_repr_poppler():
|
||||||
@skip_on_windows
|
@skip_on_windows
|
||||||
def test_repr_ghostscript():
|
def test_repr_ghostscript():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(filename, backend="ghostscript")
|
||||||
filename,
|
|
||||||
backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
||||||
|
|
@ -82,9 +78,7 @@ def test_repr_ghostscript():
|
||||||
|
|
||||||
def test_url_poppler():
|
def test_url_poppler():
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, backend="poppler")
|
||||||
url, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
@ -93,9 +87,7 @@ def test_url_poppler():
|
||||||
@skip_on_windows
|
@skip_on_windows
|
||||||
def test_url_ghostscript():
|
def test_url_ghostscript():
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, backend="ghostscript")
|
||||||
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
||||||
|
|
@ -103,27 +95,17 @@ def test_url_ghostscript():
|
||||||
|
|
||||||
def test_pages_poppler():
|
def test_pages_poppler():
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, backend="poppler")
|
||||||
url, backend=ImageConversionBackend(backend="poppler", use_fallback=False)
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, pages="1-end", backend="poppler")
|
||||||
url,
|
|
||||||
pages="1-end",
|
|
||||||
backend=ImageConversionBackend(backend="poppler", use_fallback=False),
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, pages="all", backend="poppler")
|
||||||
url,
|
|
||||||
pages="all",
|
|
||||||
backend=ImageConversionBackend(backend="poppler", use_fallback=False),
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
@ -132,27 +114,17 @@ def test_pages_poppler():
|
||||||
@skip_on_windows
|
@skip_on_windows
|
||||||
def test_pages_ghostscript():
|
def test_pages_ghostscript():
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, backend="ghostscript")
|
||||||
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
||||||
|
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, pages="1-end", backend="ghostscript")
|
||||||
url,
|
|
||||||
pages="1-end",
|
|
||||||
backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
||||||
|
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(url, pages="all", backend="ghostscript")
|
||||||
url,
|
|
||||||
pages="all",
|
|
||||||
backend=ImageConversionBackend(backend="ghostscript", use_fallback=False),
|
|
||||||
)
|
|
||||||
assert repr(tables) == "<TableList n=1>"
|
assert repr(tables) == "<TableList n=1>"
|
||||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
||||||
|
|
@ -181,3 +153,22 @@ def test_table_order():
|
||||||
(1, 2),
|
(1, 2),
|
||||||
(1, 1),
|
(1, 1),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_handler_pages_generator():
|
||||||
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
|
|
||||||
|
handler = PDFHandler(filename)
|
||||||
|
assert handler._get_pages("1") == [1]
|
||||||
|
|
||||||
|
handler = PDFHandler(filename)
|
||||||
|
assert handler._get_pages("all") == [1]
|
||||||
|
|
||||||
|
handler = PDFHandler(filename)
|
||||||
|
assert handler._get_pages("1-end") == [1]
|
||||||
|
|
||||||
|
handler = PDFHandler(filename)
|
||||||
|
assert handler._get_pages("1,2,3,4") == [1, 2, 3, 4]
|
||||||
|
|
||||||
|
handler = PDFHandler(filename)
|
||||||
|
assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue