Make ghostscript default backend and add support for string keywords

pull/253/head
Vinayak Mehta 2021-07-11 17:25:56 +05:30
parent f43235934b
commit 8abe02528b
No known key found for this signature in database
GPG Key ID: 2DE013537A15A9A4
8 changed files with 135 additions and 92 deletions

View File

@ -29,8 +29,8 @@ class GhostscriptBackend(object):
def convert(self, pdf_path, png_path, resolution=300):
if not self.installed():
raise OSError(
"Ghostscript is not installed. Please install it using the instructions"
"here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
"Ghostscript is not installed. You can install it using the instructions"
" here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
)
import ghostscript

View File

@ -3,21 +3,21 @@
from .poppler_backend import PopplerBackend
from .ghostscript_backend import GhostscriptBackend
backends = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
class ImageConversionBackend(object):
def __init__(self, backend="poppler", use_fallback=True):
if backend not in backends.keys():
if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend '{backend}' not supported")
self.backend = backend
self.use_fallback = use_fallback
self.fallbacks = list(filter(lambda x: x != backend, backends.keys()))
self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
def convert(self, pdf_path, png_path):
try:
converter = backends[self.backend]()
converter = BACKENDS[self.backend]()
converter.convert(pdf_path, png_path)
except Exception as e:
import sys
@ -25,7 +25,7 @@ class ImageConversionBackend(object):
if self.use_fallback:
for fallback in self.fallbacks:
try:
converter = backends[fallback]()
converter = BACKENDS[fallback]()
converter.convert(pdf_path, png_path)
except Exception as e:
raise type(e)(

View File

@ -9,7 +9,7 @@ class PopplerBackend(object):
pdftopng_executable = shutil.which("pdftopng")
if pdftopng_executable is None:
raise OSError(
"pdftopng is not installed. Please install it using the `pip install pdftopng` command."
"pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
)
pdftopng_command = [pdftopng_executable, pdf_path, png_path]

View File

@ -28,7 +28,7 @@ from ..image_processing import (
find_contours,
find_joints,
)
from ..backends import ImageConversionBackend
from ..backends.image_conversion import BACKENDS
logger = logging.getLogger("camelot")
@ -111,7 +111,7 @@ class Lattice(BaseParser):
threshold_constant=-2,
iterations=0,
resolution=300,
backend=ImageConversionBackend(),
backend="ghostscript",
**kwargs,
):
self.table_regions = table_regions
@ -129,7 +129,30 @@ class Lattice(BaseParser):
self.threshold_constant = threshold_constant
self.iterations = iterations
self.resolution = resolution
self.backend = backend
self.backend = Lattice._get_backend(backend)
@staticmethod
def _get_backend(backend):
    """Resolve the ``backend`` argument into an image conversion backend.

    Parameters
    ----------
    backend : str or object
        Either a key of ``BACKENDS`` or an object exposing a callable
        ``convert`` attribute.

    Returns
    -------
    object
        A new instance of the class registered in ``BACKENDS`` under the
        given name, or ``backend`` itself when an object was supplied.

    Raises
    ------
    NotImplementedError
        If a string is given that is not a key of ``BACKENDS``, or if an
        object is given that has no callable ``convert`` attribute.

    """
    if isinstance(backend, str):
        if backend not in BACKENDS:
            raise NotImplementedError(
                f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'."
            )
        return BACKENDS[backend]()

    # Probe the attribute directly rather than scanning dir(): this rejects a
    # non-callable ``convert`` up front instead of failing later when it is
    # called, and it also honours attributes that dir() does not list.
    if not callable(getattr(backend, "convert", None)):
        raise NotImplementedError(
            f"'{backend}' must implement a 'convert' method"
        )
    return backend
@staticmethod
def _reduce_index(t, idx, shift_text):

View File

@ -644,8 +644,5 @@ When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://
In case you want to be explicit about the image conversion backend that Camelot should use, you can supply it like this::
>>> from camelot.backends.poppler_backend import PopplerBackend
>>> from camelot.backends.ghostscript_backend import GhostscriptBackend
>>>
>>> tables = camelot.read_pdf(filename, backend=PopplerBackend())
>>> tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
>>> tables = camelot.read_pdf(filename, backend="poppler")
>>> tables = camelot.read_pdf(filename, backend="ghostscript")

View File

@ -32,56 +32,6 @@ def test_unsupported_format():
tables = camelot.read_pdf(filename)
def test_stream_equal_length():
message = "Length of table_areas and columns" " should be equal"
with pytest.raises(ValueError, match=message):
tables = camelot.read_pdf(
filename,
flavor="stream",
table_areas=["10,20,30,40"],
columns=["10,20,30,40", "10,20,30,40"],
)
def test_image_warning():
filename = os.path.join(testdir, "image.pdf")
with warnings.catch_warnings():
warnings.simplefilter("error")
with pytest.raises(UserWarning) as e:
tables = camelot.read_pdf(filename)
assert (
str(e.value)
== "page-1 is image-based, camelot only works on text-based pages."
)
def test_lattice_no_tables_on_page():
filename = os.path.join(testdir, "empty.pdf")
with warnings.catch_warnings():
warnings.simplefilter("error")
with pytest.raises(UserWarning) as e:
tables = camelot.read_pdf(filename, flavor="lattice")
assert str(e.value) == "No tables found on page-1"
def test_stream_no_tables_on_page():
filename = os.path.join(testdir, "empty.pdf")
with warnings.catch_warnings():
warnings.simplefilter("error")
with pytest.raises(UserWarning) as e:
tables = camelot.read_pdf(filename, flavor="stream")
assert str(e.value) == "No tables found on page-1"
def test_stream_no_tables_in_area():
filename = os.path.join(testdir, "only_page_number.pdf")
with warnings.catch_warnings():
warnings.simplefilter("error")
with pytest.raises(UserWarning) as e:
tables = camelot.read_pdf(filename, flavor="stream")
assert str(e.value) == "No tables found in table area 1"
def test_no_tables_found_logs_suppressed():
filename = os.path.join(testdir, "foo.pdf")
with warnings.catch_warnings():
@ -118,3 +68,68 @@ def test_bad_password():
message = "file has not been decrypted"
with pytest.raises(Exception, match=message):
tables = camelot.read_pdf(filename, password="wrongpass")
def test_stream_equal_length():
    """Stream flavor rejects ``table_areas`` and ``columns`` of unequal length."""
    expected = "Length of table_areas and columns should be equal"
    areas = ["10,20,30,40"]
    column_specs = ["10,20,30,40", "10,20,30,40"]
    with pytest.raises(ValueError, match=expected):
        camelot.read_pdf(
            filename, flavor="stream", table_areas=areas, columns=column_specs
        )
def test_image_warning():
    """Reading ``image.pdf`` emits the image-based-page UserWarning."""
    expected = "page-1 is image-based, camelot only works on text-based pages."
    filename = os.path.join(testdir, "image.pdf")
    with warnings.catch_warnings():
        # Escalate warnings to exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(filename)
        assert str(excinfo.value) == expected
def test_stream_no_tables_on_page():
    """Stream flavor on ``empty.pdf`` warns that page-1 has no tables."""
    expected = "No tables found on page-1"
    filename = os.path.join(testdir, "empty.pdf")
    with warnings.catch_warnings():
        # Escalate warnings to exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(filename, flavor="stream")
        assert str(excinfo.value) == expected
def test_stream_no_tables_in_area():
    """Stream flavor on ``only_page_number.pdf`` warns about table area 1."""
    expected = "No tables found in table area 1"
    filename = os.path.join(testdir, "only_page_number.pdf")
    with warnings.catch_warnings():
        # Escalate warnings to exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(filename, flavor="stream")
        assert str(excinfo.value) == expected
def test_lattice_no_tables_on_page():
    """Lattice flavor on ``empty.pdf`` warns that page-1 has no tables."""
    expected = "No tables found on page-1"
    filename = os.path.join(testdir, "empty.pdf")
    with warnings.catch_warnings():
        # Escalate warnings to exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(filename, flavor="lattice")
        assert str(excinfo.value) == expected
def test_lattice_unknown_backend():
    """An unregistered backend name passed to ``read_pdf`` raises NotImplementedError."""
    import re

    message = "Unknown backend 'mupdf' specified. Please use either 'poppler' or 'ghostscript'."
    # pytest applies ``match`` as a regular expression search, so escape the
    # literal message; otherwise each "." would match any character.
    with pytest.raises(NotImplementedError, match=re.escape(message)):
        camelot.read_pdf(filename, backend="mupdf")
def test_lattice_no_convert_method():
    """A backend object without a ``convert`` method raises NotImplementedError."""

    class ConversionBackend(object):
        pass

    expected = "must implement a 'convert' method"
    incomplete_backend = ConversionBackend()
    with pytest.raises(NotImplementedError, match=expected):
        camelot.read_pdf(filename, backend=incomplete_backend)

View File

@ -4,18 +4,16 @@ import pytest
import camelot.backends.image_conversion
from camelot.backends import ImageConversionBackend
from camelot.backends.poppler_backend import PopplerBackend
from camelot.backends.ghostscript_backend import GhostscriptBackend
class PopplerBackendError(object):
def convert(self, pdf_path, png_path):
raise ValueError('conversion failed')
raise ValueError("Image conversion failed")
class GhostscriptBackendError(object):
def convert(self, pdf_path, png_path):
raise ValueError('conversion failed')
raise ValueError("Image conversion failed")
class GhostscriptBackendNoError(object):
@ -24,26 +22,39 @@ class GhostscriptBackendNoError(object):
def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError}
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True)
BACKENDS = {
"poppler": PopplerBackendError,
"ghostscript": GhostscriptBackendNoError,
}
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend(use_fallback=False)
message = "conversion failed with image conversion backend 'poppler'"
message = "Image conversion failed with image conversion backend 'poppler'"
with pytest.raises(ValueError, match=message):
backend.convert('foo', 'bar')
backend.convert("foo", "bar")
def test_ghostscript_backend_when_use_fallback(monkeypatch):
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError}
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True)
BACKENDS = {
"poppler": PopplerBackendError,
"ghostscript": GhostscriptBackendNoError,
}
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend()
backend.convert('foo', 'bar')
backend.convert("foo", "bar")
def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError}
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True)
BACKENDS = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError}
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend()
message = "conversion failed with image conversion backend 'ghostscript'"
message = "Image conversion failed with image conversion backend 'ghostscript'"
with pytest.raises(ValueError, match=message):
backend.convert('foo', 'bar')
backend.convert("foo", "bar")

View File

@ -6,9 +6,6 @@ import sys
import pytest
import camelot
from camelot.backends.poppler_backend import PopplerBackend
from camelot.backends.ghostscript_backend import GhostscriptBackend
testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")
@ -31,7 +28,7 @@ def test_textedge_plot():
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_lattice_contour_plot_poppler():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend())
tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="contour")
@ -41,7 +38,7 @@ def test_lattice_contour_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="contour")
@ -55,7 +52,7 @@ def test_stream_contour_plot():
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_line_plot_poppler():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend())
tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="line")
@ -65,14 +62,14 @@ def test_line_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="line")
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_joint_plot_poppler():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend())
tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="joint")
@ -82,14 +79,14 @@ def test_joint_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="joint")
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_grid_plot_poppler():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend())
tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="grid")
@ -99,5 +96,5 @@ def test_grid_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="grid")