From 8abe02528b4f0daaaeb9e36df2ca6d24afde33d4 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sun, 11 Jul 2021 17:25:56 +0530 Subject: [PATCH] Make ghostscript default backend and add support for string keywords --- camelot/backends/ghostscript_backend.py | 4 +- camelot/backends/image_conversion.py | 10 +-- camelot/backends/poppler_backend.py | 2 +- camelot/parsers/lattice.py | 29 +++++- docs/user/advanced.rst | 7 +- tests/test_errors.py | 115 +++++++++++++----------- tests/test_image_conversion_backend.py | 41 +++++---- tests/test_plotting.py | 19 ++-- 8 files changed, 135 insertions(+), 92 deletions(-) diff --git a/camelot/backends/ghostscript_backend.py b/camelot/backends/ghostscript_backend.py index 5e93cdb..1de7da1 100644 --- a/camelot/backends/ghostscript_backend.py +++ b/camelot/backends/ghostscript_backend.py @@ -29,8 +29,8 @@ class GhostscriptBackend(object): def convert(self, pdf_path, png_path, resolution=300): if not self.installed(): raise OSError( - "Ghostscript is not installed. Please install it using the instructions" - "here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html" + "Ghostscript is not installed. You can install it using the instructions" + " here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html" ) import ghostscript diff --git a/camelot/backends/image_conversion.py b/camelot/backends/image_conversion.py index a9b6004..7d2c4d7 100644 --- a/camelot/backends/image_conversion.py +++ b/camelot/backends/image_conversion.py @@ -3,21 +3,21 @@ from .poppler_backend import PopplerBackend from .ghostscript_backend import GhostscriptBackend -backends = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend} +BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend} class ImageConversionBackend(object): def __init__(self, backend="poppler", use_fallback=True): - if backend not in backends.keys(): + if backend not in BACKENDS.keys(): raise ValueError(f"Image conversion backend '{backend}' not supported") self.backend = backend self.use_fallback = use_fallback - self.fallbacks = list(filter(lambda x: x != backend, backends.keys())) + self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys())) def convert(self, pdf_path, png_path): try: - converter = backends[self.backend]() + converter = BACKENDS[self.backend]() converter.convert(pdf_path, png_path) except Exception as e: import sys @@ -25,7 +25,7 @@ class ImageConversionBackend(object): if self.use_fallback: for fallback in self.fallbacks: try: - converter = backends[fallback]() + converter = BACKENDS[fallback]() converter.convert(pdf_path, png_path) except Exception as e: raise type(e)( diff --git a/camelot/backends/poppler_backend.py b/camelot/backends/poppler_backend.py index ab12bcf..4103372 100644 --- a/camelot/backends/poppler_backend.py +++ b/camelot/backends/poppler_backend.py @@ -9,7 +9,7 @@ class PopplerBackend(object): pdftopng_executable = shutil.which("pdftopng") if pdftopng_executable is None: raise OSError( - "pdftopng is not installed. Please install it using the `pip install pdftopng` command." + "pdftopng is not installed. You can install it using the 'pip install pdftopng' command." ) pdftopng_command = [pdftopng_executable, pdf_path, png_path] diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 02ef794..2fbd195 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -28,7 +28,7 @@ from ..image_processing import ( find_contours, find_joints, ) -from ..backends import ImageConversionBackend +from ..backends.image_conversion import BACKENDS logger = logging.getLogger("camelot") @@ -111,7 +111,7 @@ class Lattice(BaseParser): threshold_constant=-2, iterations=0, resolution=300, - backend=ImageConversionBackend(), + backend="ghostscript", **kwargs, ): self.table_regions = table_regions @@ -129,7 +129,30 @@ class Lattice(BaseParser): self.threshold_constant = threshold_constant self.iterations = iterations self.resolution = resolution - self.backend = backend + self.backend = Lattice._get_backend(backend) + + @staticmethod + def _get_backend(backend): + def implements_convert(): + methods = [ + method for method in dir(backend) if method.startswith("__") is False + ] + return "convert" in methods + + if isinstance(backend, str): + if backend in BACKENDS.keys(): + return BACKENDS[backend]() + else: + raise NotImplementedError( + f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'." + ) + else: + if not implements_convert(): + raise NotImplementedError( + f"'{backend}' must implement a 'convert' method" + ) + + return backend @staticmethod def _reduce_index(t, idx, shift_text): diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 09bfc4e..6a551d9 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -644,8 +644,5 @@ When using the :ref:`Lattice ` flavor, Camelot uses `pdftopng >> from camelot.backends.poppler_backend import PopplerBackend - >>> from camelot.backends.ghostscript_backend import GhostscriptBackend - >>> - >>> tables = camelot.read_pdf(filename, backend=PopplerBackend()) - >>> tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) + >>> tables = camelot.read_pdf(filename, backend="poppler") + >>> tables = camelot.read_pdf(filename, backend="ghostscript") diff --git a/tests/test_errors.py b/tests/test_errors.py index 595c54b..f488aee 100755 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -32,56 +32,6 @@ def test_unsupported_format(): tables = camelot.read_pdf(filename) -def test_stream_equal_length(): - message = "Length of table_areas and columns" " should be equal" - with pytest.raises(ValueError, match=message): - tables = camelot.read_pdf( - filename, - flavor="stream", - table_areas=["10,20,30,40"], - columns=["10,20,30,40", "10,20,30,40"], - ) - - -def test_image_warning(): - filename = os.path.join(testdir, "image.pdf") - with warnings.catch_warnings(): - warnings.simplefilter("error") - with pytest.raises(UserWarning) as e: - tables = camelot.read_pdf(filename) - assert ( - str(e.value) - == "page-1 is image-based, camelot only works on text-based pages." - ) - - -def test_lattice_no_tables_on_page(): - filename = os.path.join(testdir, "empty.pdf") - with warnings.catch_warnings(): - warnings.simplefilter("error") - with pytest.raises(UserWarning) as e: - tables = camelot.read_pdf(filename, flavor="lattice") - assert str(e.value) == "No tables found on page-1" - - -def test_stream_no_tables_on_page(): - filename = os.path.join(testdir, "empty.pdf") - with warnings.catch_warnings(): - warnings.simplefilter("error") - with pytest.raises(UserWarning) as e: - tables = camelot.read_pdf(filename, flavor="stream") - assert str(e.value) == "No tables found on page-1" - - -def test_stream_no_tables_in_area(): - filename = os.path.join(testdir, "only_page_number.pdf") - with warnings.catch_warnings(): - warnings.simplefilter("error") - with pytest.raises(UserWarning) as e: - tables = camelot.read_pdf(filename, flavor="stream") - assert str(e.value) == "No tables found in table area 1" - - def test_no_tables_found_logs_suppressed(): filename = os.path.join(testdir, "foo.pdf") with warnings.catch_warnings(): @@ -118,3 +68,68 @@ def test_bad_password(): message = "file has not been decrypted" with pytest.raises(Exception, match=message): tables = camelot.read_pdf(filename, password="wrongpass") + + +def test_stream_equal_length(): + message = "Length of table_areas and columns" " should be equal" + with pytest.raises(ValueError, match=message): + tables = camelot.read_pdf( + filename, + flavor="stream", + table_areas=["10,20,30,40"], + columns=["10,20,30,40", "10,20,30,40"], + ) + + +def test_image_warning(): + filename = os.path.join(testdir, "image.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename) + assert ( + str(e.value) + == "page-1 is image-based, camelot only works on text-based pages." + ) + + +def test_stream_no_tables_on_page(): + filename = os.path.join(testdir, "empty.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename, flavor="stream") + assert str(e.value) == "No tables found on page-1" + + +def test_stream_no_tables_in_area(): + filename = os.path.join(testdir, "only_page_number.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename, flavor="stream") + assert str(e.value) == "No tables found in table area 1" + + +def test_lattice_no_tables_on_page(): + filename = os.path.join(testdir, "empty.pdf") + with warnings.catch_warnings(): + warnings.simplefilter("error") + with pytest.raises(UserWarning) as e: + tables = camelot.read_pdf(filename, flavor="lattice") + assert str(e.value) == "No tables found on page-1" + + +def test_lattice_unknown_backend(): + message = "Unknown backend 'mupdf' specified. Please use either 'poppler' or 'ghostscript'." + with pytest.raises(NotImplementedError, match=message): + tables = camelot.read_pdf(filename, backend="mupdf") + + +def test_lattice_no_convert_method(): + class ConversionBackend(object): + pass + + message = "must implement a 'convert' method" + with pytest.raises(NotImplementedError, match=message): + tables = camelot.read_pdf(filename, backend=ConversionBackend()) diff --git a/tests/test_image_conversion_backend.py b/tests/test_image_conversion_backend.py index 8074cac..39f56e6 100644 --- a/tests/test_image_conversion_backend.py +++ b/tests/test_image_conversion_backend.py @@ -4,18 +4,16 @@ import pytest import camelot.backends.image_conversion from camelot.backends import ImageConversionBackend -from camelot.backends.poppler_backend import PopplerBackend -from camelot.backends.ghostscript_backend import GhostscriptBackend class PopplerBackendError(object): def convert(self, pdf_path, png_path): - raise ValueError('conversion failed') + raise ValueError("Image conversion failed") class GhostscriptBackendError(object): def convert(self, pdf_path, png_path): - raise ValueError('conversion failed') + raise ValueError("Image conversion failed") class GhostscriptBackendNoError(object): @@ -24,26 +22,39 @@ class GhostscriptBackendNoError(object): def test_poppler_backend_error_when_no_use_fallback(monkeypatch): - backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError} - monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True) + BACKENDS = { + "poppler": PopplerBackendError, + "ghostscript": GhostscriptBackendNoError, + } + monkeypatch.setattr( + "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True + ) backend = ImageConversionBackend(use_fallback=False) - message = "conversion failed with image conversion backend 'poppler'" + message = "Image conversion failed with image conversion backend 'poppler'" with pytest.raises(ValueError, match=message): - backend.convert('foo', 'bar') + backend.convert("foo", "bar") + def test_ghostscript_backend_when_use_fallback(monkeypatch): - backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError} - monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True) + BACKENDS = { + "poppler": PopplerBackendError, + "ghostscript": GhostscriptBackendNoError, + } + monkeypatch.setattr( + "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True + ) backend = ImageConversionBackend() - backend.convert('foo', 'bar') + backend.convert("foo", "bar") def test_ghostscript_backend_error_when_use_fallback(monkeypatch): - backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError} - monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True) + BACKENDS = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError} + monkeypatch.setattr( + "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True + ) backend = ImageConversionBackend() - message = "conversion failed with image conversion backend 'ghostscript'" + message = "Image conversion failed with image conversion backend 'ghostscript'" with pytest.raises(ValueError, match=message): - backend.convert('foo', 'bar') + backend.convert("foo", "bar") diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 22d9f6f..1ef178f 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -6,9 +6,6 @@ import sys import pytest import camelot -from camelot.backends.poppler_backend import PopplerBackend -from camelot.backends.ghostscript_backend import GhostscriptBackend - testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.join(testdir, "files") @@ -31,7 +28,7 @@ def test_textedge_plot(): @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) def test_lattice_contour_plot_poppler(): filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=PopplerBackend()) + tables = camelot.read_pdf(filename, backend="poppler") return camelot.plot(tables[0], kind="contour") @@ -41,7 +38,7 @@ def test_lattice_contour_plot_ghostscript(): pytest.skip("Skipping ghostscript test on Windows") filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) + tables = camelot.read_pdf(filename, backend="ghostscript") return camelot.plot(tables[0], kind="contour") @@ -55,7 +52,7 @@ def test_stream_contour_plot(): @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) def test_line_plot_poppler(): filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=PopplerBackend()) + tables = camelot.read_pdf(filename, backend="poppler") return camelot.plot(tables[0], kind="line") @@ -65,14 +62,14 @@ def test_line_plot_ghostscript(): pytest.skip("Skipping ghostscript test on Windows") filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) + tables = camelot.read_pdf(filename, backend="ghostscript") return camelot.plot(tables[0], kind="line") @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) def test_joint_plot_poppler(): filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=PopplerBackend()) + tables = camelot.read_pdf(filename, backend="poppler") return camelot.plot(tables[0], kind="joint") @@ -82,14 +79,14 @@ def test_joint_plot_ghostscript(): pytest.skip("Skipping ghostscript test on Windows") filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) + tables = camelot.read_pdf(filename, backend="ghostscript") return camelot.plot(tables[0], kind="joint") @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) def test_grid_plot_poppler(): filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=PopplerBackend()) + tables = camelot.read_pdf(filename, backend="poppler") return camelot.plot(tables[0], kind="grid") @@ -99,5 +96,5 @@ def test_grid_plot_ghostscript(): pytest.skip("Skipping ghostscript test on Windows") filename = os.path.join(testdir, "foo.pdf") - tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) + tables = camelot.read_pdf(filename, backend="ghostscript") return camelot.plot(tables[0], kind="grid")