Make ghostscript default backend and add support for string keywords

pull/253/head
Vinayak Mehta 2021-07-11 17:25:56 +05:30
parent f43235934b
commit 8abe02528b
No known key found for this signature in database
GPG Key ID: 2DE013537A15A9A4
8 changed files with 135 additions and 92 deletions

View File

@ -29,8 +29,8 @@ class GhostscriptBackend(object):
def convert(self, pdf_path, png_path, resolution=300): def convert(self, pdf_path, png_path, resolution=300):
if not self.installed(): if not self.installed():
raise OSError( raise OSError(
"Ghostscript is not installed. Please install it using the instructions" "Ghostscript is not installed. You can install it using the instructions"
"here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html" " here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
) )
import ghostscript import ghostscript

View File

@ -3,21 +3,21 @@
from .poppler_backend import PopplerBackend from .poppler_backend import PopplerBackend
from .ghostscript_backend import GhostscriptBackend from .ghostscript_backend import GhostscriptBackend
backends = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend} BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
class ImageConversionBackend(object): class ImageConversionBackend(object):
def __init__(self, backend="poppler", use_fallback=True): def __init__(self, backend="poppler", use_fallback=True):
if backend not in backends.keys(): if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend '{backend}' not supported") raise ValueError(f"Image conversion backend '{backend}' not supported")
self.backend = backend self.backend = backend
self.use_fallback = use_fallback self.use_fallback = use_fallback
self.fallbacks = list(filter(lambda x: x != backend, backends.keys())) self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
def convert(self, pdf_path, png_path): def convert(self, pdf_path, png_path):
try: try:
converter = backends[self.backend]() converter = BACKENDS[self.backend]()
converter.convert(pdf_path, png_path) converter.convert(pdf_path, png_path)
except Exception as e: except Exception as e:
import sys import sys
@ -25,7 +25,7 @@ class ImageConversionBackend(object):
if self.use_fallback: if self.use_fallback:
for fallback in self.fallbacks: for fallback in self.fallbacks:
try: try:
converter = backends[fallback]() converter = BACKENDS[fallback]()
converter.convert(pdf_path, png_path) converter.convert(pdf_path, png_path)
except Exception as e: except Exception as e:
raise type(e)( raise type(e)(

View File

@ -9,7 +9,7 @@ class PopplerBackend(object):
pdftopng_executable = shutil.which("pdftopng") pdftopng_executable = shutil.which("pdftopng")
if pdftopng_executable is None: if pdftopng_executable is None:
raise OSError( raise OSError(
"pdftopng is not installed. Please install it using the `pip install pdftopng` command." "pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
) )
pdftopng_command = [pdftopng_executable, pdf_path, png_path] pdftopng_command = [pdftopng_executable, pdf_path, png_path]

View File

@ -28,7 +28,7 @@ from ..image_processing import (
find_contours, find_contours,
find_joints, find_joints,
) )
from ..backends import ImageConversionBackend from ..backends.image_conversion import BACKENDS
logger = logging.getLogger("camelot") logger = logging.getLogger("camelot")
@ -111,7 +111,7 @@ class Lattice(BaseParser):
threshold_constant=-2, threshold_constant=-2,
iterations=0, iterations=0,
resolution=300, resolution=300,
backend=ImageConversionBackend(), backend="ghostscript",
**kwargs, **kwargs,
): ):
self.table_regions = table_regions self.table_regions = table_regions
@ -129,7 +129,30 @@ class Lattice(BaseParser):
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.iterations = iterations self.iterations = iterations
self.resolution = resolution self.resolution = resolution
self.backend = backend self.backend = Lattice._get_backend(backend)
@staticmethod
def _get_backend(backend):
    """Resolve ``backend`` into an object usable for image conversion.

    Parameters
    ----------
    backend : str or object
        Either a key of ``BACKENDS`` or an object that exposes a callable
        ``convert`` attribute.

    Returns
    -------
    object
        A new instance of the class registered in ``BACKENDS`` when a
        string keyword is given; otherwise ``backend`` itself, unchanged.

    Raises
    ------
    NotImplementedError
        If the string keyword is not a key of ``BACKENDS``, or if the
        supplied object has no callable ``convert`` attribute.
    """
    if isinstance(backend, str):
        if backend not in BACKENDS:
            raise NotImplementedError(
                f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'."
            )
        return BACKENDS[backend]()

    # Duck-typing check: the attribute must exist AND be callable, so an
    # object carrying e.g. ``convert = None`` is rejected here instead of
    # failing later when the conversion is attempted.
    if not callable(getattr(backend, "convert", None)):
        raise NotImplementedError(
            f"'{backend}' must implement a 'convert' method"
        )
    return backend
@staticmethod @staticmethod
def _reduce_index(t, idx, shift_text): def _reduce_index(t, idx, shift_text):

View File

@ -644,8 +644,5 @@ When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://
In case you want to be explicit about the image conversion backend that Camelot should use, you can supply it like this:: In case you want to be explicit about the image conversion backend that Camelot should use, you can supply it like this::
>>> from camelot.backends.poppler_backend import PopplerBackend >>> tables = camelot.read_pdf(filename, backend="poppler")
>>> from camelot.backends.ghostscript_backend import GhostscriptBackend >>> tables = camelot.read_pdf(filename, backend="ghostscript")
>>>
>>> tables = camelot.read_pdf(filename, backend=PopplerBackend())
>>> tables = camelot.read_pdf(filename, backend=GhostscriptBackend())

View File

@ -32,56 +32,6 @@ def test_unsupported_format():
tables = camelot.read_pdf(filename) tables = camelot.read_pdf(filename)
def test_stream_equal_length():
    """Expect a ValueError when table_areas and columns differ in length."""
    # NOTE(review): `filename` is not assigned in this function; presumably a
    # module-level path defined outside this view -- confirm.
    read_kwargs = {
        "flavor": "stream",
        "table_areas": ["10,20,30,40"],
        "columns": ["10,20,30,40", "10,20,30,40"],
    }
    expected = "Length of table_areas and columns should be equal"
    # One table area against two column specs is the mismatch under test.
    with pytest.raises(ValueError, match=expected):
        camelot.read_pdf(filename, **read_kwargs)
def test_image_warning():
    """Reading image.pdf must emit the 'image-based' UserWarning."""
    expected = "page-1 is image-based, camelot only works on text-based pages."
    pdf_path = os.path.join(testdir, "image.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path)
    assert str(excinfo.value) == expected
def test_lattice_no_tables_on_page():
    """Lattice flavor on empty.pdf must warn that page-1 has no tables."""
    pdf_path = os.path.join(testdir, "empty.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="lattice")
    assert str(excinfo.value) == "No tables found on page-1"
def test_stream_no_tables_on_page():
    """Stream flavor on empty.pdf must warn that page-1 has no tables."""
    pdf_path = os.path.join(testdir, "empty.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="stream")
    assert str(excinfo.value) == "No tables found on page-1"
def test_stream_no_tables_in_area():
    """Stream flavor on only_page_number.pdf must warn about table area 1."""
    pdf_path = os.path.join(testdir, "only_page_number.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="stream")
    assert str(excinfo.value) == "No tables found in table area 1"
def test_no_tables_found_logs_suppressed(): def test_no_tables_found_logs_suppressed():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
with warnings.catch_warnings(): with warnings.catch_warnings():
@ -118,3 +68,68 @@ def test_bad_password():
message = "file has not been decrypted" message = "file has not been decrypted"
with pytest.raises(Exception, match=message): with pytest.raises(Exception, match=message):
tables = camelot.read_pdf(filename, password="wrongpass") tables = camelot.read_pdf(filename, password="wrongpass")
def test_stream_equal_length():
    """Expect a ValueError when table_areas and columns differ in length."""
    # NOTE(review): `filename` is not assigned in this function; presumably a
    # module-level path defined outside this view -- confirm.
    read_kwargs = {
        "flavor": "stream",
        "table_areas": ["10,20,30,40"],
        "columns": ["10,20,30,40", "10,20,30,40"],
    }
    expected = "Length of table_areas and columns should be equal"
    # One table area against two column specs is the mismatch under test.
    with pytest.raises(ValueError, match=expected):
        camelot.read_pdf(filename, **read_kwargs)
def test_image_warning():
    """Reading image.pdf must emit the 'image-based' UserWarning."""
    expected = "page-1 is image-based, camelot only works on text-based pages."
    pdf_path = os.path.join(testdir, "image.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path)
    assert str(excinfo.value) == expected
def test_stream_no_tables_on_page():
    """Stream flavor on empty.pdf must warn that page-1 has no tables."""
    pdf_path = os.path.join(testdir, "empty.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="stream")
    assert str(excinfo.value) == "No tables found on page-1"
def test_stream_no_tables_in_area():
    """Stream flavor on only_page_number.pdf must warn about table area 1."""
    pdf_path = os.path.join(testdir, "only_page_number.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="stream")
    assert str(excinfo.value) == "No tables found in table area 1"
def test_lattice_no_tables_on_page():
    """Lattice flavor on empty.pdf must warn that page-1 has no tables."""
    pdf_path = os.path.join(testdir, "empty.pdf")
    with warnings.catch_warnings():
        # Turn warnings into exceptions so pytest.raises can capture them.
        warnings.simplefilter("error")
        with pytest.raises(UserWarning) as excinfo:
            camelot.read_pdf(pdf_path, flavor="lattice")
    assert str(excinfo.value) == "No tables found on page-1"
def test_lattice_unknown_backend():
    """Expect NotImplementedError for a backend keyword that is not registered."""
    # Local import keeps this block self-contained.
    import re

    # NOTE(review): `filename` is not assigned in this function; presumably a
    # module-level path defined outside this view -- confirm.
    message = "Unknown backend 'mupdf' specified. Please use either 'poppler' or 'ghostscript'."
    # `match` is treated as a regular expression, so the literal message is
    # escaped; otherwise every "." would match any character.
    with pytest.raises(NotImplementedError, match=re.escape(message)):
        camelot.read_pdf(filename, backend="mupdf")
def test_lattice_no_convert_method():
    """Expect NotImplementedError for a backend object lacking ``convert``."""

    class ConversionBackend:
        """Stand-in backend that deliberately defines no ``convert``."""

    # NOTE(review): `filename` is not assigned in this function; presumably a
    # module-level path defined outside this view -- confirm.
    expected = "must implement a 'convert' method"
    with pytest.raises(NotImplementedError, match=expected):
        camelot.read_pdf(filename, backend=ConversionBackend())

View File

@ -4,18 +4,16 @@ import pytest
import camelot.backends.image_conversion import camelot.backends.image_conversion
from camelot.backends import ImageConversionBackend from camelot.backends import ImageConversionBackend
from camelot.backends.poppler_backend import PopplerBackend
from camelot.backends.ghostscript_backend import GhostscriptBackend
class PopplerBackendError(object): class PopplerBackendError(object):
def convert(self, pdf_path, png_path): def convert(self, pdf_path, png_path):
raise ValueError('conversion failed') raise ValueError("Image conversion failed")
class GhostscriptBackendError(object): class GhostscriptBackendError(object):
def convert(self, pdf_path, png_path): def convert(self, pdf_path, png_path):
raise ValueError('conversion failed') raise ValueError("Image conversion failed")
class GhostscriptBackendNoError(object): class GhostscriptBackendNoError(object):
@ -24,26 +22,39 @@ class GhostscriptBackendNoError(object):
def test_poppler_backend_error_when_no_use_fallback(monkeypatch): def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError} BACKENDS = {
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True) "poppler": PopplerBackendError,
"ghostscript": GhostscriptBackendNoError,
}
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend(use_fallback=False) backend = ImageConversionBackend(use_fallback=False)
message = "conversion failed with image conversion backend 'poppler'" message = "Image conversion failed with image conversion backend 'poppler'"
with pytest.raises(ValueError, match=message): with pytest.raises(ValueError, match=message):
backend.convert('foo', 'bar') backend.convert("foo", "bar")
def test_ghostscript_backend_when_use_fallback(monkeypatch): def test_ghostscript_backend_when_use_fallback(monkeypatch):
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError} BACKENDS = {
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True) "poppler": PopplerBackendError,
"ghostscript": GhostscriptBackendNoError,
}
monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend() backend = ImageConversionBackend()
backend.convert('foo', 'bar') backend.convert("foo", "bar")
def test_ghostscript_backend_error_when_use_fallback(monkeypatch): def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError} BACKENDS = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError}
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True) monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
)
backend = ImageConversionBackend() backend = ImageConversionBackend()
message = "conversion failed with image conversion backend 'ghostscript'" message = "Image conversion failed with image conversion backend 'ghostscript'"
with pytest.raises(ValueError, match=message): with pytest.raises(ValueError, match=message):
backend.convert('foo', 'bar') backend.convert("foo", "bar")

View File

@ -6,9 +6,6 @@ import sys
import pytest import pytest
import camelot import camelot
from camelot.backends.poppler_backend import PopplerBackend
from camelot.backends.ghostscript_backend import GhostscriptBackend
testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files") testdir = os.path.join(testdir, "files")
@ -31,7 +28,7 @@ def test_textedge_plot():
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_lattice_contour_plot_poppler(): def test_lattice_contour_plot_poppler():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend()) tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="contour") return camelot.plot(tables[0], kind="contour")
@ -41,7 +38,7 @@ def test_lattice_contour_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows") pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="contour") return camelot.plot(tables[0], kind="contour")
@ -55,7 +52,7 @@ def test_stream_contour_plot():
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_line_plot_poppler(): def test_line_plot_poppler():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend()) tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="line") return camelot.plot(tables[0], kind="line")
@ -65,14 +62,14 @@ def test_line_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows") pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="line") return camelot.plot(tables[0], kind="line")
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_joint_plot_poppler(): def test_joint_plot_poppler():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend()) tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="joint") return camelot.plot(tables[0], kind="joint")
@ -82,14 +79,14 @@ def test_joint_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows") pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="joint") return camelot.plot(tables[0], kind="joint")
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True) @pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
def test_grid_plot_poppler(): def test_grid_plot_poppler():
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=PopplerBackend()) tables = camelot.read_pdf(filename, backend="poppler")
return camelot.plot(tables[0], kind="grid") return camelot.plot(tables[0], kind="grid")
@ -99,5 +96,5 @@ def test_grid_plot_ghostscript():
pytest.skip("Skipping ghostscript test on Windows") pytest.skip("Skipping ghostscript test on Windows")
filename = os.path.join(testdir, "foo.pdf") filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, backend=GhostscriptBackend()) tables = camelot.read_pdf(filename, backend="ghostscript")
return camelot.plot(tables[0], kind="grid") return camelot.plot(tables[0], kind="grid")