Merge pull request #253 from camelot-dev/make-gs-default-backend
Make ghostscript default backend and add support for string keyword arguments (pull/254/head)
commit
3c04842d97
|
|
@ -6,7 +6,7 @@ master
|
||||||
|
|
||||||
**Improvements**
|
**Improvements**
|
||||||
|
|
||||||
- Add pdftopng for image conversion and use ghostscript as fallback. [#198](https://github.com/camelot-dev/camelot/pull/198) by Vinayak Mehta.
|
- Add support for multiple image conversion backends. [#198](https://github.com/camelot-dev/camelot/pull/198) and [#253](https://github.com/camelot-dev/camelot/pull/253) by Vinayak Mehta.
|
||||||
- Add markdown export format. [#222](https://github.com/camelot-dev/camelot/pull/222/) by [Lucas Cimon](https://github.com/Lucas-C).
|
- Add markdown export format. [#222](https://github.com/camelot-dev/camelot/pull/222/) by [Lucas Cimon](https://github.com/Lucas-C).
|
||||||
|
|
||||||
**Documentation**
|
**Documentation**
|
||||||
|
|
|
||||||
|
|
@ -29,8 +29,8 @@ class GhostscriptBackend(object):
|
||||||
def convert(self, pdf_path, png_path, resolution=300):
|
def convert(self, pdf_path, png_path, resolution=300):
|
||||||
if not self.installed():
|
if not self.installed():
|
||||||
raise OSError(
|
raise OSError(
|
||||||
"Ghostscript is not installed. Please install it using the instructions"
|
"Ghostscript is not installed. You can install it using the instructions"
|
||||||
"here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
|
" here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
|
||||||
)
|
)
|
||||||
|
|
||||||
import ghostscript
|
import ghostscript
|
||||||
|
|
|
||||||
|
|
@ -3,21 +3,21 @@
|
||||||
from .poppler_backend import PopplerBackend
|
from .poppler_backend import PopplerBackend
|
||||||
from .ghostscript_backend import GhostscriptBackend
|
from .ghostscript_backend import GhostscriptBackend
|
||||||
|
|
||||||
backends = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
|
BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
|
||||||
|
|
||||||
|
|
||||||
class ImageConversionBackend(object):
|
class ImageConversionBackend(object):
|
||||||
def __init__(self, backend="poppler", use_fallback=True):
|
def __init__(self, backend="poppler", use_fallback=True):
|
||||||
if backend not in backends.keys():
|
if backend not in BACKENDS.keys():
|
||||||
raise ValueError(f"Image conversion backend '{backend}' not supported")
|
raise ValueError(f"Image conversion backend '{backend}' not supported")
|
||||||
|
|
||||||
self.backend = backend
|
self.backend = backend
|
||||||
self.use_fallback = use_fallback
|
self.use_fallback = use_fallback
|
||||||
self.fallbacks = list(filter(lambda x: x != backend, backends.keys()))
|
self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
|
||||||
|
|
||||||
def convert(self, pdf_path, png_path):
|
def convert(self, pdf_path, png_path):
|
||||||
try:
|
try:
|
||||||
converter = backends[self.backend]()
|
converter = BACKENDS[self.backend]()
|
||||||
converter.convert(pdf_path, png_path)
|
converter.convert(pdf_path, png_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -25,7 +25,7 @@ class ImageConversionBackend(object):
|
||||||
if self.use_fallback:
|
if self.use_fallback:
|
||||||
for fallback in self.fallbacks:
|
for fallback in self.fallbacks:
|
||||||
try:
|
try:
|
||||||
converter = backends[fallback]()
|
converter = BACKENDS[fallback]()
|
||||||
converter.convert(pdf_path, png_path)
|
converter.convert(pdf_path, png_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise type(e)(
|
raise type(e)(
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ class PopplerBackend(object):
|
||||||
pdftopng_executable = shutil.which("pdftopng")
|
pdftopng_executable = shutil.which("pdftopng")
|
||||||
if pdftopng_executable is None:
|
if pdftopng_executable is None:
|
||||||
raise OSError(
|
raise OSError(
|
||||||
"pdftopng is not installed. Please install it using the `pip install pdftopng` command."
|
"pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
|
||||||
)
|
)
|
||||||
|
|
||||||
pdftopng_command = [pdftopng_executable, pdf_path, png_path]
|
pdftopng_command = [pdftopng_executable, pdf_path, png_path]
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ from ..image_processing import (
|
||||||
find_contours,
|
find_contours,
|
||||||
find_joints,
|
find_joints,
|
||||||
)
|
)
|
||||||
from ..backends import ImageConversionBackend
|
from ..backends.image_conversion import BACKENDS
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger("camelot")
|
logger = logging.getLogger("camelot")
|
||||||
|
|
@ -111,7 +111,7 @@ class Lattice(BaseParser):
|
||||||
threshold_constant=-2,
|
threshold_constant=-2,
|
||||||
iterations=0,
|
iterations=0,
|
||||||
resolution=300,
|
resolution=300,
|
||||||
backend=ImageConversionBackend(),
|
backend="ghostscript",
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
self.table_regions = table_regions
|
self.table_regions = table_regions
|
||||||
|
|
@ -129,7 +129,37 @@ class Lattice(BaseParser):
|
||||||
self.threshold_constant = threshold_constant
|
self.threshold_constant = threshold_constant
|
||||||
self.iterations = iterations
|
self.iterations = iterations
|
||||||
self.resolution = resolution
|
self.resolution = resolution
|
||||||
self.backend = backend
|
self.backend = Lattice._get_backend(backend)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_backend(backend):
|
||||||
|
def implements_convert():
|
||||||
|
methods = [
|
||||||
|
method for method in dir(backend) if method.startswith("__") is False
|
||||||
|
]
|
||||||
|
return "convert" in methods
|
||||||
|
|
||||||
|
if isinstance(backend, str):
|
||||||
|
if backend in BACKENDS.keys():
|
||||||
|
if backend == "ghostscript":
|
||||||
|
warnings.warn(
|
||||||
|
"'ghostscript' will be replaced by 'poppler' as the default image conversion"
|
||||||
|
" backend in v0.12.0. You can try out 'poppler' with backend='poppler'.",
|
||||||
|
DeprecationWarning
|
||||||
|
)
|
||||||
|
|
||||||
|
return BACKENDS[backend]()
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if not implements_convert():
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"'{backend}' must implement a 'convert' method"
|
||||||
|
)
|
||||||
|
|
||||||
|
return backend
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _reduce_index(t, idx, shift_text):
|
def _reduce_index(t, idx, shift_text):
|
||||||
|
|
|
||||||
|
|
@ -629,7 +629,14 @@ To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://gith
|
||||||
Use alternate image conversion backends
|
Use alternate image conversion backends
|
||||||
---------------------------------------
|
---------------------------------------
|
||||||
|
|
||||||
When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://github.com/vinayak-mehta/pdftopng>`_ to convert PDF pages to images for line recognition. This should work out of the box on most operating systems. However, if you get an error, you can supply your own image conversion backend to Camelot::
|
When using the :ref:`Lattice <lattice>` flavor, Camelot uses ``ghostscript`` to convert PDF pages to images for line recognition. If you face installation issues with ``ghostscript``, you can use an alternate image conversion backend called ``poppler``. You can specify which image conversion backend you want to use with::
|
||||||
|
|
||||||
|
>>> tables = camelot.read_pdf(filename, backend="ghostscript") # default
|
||||||
|
>>> tables = camelot.read_pdf(filename, backend="poppler")
|
||||||
|
|
||||||
|
.. note:: ``poppler`` will be made the default image conversion backend (replacing ``ghostscript``) with ``v0.12.0``.
|
||||||
|
|
||||||
|
If you face issues with both ``ghostscript`` and ``poppler``, you can supply your own image conversion backend::
|
||||||
|
|
||||||
>>> class ConversionBackend(object):
|
>>> class ConversionBackend(object):
|
||||||
>>> def convert(pdf_path, png_path):
|
>>> def convert(self, pdf_path, png_path):
|
||||||
|
|
@ -639,13 +646,3 @@ When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://
|
||||||
>>> pass
|
>>> pass
|
||||||
>>>
|
>>>
|
||||||
>>> tables = camelot.read_pdf(filename, backend=ConversionBackend())
|
>>> tables = camelot.read_pdf(filename, backend=ConversionBackend())
|
||||||
|
|
||||||
.. note:: If image conversion using ``pdftopng`` fails, Camelot falls back to ``ghostscript`` to try image conversion again, and if that fails, it raises an error.
|
|
||||||
|
|
||||||
In case you want to be explicit about the image conversion backend that Camelot should use, you can supply them like this::
|
|
||||||
|
|
||||||
>>> from camelot.backends.poppler_backend import PopplerBackend
|
|
||||||
>>> from camelot.backends.ghostscript_backend import GhostscriptBackend
|
|
||||||
>>>
|
|
||||||
>>> tables = camelot.read_pdf(filename, backend=PopplerBackend())
|
|
||||||
>>> tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,9 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pytest
|
||||||
from click.testing import CliRunner
|
from click.testing import CliRunner
|
||||||
|
|
||||||
from camelot.cli import cli
|
from camelot.cli import cli
|
||||||
|
|
@ -11,6 +13,11 @@ from camelot.utils import TemporaryDirectory
|
||||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
|
|
||||||
|
skip_on_windows = pytest.mark.skipif(
|
||||||
|
sys.platform.startswith("win"),
|
||||||
|
reason="Ghostscript not installed in Windows test environment",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_help_output():
|
def test_help_output():
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
|
|
@ -26,6 +33,7 @@ def test_help_output():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_cli_lattice():
|
def test_cli_lattice():
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
infile = os.path.join(testdir, "foo.pdf")
|
infile = os.path.join(testdir, "foo.pdf")
|
||||||
|
|
@ -35,7 +43,7 @@ def test_cli_lattice():
|
||||||
cli, ["--format", "csv", "--output", outfile, "lattice", infile]
|
cli, ["--format", "csv", "--output", outfile, "lattice", infile]
|
||||||
)
|
)
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
assert result.output == "Found 1 tables\n"
|
assert "Found 1 tables" in result.output
|
||||||
|
|
||||||
result = runner.invoke(cli, ["--format", "csv", "lattice", infile])
|
result = runner.invoke(cli, ["--format", "csv", "lattice", infile])
|
||||||
output_error = "Error: Please specify output file path using --output"
|
output_error = "Error: Please specify output file path using --output"
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import pytest
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.testing import assert_frame_equal
|
from pandas.testing import assert_frame_equal
|
||||||
|
|
||||||
|
|
@ -16,6 +17,11 @@ from .data import *
|
||||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
|
|
||||||
|
skip_on_windows = pytest.mark.skipif(
|
||||||
|
sys.platform.startswith("win"),
|
||||||
|
reason="Ghostscript not installed in Windows test environment",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_version_generation():
|
def test_version_generation():
|
||||||
version = (0, 7, 3)
|
version = (0, 7, 3)
|
||||||
|
|
@ -32,6 +38,7 @@ def test_version_generation_with_prerelease_revision():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_parsing_report():
|
def test_parsing_report():
|
||||||
parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
|
parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
|
||||||
|
|
||||||
|
|
@ -61,10 +68,8 @@ def test_repr_poppler():
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_repr_ghostscript():
|
def test_repr_ghostscript():
|
||||||
if sys.platform not in ["linux", "darwin"]:
|
|
||||||
return True
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename,
|
filename,
|
||||||
|
|
@ -85,10 +90,8 @@ def test_url_poppler():
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_url_ghostscript():
|
def test_url_ghostscript():
|
||||||
if sys.platform not in ["linux", "darwin"]:
|
|
||||||
return True
|
|
||||||
|
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
|
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
|
||||||
|
|
@ -126,10 +129,8 @@ def test_pages_poppler():
|
||||||
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_pages_ghostscript():
|
def test_pages_ghostscript():
|
||||||
if sys.platform not in ["linux", "darwin"]:
|
|
||||||
return True
|
|
||||||
|
|
||||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
|
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -12,6 +13,11 @@ testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
|
|
||||||
|
skip_on_windows = pytest.mark.skipif(
|
||||||
|
sys.platform.startswith("win"),
|
||||||
|
reason="Ghostscript not installed in Windows test environment",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_unknown_flavor():
|
def test_unknown_flavor():
|
||||||
message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
|
message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
|
||||||
|
|
@ -32,56 +38,7 @@ def test_unsupported_format():
|
||||||
tables = camelot.read_pdf(filename)
|
tables = camelot.read_pdf(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_stream_equal_length():
|
@skip_on_windows
|
||||||
message = "Length of table_areas and columns" " should be equal"
|
|
||||||
with pytest.raises(ValueError, match=message):
|
|
||||||
tables = camelot.read_pdf(
|
|
||||||
filename,
|
|
||||||
flavor="stream",
|
|
||||||
table_areas=["10,20,30,40"],
|
|
||||||
columns=["10,20,30,40", "10,20,30,40"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_image_warning():
|
|
||||||
filename = os.path.join(testdir, "image.pdf")
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("error")
|
|
||||||
with pytest.raises(UserWarning) as e:
|
|
||||||
tables = camelot.read_pdf(filename)
|
|
||||||
assert (
|
|
||||||
str(e.value)
|
|
||||||
== "page-1 is image-based, camelot only works on text-based pages."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_lattice_no_tables_on_page():
|
|
||||||
filename = os.path.join(testdir, "empty.pdf")
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("error")
|
|
||||||
with pytest.raises(UserWarning) as e:
|
|
||||||
tables = camelot.read_pdf(filename, flavor="lattice")
|
|
||||||
assert str(e.value) == "No tables found on page-1"
|
|
||||||
|
|
||||||
|
|
||||||
def test_stream_no_tables_on_page():
|
|
||||||
filename = os.path.join(testdir, "empty.pdf")
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("error")
|
|
||||||
with pytest.raises(UserWarning) as e:
|
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
|
||||||
assert str(e.value) == "No tables found on page-1"
|
|
||||||
|
|
||||||
|
|
||||||
def test_stream_no_tables_in_area():
|
|
||||||
filename = os.path.join(testdir, "only_page_number.pdf")
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("error")
|
|
||||||
with pytest.raises(UserWarning) as e:
|
|
||||||
tables = camelot.read_pdf(filename, flavor="stream")
|
|
||||||
assert str(e.value) == "No tables found in table area 1"
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_tables_found_logs_suppressed():
|
def test_no_tables_found_logs_suppressed():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
|
|
@ -118,3 +75,81 @@ def test_bad_password():
|
||||||
message = "file has not been decrypted"
|
message = "file has not been decrypted"
|
||||||
with pytest.raises(Exception, match=message):
|
with pytest.raises(Exception, match=message):
|
||||||
tables = camelot.read_pdf(filename, password="wrongpass")
|
tables = camelot.read_pdf(filename, password="wrongpass")
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_equal_length():
|
||||||
|
message = "Length of table_areas and columns" " should be equal"
|
||||||
|
with pytest.raises(ValueError, match=message):
|
||||||
|
tables = camelot.read_pdf(
|
||||||
|
filename,
|
||||||
|
flavor="stream",
|
||||||
|
table_areas=["10,20,30,40"],
|
||||||
|
columns=["10,20,30,40", "10,20,30,40"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_image_warning():
|
||||||
|
filename = os.path.join(testdir, "image.pdf")
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error", category=UserWarning)
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename)
|
||||||
|
assert (
|
||||||
|
str(e.value)
|
||||||
|
== "page-1 is image-based, camelot only works on text-based pages."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_no_tables_on_page():
|
||||||
|
filename = os.path.join(testdir, "empty.pdf")
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
|
assert str(e.value) == "No tables found on page-1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_no_tables_in_area():
|
||||||
|
filename = os.path.join(testdir, "only_page_number.pdf")
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename, flavor="stream")
|
||||||
|
assert str(e.value) == "No tables found in table area 1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_lattice_no_tables_on_page():
|
||||||
|
filename = os.path.join(testdir, "empty.pdf")
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error", category=UserWarning)
|
||||||
|
with pytest.raises(UserWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename, flavor="lattice")
|
||||||
|
assert str(e.value) == "No tables found on page-1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_lattice_unknown_backend():
|
||||||
|
message = "Unknown backend 'mupdf' specified. Please use either 'poppler' or 'ghostscript'."
|
||||||
|
with pytest.raises(NotImplementedError, match=message):
|
||||||
|
tables = camelot.read_pdf(filename, backend="mupdf")
|
||||||
|
|
||||||
|
|
||||||
|
def test_lattice_no_convert_method():
|
||||||
|
class ConversionBackend(object):
|
||||||
|
pass
|
||||||
|
|
||||||
|
message = "must implement a 'convert' method"
|
||||||
|
with pytest.raises(NotImplementedError, match=message):
|
||||||
|
tables = camelot.read_pdf(filename, backend=ConversionBackend())
|
||||||
|
|
||||||
|
|
||||||
|
def test_lattice_ghostscript_deprecation_warning():
|
||||||
|
ghostscript_deprecation_warning = (
|
||||||
|
"'ghostscript' will be replaced by 'poppler' as the default image conversion"
|
||||||
|
" backend in v0.12.0. You can try out 'poppler' with backend='poppler'."
|
||||||
|
)
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("error")
|
||||||
|
with pytest.raises(DeprecationWarning) as e:
|
||||||
|
tables = camelot.read_pdf(filename)
|
||||||
|
assert str(e.value) == ghostscript_deprecation_warning
|
||||||
|
|
|
||||||
|
|
@ -4,18 +4,16 @@ import pytest
|
||||||
|
|
||||||
import camelot.backends.image_conversion
|
import camelot.backends.image_conversion
|
||||||
from camelot.backends import ImageConversionBackend
|
from camelot.backends import ImageConversionBackend
|
||||||
from camelot.backends.poppler_backend import PopplerBackend
|
|
||||||
from camelot.backends.ghostscript_backend import GhostscriptBackend
|
|
||||||
|
|
||||||
|
|
||||||
class PopplerBackendError(object):
|
class PopplerBackendError(object):
|
||||||
def convert(self, pdf_path, png_path):
|
def convert(self, pdf_path, png_path):
|
||||||
raise ValueError('conversion failed')
|
raise ValueError("Image conversion failed")
|
||||||
|
|
||||||
|
|
||||||
class GhostscriptBackendError(object):
|
class GhostscriptBackendError(object):
|
||||||
def convert(self, pdf_path, png_path):
|
def convert(self, pdf_path, png_path):
|
||||||
raise ValueError('conversion failed')
|
raise ValueError("Image conversion failed")
|
||||||
|
|
||||||
|
|
||||||
class GhostscriptBackendNoError(object):
|
class GhostscriptBackendNoError(object):
|
||||||
|
|
@ -24,26 +22,39 @@ class GhostscriptBackendNoError(object):
|
||||||
|
|
||||||
|
|
||||||
def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
|
def test_poppler_backend_error_when_no_use_fallback(monkeypatch):
|
||||||
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError}
|
BACKENDS = {
|
||||||
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True)
|
"poppler": PopplerBackendError,
|
||||||
|
"ghostscript": GhostscriptBackendNoError,
|
||||||
|
}
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
|
||||||
|
)
|
||||||
backend = ImageConversionBackend(use_fallback=False)
|
backend = ImageConversionBackend(use_fallback=False)
|
||||||
|
|
||||||
message = "conversion failed with image conversion backend 'poppler'"
|
message = "Image conversion failed with image conversion backend 'poppler'"
|
||||||
with pytest.raises(ValueError, match=message):
|
with pytest.raises(ValueError, match=message):
|
||||||
backend.convert('foo', 'bar')
|
backend.convert("foo", "bar")
|
||||||
|
|
||||||
|
|
||||||
def test_ghostscript_backend_when_use_fallback(monkeypatch):
|
def test_ghostscript_backend_when_use_fallback(monkeypatch):
|
||||||
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendNoError}
|
BACKENDS = {
|
||||||
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True)
|
"poppler": PopplerBackendError,
|
||||||
|
"ghostscript": GhostscriptBackendNoError,
|
||||||
|
}
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
|
||||||
|
)
|
||||||
backend = ImageConversionBackend()
|
backend = ImageConversionBackend()
|
||||||
backend.convert('foo', 'bar')
|
backend.convert("foo", "bar")
|
||||||
|
|
||||||
|
|
||||||
def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
|
def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
|
||||||
backends = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError}
|
BACKENDS = {"poppler": PopplerBackendError, "ghostscript": GhostscriptBackendError}
|
||||||
monkeypatch.setattr("camelot.backends.image_conversion.backends", backends, raising=True)
|
monkeypatch.setattr(
|
||||||
|
"camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True
|
||||||
|
)
|
||||||
backend = ImageConversionBackend()
|
backend = ImageConversionBackend()
|
||||||
|
|
||||||
message = "conversion failed with image conversion backend 'ghostscript'"
|
message = "Image conversion failed with image conversion backend 'ghostscript'"
|
||||||
with pytest.raises(ValueError, match=message):
|
with pytest.raises(ValueError, match=message):
|
||||||
backend.convert('foo', 'bar')
|
backend.convert("foo", "bar")
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,9 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pytest
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.testing import assert_frame_equal
|
from pandas.testing import assert_frame_equal
|
||||||
|
|
||||||
|
|
@ -14,7 +16,13 @@ from .data import *
|
||||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
|
|
||||||
|
skip_on_windows = pytest.mark.skipif(
|
||||||
|
sys.platform.startswith("win"),
|
||||||
|
reason="Ghostscript not installed in Windows test environment",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice():
|
def test_lattice():
|
||||||
df = pd.DataFrame(data_lattice)
|
df = pd.DataFrame(data_lattice)
|
||||||
|
|
||||||
|
|
@ -25,6 +33,7 @@ def test_lattice():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_table_rotated():
|
def test_lattice_table_rotated():
|
||||||
df = pd.DataFrame(data_lattice_table_rotated)
|
df = pd.DataFrame(data_lattice_table_rotated)
|
||||||
|
|
||||||
|
|
@ -37,6 +46,7 @@ def test_lattice_table_rotated():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_two_tables():
|
def test_lattice_two_tables():
|
||||||
df1 = pd.DataFrame(data_lattice_two_tables_1)
|
df1 = pd.DataFrame(data_lattice_two_tables_1)
|
||||||
df2 = pd.DataFrame(data_lattice_two_tables_2)
|
df2 = pd.DataFrame(data_lattice_two_tables_2)
|
||||||
|
|
@ -48,6 +58,7 @@ def test_lattice_two_tables():
|
||||||
assert df2.equals(tables[1].df)
|
assert df2.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_table_regions():
|
def test_lattice_table_regions():
|
||||||
df = pd.DataFrame(data_lattice_table_regions)
|
df = pd.DataFrame(data_lattice_table_regions)
|
||||||
|
|
||||||
|
|
@ -56,6 +67,7 @@ def test_lattice_table_regions():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_table_areas():
|
def test_lattice_table_areas():
|
||||||
df = pd.DataFrame(data_lattice_table_areas)
|
df = pd.DataFrame(data_lattice_table_areas)
|
||||||
|
|
||||||
|
|
@ -64,6 +76,7 @@ def test_lattice_table_areas():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_process_background():
|
def test_lattice_process_background():
|
||||||
df = pd.DataFrame(data_lattice_process_background)
|
df = pd.DataFrame(data_lattice_process_background)
|
||||||
|
|
||||||
|
|
@ -72,6 +85,7 @@ def test_lattice_process_background():
|
||||||
assert_frame_equal(df, tables[1].df)
|
assert_frame_equal(df, tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_copy_text():
|
def test_lattice_copy_text():
|
||||||
df = pd.DataFrame(data_lattice_copy_text)
|
df = pd.DataFrame(data_lattice_copy_text)
|
||||||
|
|
||||||
|
|
@ -80,6 +94,7 @@ def test_lattice_copy_text():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_shift_text():
|
def test_lattice_shift_text():
|
||||||
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
|
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
|
||||||
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
|
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
|
||||||
|
|
@ -96,6 +111,7 @@ def test_lattice_shift_text():
|
||||||
assert df_rb.equals(tables[0].df)
|
assert df_rb.equals(tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
def test_lattice_arabic():
|
def test_lattice_arabic():
|
||||||
df = pd.DataFrame(data_arabic)
|
df = pd.DataFrame(data_arabic)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,14 +6,17 @@ import sys
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import camelot
|
import camelot
|
||||||
from camelot.backends.poppler_backend import PopplerBackend
|
|
||||||
from camelot.backends.ghostscript_backend import GhostscriptBackend
|
|
||||||
|
|
||||||
|
|
||||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||||
testdir = os.path.join(testdir, "files")
|
testdir = os.path.join(testdir, "files")
|
||||||
|
|
||||||
|
skip_on_windows = pytest.mark.skipif(
|
||||||
|
sys.platform.startswith("win"),
|
||||||
|
reason="Ghostscript not installed in Windows test environment",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_text_plot():
|
def test_text_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
|
|
@ -31,17 +34,15 @@ def test_textedge_plot():
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_lattice_contour_plot_poppler():
|
def test_lattice_contour_plot_poppler():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=PopplerBackend())
|
tables = camelot.read_pdf(filename, backend="poppler")
|
||||||
return camelot.plot(tables[0], kind="contour")
|
return camelot.plot(tables[0], kind="contour")
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_lattice_contour_plot_ghostscript():
|
def test_lattice_contour_plot_ghostscript():
|
||||||
if sys.platform not in ["linux", "darwin"]:
|
|
||||||
pytest.skip("Skipping ghostscript test on Windows")
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
|
tables = camelot.read_pdf(filename, backend="ghostscript")
|
||||||
return camelot.plot(tables[0], kind="contour")
|
return camelot.plot(tables[0], kind="contour")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -55,49 +56,43 @@ def test_stream_contour_plot():
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_line_plot_poppler():
|
def test_line_plot_poppler():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=PopplerBackend())
|
tables = camelot.read_pdf(filename, backend="poppler")
|
||||||
return camelot.plot(tables[0], kind="line")
|
return camelot.plot(tables[0], kind="line")
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_line_plot_ghostscript():
|
def test_line_plot_ghostscript():
|
||||||
if sys.platform not in ["linux", "darwin"]:
|
|
||||||
pytest.skip("Skipping ghostscript test on Windows")
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
|
tables = camelot.read_pdf(filename, backend="ghostscript")
|
||||||
return camelot.plot(tables[0], kind="line")
|
return camelot.plot(tables[0], kind="line")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_joint_plot_poppler():
|
def test_joint_plot_poppler():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=PopplerBackend())
|
tables = camelot.read_pdf(filename, backend="poppler")
|
||||||
return camelot.plot(tables[0], kind="joint")
|
return camelot.plot(tables[0], kind="joint")
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_joint_plot_ghostscript():
|
def test_joint_plot_ghostscript():
|
||||||
if sys.platform not in ["linux", "darwin"]:
|
|
||||||
pytest.skip("Skipping ghostscript test on Windows")
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
|
tables = camelot.read_pdf(filename, backend="ghostscript")
|
||||||
return camelot.plot(tables[0], kind="joint")
|
return camelot.plot(tables[0], kind="joint")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_grid_plot_poppler():
|
def test_grid_plot_poppler():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=PopplerBackend())
|
tables = camelot.read_pdf(filename, backend="poppler")
|
||||||
return camelot.plot(tables[0], kind="grid")
|
return camelot.plot(tables[0], kind="grid")
|
||||||
|
|
||||||
|
|
||||||
|
@skip_on_windows
|
||||||
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
@pytest.mark.mpl_image_compare(baseline_dir="files/baseline_plots", remove_text=True)
|
||||||
def test_grid_plot_ghostscript():
|
def test_grid_plot_ghostscript():
|
||||||
if sys.platform not in ["linux", "darwin"]:
|
|
||||||
pytest.skip("Skipping ghostscript test on Windows")
|
|
||||||
|
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
|
tables = camelot.read_pdf(filename, backend="ghostscript")
|
||||||
return camelot.plot(tables[0], kind="grid")
|
return camelot.plot(tables[0], kind="grid")
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.testing import assert_frame_equal
|
from pandas.testing import assert_frame_equal
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue