diff --git a/camelot/backends/image_conversion.py b/camelot/backends/image_conversion.py index 9aeaecf..004bce7 100644 --- a/camelot/backends/image_conversion.py +++ b/camelot/backends/image_conversion.py @@ -23,7 +23,9 @@ class ImageConversionBackend(object): converter = backends[self.backend]() converter.convert(pdf_path, png_path) except Exception as e: - logger.info(f"Image conversion backend '{self.backend}' failed with '{str(e)}'") + logger.info( + f"Image conversion backend '{self.backend}' failed with '{str(e)}'" + ) if self.use_fallback: for fallback in self.fallbacks: @@ -33,7 +35,9 @@ class ImageConversionBackend(object): converter = backends[fallback]() converter.convert(pdf_path, png_path) except Exception as e: - logger.info(f"Image conversion backend '{fallback}' failed with '{str(e)}'") + logger.info( + f"Image conversion backend '{fallback}' failed with '{str(e)}'" + ) continue else: logger.info(f"Image conversion backend '{fallback}' succeeded") diff --git a/camelot/core.py b/camelot/core.py index 63ddc15..58a98ef 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -288,10 +288,10 @@ class Cell(object): self._text = "" def __repr__(self): - x1 = round(self.x1, 2) - y1 = round(self.y1, 2) - x2 = round(self.x2, 2) - y2 = round(self.y2, 2) + x1 = round(self.x1) + y1 = round(self.y1) + x2 = round(self.x2) + y2 = round(self.y2) return f"" @property diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 50530cc..937e867 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -112,6 +112,7 @@ class Lattice(BaseParser): threshold_constant=-2, iterations=0, resolution=300, + backend=ImageConversionBackend(), **kwargs, ): self.table_regions = table_regions @@ -129,7 +130,7 @@ class Lattice(BaseParser): self.threshold_constant = threshold_constant self.iterations = iterations self.resolution = resolution - self.backend = ImageConversionBackend() + self.backend = backend @staticmethod def _reduce_index(t, idx, shift_text): diff --git a/tests/test_common.py b/tests/test_common.py index cb9a968..ddb8de2 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -8,6 +8,7 @@ from pandas.testing import assert_frame_equal import camelot from camelot.core import Table, TableList from camelot.__version__ import generate_version +from camelot.backends import ImageConversionBackend from .data import * @@ -15,6 +16,21 @@ testdir = os.path.dirname(os.path.abspath(__file__)) testdir = os.path.join(testdir, "files") +def test_version_generation(): + version = (0, 7, 3) + assert generate_version(version, prerelease=None, revision=None) == "0.7.3" + + +def test_version_generation_with_prerelease_revision(): + version = (0, 7, 3) + prerelease = "alpha" + revision = 2 + assert ( + generate_version(version, prerelease=prerelease, revision=revision) + == "0.7.3-alpha.2" + ) + + def test_parsing_report(): parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1} @@ -34,246 +50,92 @@ def test_password(): assert_frame_equal(df, tables[0].df) -def test_stream(): - df = pd.DataFrame(data_stream) - - filename = os.path.join(testdir, "health.pdf") - tables = camelot.read_pdf(filename, flavor="stream") - assert_frame_equal(df, tables[0].df) - - -def test_stream_table_rotated(): - df = pd.DataFrame(data_stream_table_rotated) - - filename = os.path.join(testdir, "clockwise_table_2.pdf") - tables = camelot.read_pdf(filename, flavor="stream") - assert_frame_equal(df, tables[0].df) - - filename = os.path.join(testdir, "anticlockwise_table_2.pdf") - tables = camelot.read_pdf(filename, flavor="stream") - assert_frame_equal(df, tables[0].df) - - -def test_stream_two_tables(): - df1 = pd.DataFrame(data_stream_two_tables_1) - df2 = pd.DataFrame(data_stream_two_tables_2) - - filename = os.path.join(testdir, "tabula/12s0324.pdf") - tables = camelot.read_pdf(filename, flavor="stream") - - assert len(tables) == 2 - assert df1.equals(tables[0].df) - assert df2.equals(tables[1].df) - - -def test_stream_table_regions(): - df = pd.DataFrame(data_stream_table_areas) - - filename = os.path.join(testdir, "tabula/us-007.pdf") - tables = camelot.read_pdf( - filename, flavor="stream", table_regions=["320,460,573,335"] - ) - assert_frame_equal(df, tables[0].df) - - -def test_stream_table_areas(): - df = pd.DataFrame(data_stream_table_areas) - - filename = os.path.join(testdir, "tabula/us-007.pdf") - tables = camelot.read_pdf( - filename, flavor="stream", table_areas=["320,500,573,335"] - ) - assert_frame_equal(df, tables[0].df) - - -def test_stream_columns(): - df = pd.DataFrame(data_stream_columns) - - filename = os.path.join(testdir, "mexican_towns.pdf") - tables = camelot.read_pdf( - filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10 - ) - assert_frame_equal(df, tables[0].df) - - -def test_stream_split_text(): - df = pd.DataFrame(data_stream_split_text) - - filename = os.path.join(testdir, "tabula/m27.pdf") - tables = camelot.read_pdf( - filename, - flavor="stream", - columns=["72,95,209,327,442,529,566,606,683"], - split_text=True, - ) - assert_frame_equal(df, tables[0].df) - - -def test_stream_flag_size(): - df = pd.DataFrame(data_stream_flag_size) - - filename = os.path.join(testdir, "superscript.pdf") - tables = camelot.read_pdf(filename, flavor="stream", flag_size=True) - assert_frame_equal(df, tables[0].df) - - -def test_stream_strip_text(): - df = pd.DataFrame(data_stream_strip_text) - - filename = os.path.join(testdir, "detect_vertical_false.pdf") - tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n") - assert_frame_equal(df, tables[0].df) - - -def test_stream_edge_tol(): - df = pd.DataFrame(data_stream_edge_tol) - - filename = os.path.join(testdir, "edge_tol.pdf") - tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500) - assert_frame_equal(df, tables[0].df) - - -def test_stream_layout_kwargs(): - df = pd.DataFrame(data_stream_layout_kwargs) - - filename = os.path.join(testdir, "detect_vertical_false.pdf") - tables = camelot.read_pdf( - filename, flavor="stream", layout_kwargs={"detect_vertical": False} - ) - assert_frame_equal(df, tables[0].df) - - -def test_lattice(): - df = pd.DataFrame(data_lattice) - - filename = os.path.join( - testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf" - ) - tables = camelot.read_pdf(filename, pages="2") - assert_frame_equal(df, tables[0].df) - - -def test_lattice_table_rotated(): - df = pd.DataFrame(data_lattice_table_rotated) - - filename = os.path.join(testdir, "clockwise_table_1.pdf") - tables = camelot.read_pdf(filename) - assert_frame_equal(df, tables[0].df) - - filename = os.path.join(testdir, "anticlockwise_table_1.pdf") - tables = camelot.read_pdf(filename) - assert_frame_equal(df, tables[0].df) - - -def test_lattice_two_tables(): - df1 = pd.DataFrame(data_lattice_two_tables_1) - df2 = pd.DataFrame(data_lattice_two_tables_2) - - filename = os.path.join(testdir, "twotables_2.pdf") - tables = camelot.read_pdf(filename) - assert len(tables) == 2 - assert df1.equals(tables[0].df) - assert df2.equals(tables[1].df) - - -def test_lattice_table_regions(): - df = pd.DataFrame(data_lattice_table_regions) - - filename = os.path.join(testdir, "table_region.pdf") - tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"]) - assert_frame_equal(df, tables[0].df) - - -def test_lattice_table_areas(): - df = pd.DataFrame(data_lattice_table_areas) - - filename = os.path.join(testdir, "twotables_2.pdf") - tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"]) - assert_frame_equal(df, tables[0].df) - - -def test_lattice_process_background(): - df = pd.DataFrame(data_lattice_process_background) - - filename = os.path.join(testdir, "background_lines_1.pdf") - tables = camelot.read_pdf(filename, process_background=True) - assert_frame_equal(df, tables[1].df) - - -def test_lattice_copy_text(): - df = pd.DataFrame(data_lattice_copy_text) - - filename = os.path.join(testdir, "row_span_1.pdf") - tables = camelot.read_pdf(filename, line_scale=60, copy_text="v") - assert_frame_equal(df, tables[0].df) - - -def test_lattice_shift_text(): - df_lt = pd.DataFrame(data_lattice_shift_text_left_top) - df_disable = pd.DataFrame(data_lattice_shift_text_disable) - df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) - - filename = os.path.join(testdir, "column_span_2.pdf") - tables = camelot.read_pdf(filename, line_scale=40) - assert df_lt.equals(tables[0].df) - - tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""]) - assert df_disable.equals(tables[0].df) - - tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"]) - assert df_rb.equals(tables[0].df) - - -def test_repr(): +def test_repr_poppler(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename) assert repr(tables) == "" assert repr(tables[0]) == "" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == "" ) -def test_pages(): +def test_repr_ghostscript(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename, backend=ImageConversionBackend(backend="ghostscript")) + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert ( + repr(tables[0].cells[0][0]) == "" + ) + + +def test_url_poppler(): url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" tables = camelot.read_pdf(url) assert repr(tables) == "" assert repr(tables[0]) == "
" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == "" + ) + + +def test_url_ghostscript(): + url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" + tables = camelot.read_pdf(url, backend=ImageConversionBackend(backend="ghostscript")) + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert ( + repr(tables[0].cells[0][0]) == "" + ) + + +def test_pages_poppler(): + url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" + tables = camelot.read_pdf(url) + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert ( + repr(tables[0].cells[0][0]) == "" ) tables = camelot.read_pdf(url, pages="1-end") assert repr(tables) == "" assert repr(tables[0]) == "
" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == "" ) tables = camelot.read_pdf(url, pages="all") assert repr(tables) == "" assert repr(tables[0]) == "
" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == "" ) -def test_url(): +def test_pages_ghostscript(): url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" tables = camelot.read_pdf(url) assert repr(tables) == "" assert repr(tables[0]) == "
" assert ( - repr(tables[0].cells[0][0]) == "" + repr(tables[0].cells[0][0]) == "" ) + tables = camelot.read_pdf(url, pages="1-end") + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert ( + repr(tables[0].cells[0][0]) == "" + ) -def test_arabic(): - df = pd.DataFrame(data_arabic) - - filename = os.path.join(testdir, "tabula/arabic.pdf") - tables = camelot.read_pdf(filename) - assert_frame_equal(df, tables[0].df) + tables = camelot.read_pdf(url, pages="all") + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert ( + repr(tables[0].cells[0][0]) == "" + ) def test_table_order(): @@ -299,26 +161,3 @@ def test_table_order(): (1, 2), (1, 1), ] - - -def test_version_generation(): - version = (0, 7, 3) - assert generate_version(version, prerelease=None, revision=None) == "0.7.3" - - -def test_version_generation_with_prerelease_revision(): - version = (0, 7, 3) - prerelease = "alpha" - revision = 2 - assert ( - generate_version(version, prerelease=prerelease, revision=revision) - == "0.7.3-alpha.2" - ) - - -def test_stream_duplicated_text(): - df = pd.DataFrame(data_stream_duplicated_text) - - filename = os.path.join(testdir, "birdisland.pdf") - tables = camelot.read_pdf(filename, flavor="stream") - assert_frame_equal(df, tables[0].df) diff --git a/tests/test_lattice.py b/tests/test_lattice.py new file mode 100644 index 0000000..7706b4a --- /dev/null +++ b/tests/test_lattice.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- + +import os + +import pandas as pd +from pandas.testing import assert_frame_equal + +import camelot +from camelot.core import Table, TableList +from camelot.__version__ import generate_version + +from .data import * + +testdir = os.path.dirname(os.path.abspath(__file__)) +testdir = os.path.join(testdir, "files") + + +def test_lattice(): + df = pd.DataFrame(data_lattice) + + filename = os.path.join( + testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf" + ) + tables = camelot.read_pdf(filename, pages="2") + assert_frame_equal(df, tables[0].df) + + +def test_lattice_table_rotated(): + df = pd.DataFrame(data_lattice_table_rotated) + + filename = os.path.join(testdir, "clockwise_table_1.pdf") + tables = camelot.read_pdf(filename) + assert_frame_equal(df, tables[0].df) + + filename = os.path.join(testdir, "anticlockwise_table_1.pdf") + tables = camelot.read_pdf(filename) + assert_frame_equal(df, tables[0].df) + + +def test_lattice_two_tables(): + df1 = pd.DataFrame(data_lattice_two_tables_1) + df2 = pd.DataFrame(data_lattice_two_tables_2) + + filename = os.path.join(testdir, "twotables_2.pdf") + tables = camelot.read_pdf(filename) + assert len(tables) == 2 + assert df1.equals(tables[0].df) + assert df2.equals(tables[1].df) + + +def test_lattice_table_regions(): + df = pd.DataFrame(data_lattice_table_regions) + + filename = os.path.join(testdir, "table_region.pdf") + tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"]) + assert_frame_equal(df, tables[0].df) + + +def test_lattice_table_areas(): + df = pd.DataFrame(data_lattice_table_areas) + + filename = os.path.join(testdir, "twotables_2.pdf") + tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"]) + assert_frame_equal(df, tables[0].df) + + +def test_lattice_process_background(): + df = pd.DataFrame(data_lattice_process_background) + + filename = os.path.join(testdir, "background_lines_1.pdf") + tables = camelot.read_pdf(filename, process_background=True) + assert_frame_equal(df, tables[1].df) + + +def test_lattice_copy_text(): + df = pd.DataFrame(data_lattice_copy_text) + + filename = os.path.join(testdir, "row_span_1.pdf") + tables = camelot.read_pdf(filename, line_scale=60, copy_text="v") + assert_frame_equal(df, tables[0].df) + + +def test_lattice_shift_text(): + df_lt = pd.DataFrame(data_lattice_shift_text_left_top) + df_disable = pd.DataFrame(data_lattice_shift_text_disable) + df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom) + + filename = os.path.join(testdir, "column_span_2.pdf") + tables = camelot.read_pdf(filename, line_scale=40) + assert df_lt.equals(tables[0].df) + + tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""]) + assert df_disable.equals(tables[0].df) + + tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"]) + assert df_rb.equals(tables[0].df) + + +def test_lattice_arabic(): + df = pd.DataFrame(data_arabic) + + filename = os.path.join(testdir, "tabula/arabic.pdf") + tables = camelot.read_pdf(filename) + assert_frame_equal(df, tables[0].df) diff --git a/tests/test_stream.py b/tests/test_stream.py new file mode 100644 index 0000000..4a0ec0c --- /dev/null +++ b/tests/test_stream.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- + +import os + +import pandas as pd +from pandas.testing import assert_frame_equal + +import camelot +from camelot.core import Table, TableList +from camelot.__version__ import generate_version + +from .data import * + +testdir = os.path.dirname(os.path.abspath(__file__)) +testdir = os.path.join(testdir, "files") + + +def test_stream(): + df = pd.DataFrame(data_stream) + + filename = os.path.join(testdir, "health.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + assert_frame_equal(df, tables[0].df) + + +def test_stream_table_rotated(): + df = pd.DataFrame(data_stream_table_rotated) + + filename = os.path.join(testdir, "clockwise_table_2.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + assert_frame_equal(df, tables[0].df) + + filename = os.path.join(testdir, "anticlockwise_table_2.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + assert_frame_equal(df, tables[0].df) + + +def test_stream_two_tables(): + df1 = pd.DataFrame(data_stream_two_tables_1) + df2 = pd.DataFrame(data_stream_two_tables_2) + + filename = os.path.join(testdir, "tabula/12s0324.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + + assert len(tables) == 2 + assert df1.equals(tables[0].df) + assert df2.equals(tables[1].df) + + +def test_stream_table_regions(): + df = pd.DataFrame(data_stream_table_areas) + + filename = os.path.join(testdir, "tabula/us-007.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", table_regions=["320,460,573,335"] + ) + assert_frame_equal(df, tables[0].df) + + +def test_stream_table_areas(): + df = pd.DataFrame(data_stream_table_areas) + + filename = os.path.join(testdir, "tabula/us-007.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", table_areas=["320,500,573,335"] + ) + assert_frame_equal(df, tables[0].df) + + +def test_stream_columns(): + df = pd.DataFrame(data_stream_columns) + + filename = os.path.join(testdir, "mexican_towns.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10 + ) + assert_frame_equal(df, tables[0].df) + + +def test_stream_split_text(): + df = pd.DataFrame(data_stream_split_text) + + filename = os.path.join(testdir, "tabula/m27.pdf") + tables = camelot.read_pdf( + filename, + flavor="stream", + columns=["72,95,209,327,442,529,566,606,683"], + split_text=True, + ) + assert_frame_equal(df, tables[0].df) + + +def test_stream_flag_size(): + df = pd.DataFrame(data_stream_flag_size) + + filename = os.path.join(testdir, "superscript.pdf") + tables = camelot.read_pdf(filename, flavor="stream", flag_size=True) + assert_frame_equal(df, tables[0].df) + + +def test_stream_strip_text(): + df = pd.DataFrame(data_stream_strip_text) + + filename = os.path.join(testdir, "detect_vertical_false.pdf") + tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n") + assert_frame_equal(df, tables[0].df) + + +def test_stream_edge_tol(): + df = pd.DataFrame(data_stream_edge_tol) + + filename = os.path.join(testdir, "edge_tol.pdf") + tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500) + assert_frame_equal(df, tables[0].df) + + +def test_stream_layout_kwargs(): + df = pd.DataFrame(data_stream_layout_kwargs) + + filename = os.path.join(testdir, "detect_vertical_false.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", layout_kwargs={"detect_vertical": False} + ) + assert_frame_equal(df, tables[0].df) + + +def test_stream_duplicated_text(): + df = pd.DataFrame(data_stream_duplicated_text) + + filename = os.path.join(testdir, "birdisland.pdf") + tables = camelot.read_pdf(filename, flavor="stream") + assert_frame_equal(df, tables[0].df)