Split tests for lattice and stream, and fix test_common reprs
parent
3ddc02b2f2
commit
36dcfe99d8
|
|
@ -23,7 +23,9 @@ class ImageConversionBackend(object):
|
|||
converter = backends[self.backend]()
|
||||
converter.convert(pdf_path, png_path)
|
||||
except Exception as e:
|
||||
logger.info(f"Image conversion backend '{self.backend}' failed with '{str(e)}'")
|
||||
logger.info(
|
||||
f"Image conversion backend '{self.backend}' failed with '{str(e)}'"
|
||||
)
|
||||
|
||||
if self.use_fallback:
|
||||
for fallback in self.fallbacks:
|
||||
|
|
@ -33,7 +35,9 @@ class ImageConversionBackend(object):
|
|||
converter = backends[fallback]()
|
||||
converter.convert(pdf_path, png_path)
|
||||
except Exception as e:
|
||||
logger.info(f"Image conversion backend '{fallback}' failed with '{str(e)}'")
|
||||
logger.info(
|
||||
f"Image conversion backend '{fallback}' failed with '{str(e)}'"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
logger.info(f"Image conversion backend '{fallback}' succeeded")
|
||||
|
|
|
|||
|
|
@ -288,10 +288,10 @@ class Cell(object):
|
|||
self._text = ""
|
||||
|
||||
def __repr__(self):
|
||||
x1 = round(self.x1, 2)
|
||||
y1 = round(self.y1, 2)
|
||||
x2 = round(self.x2, 2)
|
||||
y2 = round(self.y2, 2)
|
||||
x1 = round(self.x1)
|
||||
y1 = round(self.y1)
|
||||
x2 = round(self.x2)
|
||||
y2 = round(self.y2)
|
||||
return f"<Cell x1={x1} y1={y1} x2={x2} y2={y2}>"
|
||||
|
||||
@property
|
||||
|
|
|
|||
|
|
@ -112,6 +112,7 @@ class Lattice(BaseParser):
|
|||
threshold_constant=-2,
|
||||
iterations=0,
|
||||
resolution=300,
|
||||
backend=ImageConversionBackend(),
|
||||
**kwargs,
|
||||
):
|
||||
self.table_regions = table_regions
|
||||
|
|
@ -129,7 +130,7 @@ class Lattice(BaseParser):
|
|||
self.threshold_constant = threshold_constant
|
||||
self.iterations = iterations
|
||||
self.resolution = resolution
|
||||
self.backend = ImageConversionBackend()
|
||||
self.backend = backend
|
||||
|
||||
@staticmethod
|
||||
def _reduce_index(t, idx, shift_text):
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from pandas.testing import assert_frame_equal
|
|||
import camelot
|
||||
from camelot.core import Table, TableList
|
||||
from camelot.__version__ import generate_version
|
||||
from camelot.backends import ImageConversionBackend
|
||||
|
||||
from .data import *
|
||||
|
||||
|
|
@ -15,6 +16,21 @@ testdir = os.path.dirname(os.path.abspath(__file__))
|
|||
testdir = os.path.join(testdir, "files")
|
||||
|
||||
|
||||
def test_version_generation():
|
||||
version = (0, 7, 3)
|
||||
assert generate_version(version, prerelease=None, revision=None) == "0.7.3"
|
||||
|
||||
|
||||
def test_version_generation_with_prerelease_revision():
|
||||
version = (0, 7, 3)
|
||||
prerelease = "alpha"
|
||||
revision = 2
|
||||
assert (
|
||||
generate_version(version, prerelease=prerelease, revision=revision)
|
||||
== "0.7.3-alpha.2"
|
||||
)
|
||||
|
||||
|
||||
def test_parsing_report():
|
||||
parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
|
||||
|
||||
|
|
@ -34,246 +50,92 @@ def test_password():
|
|||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream():
|
||||
df = pd.DataFrame(data_stream)
|
||||
|
||||
filename = os.path.join(testdir, "health.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_table_rotated():
|
||||
df = pd.DataFrame(data_stream_table_rotated)
|
||||
|
||||
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_two_tables():
|
||||
df1 = pd.DataFrame(data_stream_two_tables_1)
|
||||
df2 = pd.DataFrame(data_stream_two_tables_2)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
|
||||
assert len(tables) == 2
|
||||
assert df1.equals(tables[0].df)
|
||||
assert df2.equals(tables[1].df)
|
||||
|
||||
|
||||
def test_stream_table_regions():
|
||||
df = pd.DataFrame(data_stream_table_areas)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", table_regions=["320,460,573,335"]
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_table_areas():
|
||||
df = pd.DataFrame(data_stream_table_areas)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", table_areas=["320,500,573,335"]
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_columns():
|
||||
df = pd.DataFrame(data_stream_columns)
|
||||
|
||||
filename = os.path.join(testdir, "mexican_towns.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_split_text():
|
||||
df = pd.DataFrame(data_stream_split_text)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/m27.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename,
|
||||
flavor="stream",
|
||||
columns=["72,95,209,327,442,529,566,606,683"],
|
||||
split_text=True,
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_flag_size():
|
||||
df = pd.DataFrame(data_stream_flag_size)
|
||||
|
||||
filename = os.path.join(testdir, "superscript.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_strip_text():
|
||||
df = pd.DataFrame(data_stream_strip_text)
|
||||
|
||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_edge_tol():
|
||||
df = pd.DataFrame(data_stream_edge_tol)
|
||||
|
||||
filename = os.path.join(testdir, "edge_tol.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_layout_kwargs():
|
||||
df = pd.DataFrame(data_stream_layout_kwargs)
|
||||
|
||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", layout_kwargs={"detect_vertical": False}
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice():
|
||||
df = pd.DataFrame(data_lattice)
|
||||
|
||||
filename = os.path.join(
|
||||
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
|
||||
)
|
||||
tables = camelot.read_pdf(filename, pages="2")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_table_rotated():
|
||||
df = pd.DataFrame(data_lattice_table_rotated)
|
||||
|
||||
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_two_tables():
|
||||
df1 = pd.DataFrame(data_lattice_two_tables_1)
|
||||
df2 = pd.DataFrame(data_lattice_two_tables_2)
|
||||
|
||||
filename = os.path.join(testdir, "twotables_2.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert len(tables) == 2
|
||||
assert df1.equals(tables[0].df)
|
||||
assert df2.equals(tables[1].df)
|
||||
|
||||
|
||||
def test_lattice_table_regions():
|
||||
df = pd.DataFrame(data_lattice_table_regions)
|
||||
|
||||
filename = os.path.join(testdir, "table_region.pdf")
|
||||
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_table_areas():
|
||||
df = pd.DataFrame(data_lattice_table_areas)
|
||||
|
||||
filename = os.path.join(testdir, "twotables_2.pdf")
|
||||
tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_process_background():
|
||||
df = pd.DataFrame(data_lattice_process_background)
|
||||
|
||||
filename = os.path.join(testdir, "background_lines_1.pdf")
|
||||
tables = camelot.read_pdf(filename, process_background=True)
|
||||
assert_frame_equal(df, tables[1].df)
|
||||
|
||||
|
||||
def test_lattice_copy_text():
|
||||
df = pd.DataFrame(data_lattice_copy_text)
|
||||
|
||||
filename = os.path.join(testdir, "row_span_1.pdf")
|
||||
tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_shift_text():
|
||||
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
|
||||
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
|
||||
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
||||
|
||||
filename = os.path.join(testdir, "column_span_2.pdf")
|
||||
tables = camelot.read_pdf(filename, line_scale=40)
|
||||
assert df_lt.equals(tables[0].df)
|
||||
|
||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
|
||||
assert df_disable.equals(tables[0].df)
|
||||
|
||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
|
||||
assert df_rb.equals(tables[0].df)
|
||||
|
||||
|
||||
def test_repr():
|
||||
def test_repr_poppler():
|
||||
filename = os.path.join(testdir, "foo.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
|
||||
def test_pages():
|
||||
def test_repr_ghostscript():
|
||||
filename = os.path.join(testdir, "foo.pdf")
|
||||
tables = camelot.read_pdf(filename, backend=ImageConversionBackend(backend="ghostscript"))
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
|
||||
def test_url_poppler():
|
||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||
tables = camelot.read_pdf(url)
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
|
||||
def test_url_ghostscript():
|
||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||
tables = camelot.read_pdf(url, backend=ImageConversionBackend(backend="ghostscript"))
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
|
||||
def test_pages_poppler():
|
||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||
tables = camelot.read_pdf(url)
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
tables = camelot.read_pdf(url, pages="1-end")
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
tables = camelot.read_pdf(url, pages="all")
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
|
||||
def test_url():
|
||||
def test_pages_ghostscript():
|
||||
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
|
||||
tables = camelot.read_pdf(url)
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
tables = camelot.read_pdf(url, pages="1-end")
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
def test_arabic():
|
||||
df = pd.DataFrame(data_arabic)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/arabic.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
tables = camelot.read_pdf(url, pages="all")
|
||||
assert repr(tables) == "<TableList n=1>"
|
||||
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
||||
assert (
|
||||
repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
|
||||
)
|
||||
|
||||
|
||||
def test_table_order():
|
||||
|
|
@ -299,26 +161,3 @@ def test_table_order():
|
|||
(1, 2),
|
||||
(1, 1),
|
||||
]
|
||||
|
||||
|
||||
def test_version_generation():
|
||||
version = (0, 7, 3)
|
||||
assert generate_version(version, prerelease=None, revision=None) == "0.7.3"
|
||||
|
||||
|
||||
def test_version_generation_with_prerelease_revision():
|
||||
version = (0, 7, 3)
|
||||
prerelease = "alpha"
|
||||
revision = 2
|
||||
assert (
|
||||
generate_version(version, prerelease=prerelease, revision=revision)
|
||||
== "0.7.3-alpha.2"
|
||||
)
|
||||
|
||||
|
||||
def test_stream_duplicated_text():
|
||||
df = pd.DataFrame(data_stream_duplicated_text)
|
||||
|
||||
filename = os.path.join(testdir, "birdisland.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,104 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
from pandas.testing import assert_frame_equal
|
||||
|
||||
import camelot
|
||||
from camelot.core import Table, TableList
|
||||
from camelot.__version__ import generate_version
|
||||
|
||||
from .data import *
|
||||
|
||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||
testdir = os.path.join(testdir, "files")
|
||||
|
||||
|
||||
def test_lattice():
|
||||
df = pd.DataFrame(data_lattice)
|
||||
|
||||
filename = os.path.join(
|
||||
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
|
||||
)
|
||||
tables = camelot.read_pdf(filename, pages="2")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_table_rotated():
|
||||
df = pd.DataFrame(data_lattice_table_rotated)
|
||||
|
||||
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_two_tables():
|
||||
df1 = pd.DataFrame(data_lattice_two_tables_1)
|
||||
df2 = pd.DataFrame(data_lattice_two_tables_2)
|
||||
|
||||
filename = os.path.join(testdir, "twotables_2.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert len(tables) == 2
|
||||
assert df1.equals(tables[0].df)
|
||||
assert df2.equals(tables[1].df)
|
||||
|
||||
|
||||
def test_lattice_table_regions():
|
||||
df = pd.DataFrame(data_lattice_table_regions)
|
||||
|
||||
filename = os.path.join(testdir, "table_region.pdf")
|
||||
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_table_areas():
|
||||
df = pd.DataFrame(data_lattice_table_areas)
|
||||
|
||||
filename = os.path.join(testdir, "twotables_2.pdf")
|
||||
tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_process_background():
|
||||
df = pd.DataFrame(data_lattice_process_background)
|
||||
|
||||
filename = os.path.join(testdir, "background_lines_1.pdf")
|
||||
tables = camelot.read_pdf(filename, process_background=True)
|
||||
assert_frame_equal(df, tables[1].df)
|
||||
|
||||
|
||||
def test_lattice_copy_text():
|
||||
df = pd.DataFrame(data_lattice_copy_text)
|
||||
|
||||
filename = os.path.join(testdir, "row_span_1.pdf")
|
||||
tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_shift_text():
|
||||
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
|
||||
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
|
||||
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
||||
|
||||
filename = os.path.join(testdir, "column_span_2.pdf")
|
||||
tables = camelot.read_pdf(filename, line_scale=40)
|
||||
assert df_lt.equals(tables[0].df)
|
||||
|
||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
|
||||
assert df_disable.equals(tables[0].df)
|
||||
|
||||
tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
|
||||
assert df_rb.equals(tables[0].df)
|
||||
|
||||
|
||||
def test_lattice_arabic():
|
||||
df = pd.DataFrame(data_arabic)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/arabic.pdf")
|
||||
tables = camelot.read_pdf(filename)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
from pandas.testing import assert_frame_equal
|
||||
|
||||
import camelot
|
||||
from camelot.core import Table, TableList
|
||||
from camelot.__version__ import generate_version
|
||||
|
||||
from .data import *
|
||||
|
||||
testdir = os.path.dirname(os.path.abspath(__file__))
|
||||
testdir = os.path.join(testdir, "files")
|
||||
|
||||
|
||||
def test_stream():
|
||||
df = pd.DataFrame(data_stream)
|
||||
|
||||
filename = os.path.join(testdir, "health.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_table_rotated():
|
||||
df = pd.DataFrame(data_stream_table_rotated)
|
||||
|
||||
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_two_tables():
|
||||
df1 = pd.DataFrame(data_stream_two_tables_1)
|
||||
df2 = pd.DataFrame(data_stream_two_tables_2)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
|
||||
assert len(tables) == 2
|
||||
assert df1.equals(tables[0].df)
|
||||
assert df2.equals(tables[1].df)
|
||||
|
||||
|
||||
def test_stream_table_regions():
|
||||
df = pd.DataFrame(data_stream_table_areas)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", table_regions=["320,460,573,335"]
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_table_areas():
|
||||
df = pd.DataFrame(data_stream_table_areas)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", table_areas=["320,500,573,335"]
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_columns():
|
||||
df = pd.DataFrame(data_stream_columns)
|
||||
|
||||
filename = os.path.join(testdir, "mexican_towns.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_split_text():
|
||||
df = pd.DataFrame(data_stream_split_text)
|
||||
|
||||
filename = os.path.join(testdir, "tabula/m27.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename,
|
||||
flavor="stream",
|
||||
columns=["72,95,209,327,442,529,566,606,683"],
|
||||
split_text=True,
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_flag_size():
|
||||
df = pd.DataFrame(data_stream_flag_size)
|
||||
|
||||
filename = os.path.join(testdir, "superscript.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_strip_text():
|
||||
df = pd.DataFrame(data_stream_strip_text)
|
||||
|
||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_edge_tol():
|
||||
df = pd.DataFrame(data_stream_edge_tol)
|
||||
|
||||
filename = os.path.join(testdir, "edge_tol.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_layout_kwargs():
|
||||
df = pd.DataFrame(data_stream_layout_kwargs)
|
||||
|
||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||
tables = camelot.read_pdf(
|
||||
filename, flavor="stream", layout_kwargs={"detect_vertical": False}
|
||||
)
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_stream_duplicated_text():
|
||||
df = pd.DataFrame(data_stream_duplicated_text)
|
||||
|
||||
filename = os.path.join(testdir, "birdisland.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="stream")
|
||||
assert_frame_equal(df, tables[0].df)
|
||||
Loading…
Reference in New Issue