camelot-py/tests/test_common.py

519 lines
15 KiB
Python

# -*- coding: utf-8 -*-
import os
import pandas as pd
from pandas.testing import assert_frame_equal
import camelot
from camelot.core import Table, TableList
from camelot.__version__ import generate_version
# compare_tables used in console mode while debugging
from camelot.utils import compare_tables # noqa
from .data import *
testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")
def test_parsing_report():
parsing_report = {
"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1
}
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
assert tables[0].parsing_report == parsing_report
def test_password():
df = pd.DataFrame(data_stream)
filename = os.path.join(testdir, "health_protected.pdf")
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)
def test_stream():
df = pd.DataFrame(data_stream)
filename = os.path.join(testdir, "health.pdf")
tables = camelot.read_pdf(filename, flavor="stream")
assert_frame_equal(df, tables[0].df)
def test_stream_table_rotated():
df = pd.DataFrame(data_stream_table_rotated)
filename = os.path.join(testdir, "clockwise_table_2.pdf")
tables = camelot.read_pdf(filename, flavor="stream")
assert_frame_equal(df, tables[0].df)
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
tables = camelot.read_pdf(filename, flavor="stream")
assert_frame_equal(df, tables[0].df)
def test_stream_two_tables():
df1 = pd.DataFrame(data_stream_two_tables_1)
df2 = pd.DataFrame(data_stream_two_tables_2)
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor="stream")
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
def test_stream_table_regions():
df = pd.DataFrame(data_stream_table_areas)
filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(
filename, flavor="stream", table_regions=["320,460,573,335"]
)
assert_frame_equal(df, tables[0].df)
def test_stream_table_areas():
df = pd.DataFrame(data_stream_table_areas)
filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(
filename, flavor="stream", table_areas=["320,500,573,335"]
)
assert_frame_equal(df, tables[0].df)
def test_stream_columns():
df = pd.DataFrame(data_stream_columns)
filename = os.path.join(testdir, "mexican_towns.pdf")
tables = camelot.read_pdf(
filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
)
assert_frame_equal(df, tables[0].df)
def test_stream_split_text():
df = pd.DataFrame(data_stream_split_text)
filename = os.path.join(testdir, "tabula/m27.pdf")
tables = camelot.read_pdf(
filename,
flavor="stream",
columns=["72,95,209,327,442,529,566,606,683"],
split_text=True,
)
assert_frame_equal(df, tables[0].df)
def test_stream_flag_size():
df = pd.DataFrame(data_stream_flag_size)
filename = os.path.join(testdir, "superscript.pdf")
tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
assert_frame_equal(df, tables[0].df)
def test_stream_strip_text():
df = pd.DataFrame(data_stream_strip_text)
filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
assert_frame_equal(df, tables[0].df)
def test_stream_edge_tol():
df = pd.DataFrame(data_stream_edge_tol)
filename = os.path.join(testdir, "edge_tol.pdf")
tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
assert_frame_equal(df, tables[0].df)
def test_stream_layout_kwargs():
df = pd.DataFrame(data_stream_layout_kwargs)
filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(
filename, flavor="stream", layout_kwargs={"detect_vertical": False}
)
assert_frame_equal(df, tables[0].df)
def test_network():
df = pd.DataFrame(data_stream)
filename = os.path.join(testdir, "health.pdf")
tables = camelot.read_pdf(filename, flavor="network")
assert_frame_equal(df, tables[0].df)
def test_network_table_rotated():
df = pd.DataFrame(data_network_table_rotated)
filename = os.path.join(testdir, "clockwise_table_2.pdf")
tables = camelot.read_pdf(filename, flavor="network")
assert_frame_equal(df, tables[0].df)
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
tables = camelot.read_pdf(filename, flavor="network")
assert_frame_equal(df, tables[0].df)
def test_network_two_tables_a():
df1 = pd.DataFrame(data_network_two_tables_1)
df2 = pd.DataFrame(data_network_two_tables_2)
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor="network")
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
# Reported as https://github.com/camelot-dev/camelot/issues/132
def test_network_two_tables_b():
df1 = pd.DataFrame(data_network_two_tables_b_1)
df2 = pd.DataFrame(data_network_two_tables_b_2)
filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
tables = camelot.read_pdf(filename, flavor="network")
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
def test_network_vertical_header():
"""Tests a complex table with a vertically text header.
"""
df = pd.DataFrame(data_network_vertical_headers)
filename = os.path.join(testdir, "vertical_header.pdf")
tables = camelot.read_pdf(filename, flavor="network")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)
def test_network_table_regions():
df = pd.DataFrame(data_network_table_regions)
filename = os.path.join(testdir, "tabula/us-007.pdf")
# The "stream" test looks for a region in ["320,460,573,335"], which
# should exclude the header.
tables = camelot.read_pdf(
filename, flavor="network", table_regions=["320,335,573,505"]
)
assert_frame_equal(df, tables[0].df)
def test_network_table_areas():
df = pd.DataFrame(data_stream_table_areas)
filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(
filename, flavor="network", table_areas=["320,500,573,335"]
)
assert_frame_equal(df, tables[0].df)
def test_network_columns():
df = pd.DataFrame(data_stream_columns)
filename = os.path.join(testdir, "mexican_towns.pdf")
tables = camelot.read_pdf(
filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
)
assert_frame_equal(df, tables[0].df)
def test_network_split_text():
df = pd.DataFrame(data_network_split_text)
filename = os.path.join(testdir, "tabula/m27.pdf")
tables = camelot.read_pdf(
filename,
flavor="network",
columns=["72,95,209,327,442,529,566,606,683"],
split_text=True,
)
assert_frame_equal(df, tables[0].df)
def test_network_flag_size():
df = pd.DataFrame(data_network_flag_size)
filename = os.path.join(testdir, "superscript.pdf")
tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
assert_frame_equal(df, tables[0].df)
def test_network_strip_text():
df = pd.DataFrame(data_network_strip_text)
filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
assert_frame_equal(df, tables[0].df)
def test_network_edge_tol():
df = pd.DataFrame(data_network_edge_tol)
filename = os.path.join(testdir, "edge_tol.pdf")
tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
assert_frame_equal(df, tables[0].df)
def test_network_layout_kwargs():
df = pd.DataFrame(data_stream_layout_kwargs)
filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(
filename, flavor="network", layout_kwargs={"detect_vertical": False}
)
assert_frame_equal(df, tables[0].df)
# Hybrid parser
def test_hybrid():
df = pd.DataFrame(data_hybrid)
filename = os.path.join(testdir, "health.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
assert_frame_equal(df, tables[0].df)
def test_hybrid_two_tables():
df1 = pd.DataFrame(data_network_two_tables_1)
df2 = pd.DataFrame(data_network_two_tables_2)
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
def test_hybrid_vertical_header():
"""Tests a complex table with a vertically text header.
"""
df = pd.DataFrame(data_hybrid_vertical_headers)
filename = os.path.join(testdir, "vertical_header.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)
def test_hybrid_process_background():
df = pd.DataFrame(data_hybrid_process_background)
filename = os.path.join(testdir, "background_lines_1.pdf")
tables = camelot.read_pdf(
filename, flavor="hybrid", process_background=True)
assert_frame_equal(df, tables[1].df)
def test_hybrid_split_text():
df = pd.DataFrame(data_network_split_text)
filename = os.path.join(testdir, "tabula/m27.pdf")
tables = camelot.read_pdf(
filename,
flavor="hybrid",
columns=["72,95,209,327,442,529,566,606,683"],
split_text=True,
)
assert_frame_equal(df, tables[0].df)
# Lattice parser tests
def test_lattice():
df = pd.DataFrame(data_lattice)
filename = os.path.join(
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
)
tables = camelot.read_pdf(filename, pages="2")
assert_frame_equal(df, tables[0].df)
def test_lattice_table_rotated():
df = pd.DataFrame(data_lattice_table_rotated)
filename = os.path.join(testdir, "clockwise_table_1.pdf")
tables = camelot.read_pdf(filename)
assert_frame_equal(df, tables[0].df)
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
tables = camelot.read_pdf(filename)
assert_frame_equal(df, tables[0].df)
def test_lattice_two_tables():
df1 = pd.DataFrame(data_lattice_two_tables_1)
df2 = pd.DataFrame(data_lattice_two_tables_2)
filename = os.path.join(testdir, "twotables_2.pdf")
tables = camelot.read_pdf(filename)
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
def test_lattice_table_regions():
df = pd.DataFrame(data_lattice_table_regions)
filename = os.path.join(testdir, "table_region.pdf")
tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
assert_frame_equal(df, tables[0].df)
def test_lattice_table_areas():
df = pd.DataFrame(data_lattice_table_areas)
filename = os.path.join(testdir, "twotables_2.pdf")
tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
assert_frame_equal(df, tables[0].df)
def test_lattice_process_background():
df = pd.DataFrame(data_lattice_process_background)
filename = os.path.join(testdir, "background_lines_1.pdf")
tables = camelot.read_pdf(filename, process_background=True)
assert_frame_equal(df, tables[1].df)
def test_lattice_copy_text():
df = pd.DataFrame(data_lattice_copy_text)
filename = os.path.join(testdir, "row_span_1.pdf")
tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
assert_frame_equal(df, tables[0].df)
def test_lattice_shift_text():
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
filename = os.path.join(testdir, "column_span_2.pdf")
tables = camelot.read_pdf(filename, line_scale=40)
assert df_lt.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
assert df_disable.equals(tables[0].df)
tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
assert df_rb.equals(tables[0].df)
def test_repr():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert \
repr(tables[0].cells[0][0]) == \
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
def test_pages():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert \
repr(tables[0].cells[0][0]) == \
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
tables = camelot.read_pdf(url, pages="1-end")
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert \
repr(tables[0].cells[0][0]) == \
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
tables = camelot.read_pdf(url, pages="all")
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert (
repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
)
def test_url():
url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(url)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"
assert (
repr(tables[0].cells[0][0]) ==
"<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
)
def test_arabic():
df = pd.DataFrame(data_arabic)
filename = os.path.join(testdir, "tabula/arabic.pdf")
tables = camelot.read_pdf(filename)
assert_frame_equal(df, tables[0].df)
def test_table_order():
def _make_table(page, order):
t = Table([], [])
t.page = page
t.order = order
return t
table_list = TableList(
[
_make_table(2, 1),
_make_table(1, 1),
_make_table(3, 4),
_make_table(1, 2)
]
)
assert [(t.page, t.order) for t in sorted(table_list)] == [
(1, 1),
(1, 2),
(2, 1),
(3, 4),
]
assert [(t.page, t.order) for t in sorted(table_list, reverse=True)] == [
(3, 4),
(2, 1),
(1, 2),
(1, 1),
]
def test_version_generation():
version = (0, 7, 3)
assert generate_version(version, prerelease=None, revision=None) == "0.7.3"
def test_version_generation_with_prerelease_revision():
version = (0, 7, 3)
prerelease = "alpha"
revision = 2
assert (
generate_version(version, prerelease=prerelease, revision=revision)
== "0.7.3-alpha.2"
)