216 lines
6.3 KiB
Python
216 lines
6.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import os
|
|
|
|
import pandas as pd
|
|
|
|
import camelot
|
|
|
|
from .data import *
|
|
|
|
testdir = os.path.dirname(os.path.abspath(__file__))
|
|
testdir = os.path.join(testdir, "files")
|
|
|
|
|
|
def test_parsing_report():
|
|
parsing_report = {
|
|
'accuracy': 99.02,
|
|
'whitespace': 12.24,
|
|
'order': 1,
|
|
'page': 1
|
|
}
|
|
|
|
filename = os.path.join(testdir, "foo.pdf")
|
|
tables = camelot.read_pdf(filename)
|
|
assert tables[0].parsing_report == parsing_report
|
|
|
|
|
|
def test_password():
|
|
df = pd.DataFrame(data_stream)
|
|
|
|
filename = os.path.join(testdir, "health_protected.pdf")
|
|
tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
|
|
assert df.equals(tables[0].df)
|
|
|
|
tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream():
|
|
df = pd.DataFrame(data_stream)
|
|
|
|
filename = os.path.join(testdir, "health.pdf")
|
|
tables = camelot.read_pdf(filename, flavor="stream")
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_table_rotated():
|
|
df = pd.DataFrame(data_stream_table_rotated)
|
|
|
|
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
|
tables = camelot.read_pdf(filename, flavor="stream")
|
|
assert df.equals(tables[0].df)
|
|
|
|
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
|
tables = camelot.read_pdf(filename, flavor="stream")
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_two_tables():
|
|
df1 = pd.DataFrame(data_stream_two_tables_1)
|
|
df2 = pd.DataFrame(data_stream_two_tables_2)
|
|
|
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
|
tables = camelot.read_pdf(filename, flavor='stream')
|
|
|
|
assert len(tables) == 2
|
|
assert df1.equals(tables[0].df)
|
|
assert df2.equals(tables[1].df)
|
|
|
|
|
|
def test_stream_table_areas():
|
|
df = pd.DataFrame(data_stream_table_areas)
|
|
|
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
|
tables = camelot.read_pdf(filename, flavor="stream", table_areas=["320,500,573,335"])
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_columns():
|
|
df = pd.DataFrame(data_stream_columns)
|
|
|
|
filename = os.path.join(testdir, "mexican_towns.pdf")
|
|
tables = camelot.read_pdf(
|
|
filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10)
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_split_text():
|
|
df = pd.DataFrame(data_stream_split_text)
|
|
|
|
filename = os.path.join(testdir, "tabula/m27.pdf")
|
|
tables = camelot.read_pdf(
|
|
filename, flavor="stream", columns=["72,95,209,327,442,529,566,606,683"], split_text=True)
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_flag_size():
|
|
df = pd.DataFrame(data_stream_flag_size)
|
|
|
|
filename = os.path.join(testdir, "superscript.pdf")
|
|
tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_strip_text():
|
|
df = pd.DataFrame(data_stream_strip_text)
|
|
|
|
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
|
tables = camelot.read_pdf(filename, flavor="stream", strip_text="\n")
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_edge_tol():
|
|
df = pd.DataFrame(data_stream_edge_tol)
|
|
|
|
filename = os.path.join(testdir, "edge_tol.pdf")
|
|
tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_stream_layout_kwargs():
|
|
df = pd.DataFrame(data_stream_layout_kwargs)
|
|
|
|
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
|
tables = camelot.read_pdf(
|
|
filename, flavor="stream", layout_kwargs={"detect_vertical": False})
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_lattice():
|
|
df = pd.DataFrame(data_lattice)
|
|
|
|
filename = os.path.join(
|
|
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf")
|
|
tables = camelot.read_pdf(filename, pages="2")
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_lattice_table_rotated():
|
|
df = pd.DataFrame(data_lattice_table_rotated)
|
|
|
|
filename = os.path.join(testdir, "clockwise_table_1.pdf")
|
|
tables = camelot.read_pdf(filename)
|
|
assert df.equals(tables[0].df)
|
|
|
|
filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
|
|
tables = camelot.read_pdf(filename)
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_lattice_two_tables():
|
|
df1 = pd.DataFrame(data_lattice_two_tables_1)
|
|
df2 = pd.DataFrame(data_lattice_two_tables_2)
|
|
|
|
filename = os.path.join(testdir, "twotables_2.pdf")
|
|
tables = camelot.read_pdf(filename)
|
|
assert len(tables) == 2
|
|
assert df1.equals(tables[0].df)
|
|
assert df2.equals(tables[1].df)
|
|
|
|
|
|
def test_lattice_table_areas():
|
|
df = pd.DataFrame(data_lattice_table_areas)
|
|
|
|
filename = os.path.join(testdir, "twotables_2.pdf")
|
|
tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_lattice_process_background():
|
|
df = pd.DataFrame(data_lattice_process_background)
|
|
|
|
filename = os.path.join(testdir, "background_lines_1.pdf")
|
|
tables = camelot.read_pdf(filename, process_background=True)
|
|
assert df.equals(tables[1].df)
|
|
|
|
|
|
def test_lattice_copy_text():
|
|
df = pd.DataFrame(data_lattice_copy_text)
|
|
|
|
filename = os.path.join(testdir, "row_span_1.pdf")
|
|
tables = camelot.read_pdf(filename, line_size_scaling=60, copy_text="v")
|
|
assert df.equals(tables[0].df)
|
|
|
|
|
|
def test_lattice_shift_text():
|
|
df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
|
|
df_disable = pd.DataFrame(data_lattice_shift_text_disable)
|
|
df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
|
|
|
|
filename = os.path.join(testdir, "column_span_2.pdf")
|
|
tables = camelot.read_pdf(filename, line_size_scaling=40)
|
|
assert df_lt.equals(tables[0].df)
|
|
|
|
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=[''])
|
|
assert df_disable.equals(tables[0].df)
|
|
|
|
tables = camelot.read_pdf(filename, line_size_scaling=40, shift_text=['r', 'b'])
|
|
assert df_rb.equals(tables[0].df)
|
|
|
|
|
|
def test_repr():
|
|
filename = os.path.join(testdir, "foo.pdf")
|
|
tables = camelot.read_pdf(filename)
|
|
assert repr(tables) == "<TableList n=1>"
|
|
assert repr(tables[0]) == "<Table shape=(7, 7)>"
|
|
assert repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
|
|
|
|
|
|
def test_arabic():
|
|
df = pd.DataFrame(data_arabic)
|
|
|
|
filename = os.path.join(testdir, "tabula/arabic.pdf")
|
|
tables = camelot.read_pdf(filename)
|
|
assert df.equals(tables[0].df)
|