Rename WIP parser to "network"; the actual Hybrid parser is still to come
|
|
@ -313,7 +313,7 @@ def stream(c, *args, **kwargs):
|
||||||
tables.export(output, f=f, compress=compress)
|
tables.export(output, f=f, compress=compress)
|
||||||
|
|
||||||
|
|
||||||
@cli.command("hybrid")
|
@cli.command("network")
|
||||||
@click.option(
|
@click.option(
|
||||||
"-R",
|
"-R",
|
||||||
"--table_regions",
|
"--table_regions",
|
||||||
|
|
@ -365,7 +365,7 @@ def stream(c, *args, **kwargs):
|
||||||
)
|
)
|
||||||
@click.argument("filepath", type=click.Path(exists=True))
|
@click.argument("filepath", type=click.Path(exists=True))
|
||||||
@pass_config
|
@pass_config
|
||||||
def hybrid(c, *args, **kwargs):
|
def network(c, *args, **kwargs):
|
||||||
"""Use spaces between text to parse the table."""
|
"""Use spaces between text to parse the table."""
|
||||||
conf = c.config
|
conf = c.config
|
||||||
pages = conf.pop("pages")
|
pages = conf.pop("pages")
|
||||||
|
|
@ -396,7 +396,7 @@ def hybrid(c, *args, **kwargs):
|
||||||
"Please specify output file format using --format")
|
"Please specify output file format using --format")
|
||||||
|
|
||||||
tables = read_pdf(
|
tables = read_pdf(
|
||||||
filepath, pages=pages, flavor="hybrid", suppress_stdout=quiet, **kwargs
|
filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
|
||||||
)
|
)
|
||||||
click.echo("Found {} tables".format(tables.n))
|
click.echo("Found {} tables".format(tables.n))
|
||||||
if plot_type is not None:
|
if plot_type is not None:
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import logging
|
||||||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||||||
|
|
||||||
from .core import TableList
|
from .core import TableList
|
||||||
from .parsers import Stream, Lattice, Hybrid
|
from .parsers import Stream, Lattice, Network
|
||||||
from .utils import (
|
from .utils import (
|
||||||
build_file_path_in_temp_dir,
|
build_file_path_in_temp_dir,
|
||||||
get_page_layout,
|
get_page_layout,
|
||||||
|
|
@ -22,7 +22,7 @@ logger = logging.getLogger("camelot")
|
||||||
PARSERS = {
|
PARSERS = {
|
||||||
"lattice": Lattice,
|
"lattice": Lattice,
|
||||||
"stream": Stream,
|
"stream": Stream,
|
||||||
"hybrid": Hybrid,
|
"network": Network,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -177,7 +177,7 @@ class PDFHandler():
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
flavor : str (default: 'lattice')
|
flavor : str (default: 'lattice')
|
||||||
The parsing method to use ('lattice', 'stream', or 'hybrid').
|
The parsing method to use ('lattice', 'stream', or 'network').
|
||||||
Lattice is used by default.
|
Lattice is used by default.
|
||||||
suppress_stdout : bool (default: False)
|
suppress_stdout : bool (default: False)
|
||||||
Suppress logs and warnings.
|
Suppress logs and warnings.
|
||||||
|
|
|
||||||
|
|
@ -99,10 +99,10 @@ def read_pdf(
|
||||||
|
|
||||||
"""
|
"""
|
||||||
layout_kwargs = layout_kwargs or {}
|
layout_kwargs = layout_kwargs or {}
|
||||||
if flavor not in ["lattice", "stream", "hybrid"]:
|
if flavor not in ["lattice", "stream", "network"]:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Unknown flavor specified."
|
"Unknown flavor specified."
|
||||||
" Use either 'lattice', 'stream', or 'hybrid'"
|
" Use either 'lattice', 'stream', or 'network'"
|
||||||
)
|
)
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
|
|
|
||||||
|
|
@ -2,4 +2,4 @@
|
||||||
|
|
||||||
from .stream import Stream
|
from .stream import Stream
|
||||||
from .lattice import Lattice
|
from .lattice import Lattice
|
||||||
from .hybrid import Hybrid
|
from .network import Network
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""Implementation of hybrid table parser."""
|
"""Implementation of network table parser."""
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
|
|
@ -391,7 +391,7 @@ class TextNetworks(TextAlignments):
|
||||||
return gaps_hv
|
return gaps_hv
|
||||||
|
|
||||||
def search_table_body(self, gaps_hv, parse_details=None):
|
def search_table_body(self, gaps_hv, parse_details=None):
|
||||||
""" Build a candidate bbox for the body of a table using hybrid algo
|
""" Build a candidate bbox for the body of a table using network algo
|
||||||
|
|
||||||
Seed the process with the textline with the highest alignment
|
Seed the process with the textline with the highest alignment
|
||||||
score, then expand the bbox with textlines within threshold.
|
score, then expand the bbox with textlines within threshold.
|
||||||
|
|
@ -496,7 +496,7 @@ class TextNetworks(TextAlignments):
|
||||||
if textline in new_tls:
|
if textline in new_tls:
|
||||||
del tls_search_space[i]
|
del tls_search_space[i]
|
||||||
|
|
||||||
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
|
if len(tls_in_bbox) >= MINIMUM_TEXTLINES_IN_TABLE:
|
||||||
return bbox
|
return bbox
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -508,8 +508,8 @@ class TextNetworks(TextAlignments):
|
||||||
self._compute_alignment_counts()
|
self._compute_alignment_counts()
|
||||||
|
|
||||||
|
|
||||||
class Hybrid(TextBaseParser):
|
class Network(TextBaseParser):
|
||||||
"""Hybrid method of parsing looks for spaces between text
|
"""Network method of parsing looks for spaces between text
|
||||||
to parse the table.
|
to parse the table.
|
||||||
|
|
||||||
If you want to specify columns when specifying multiple table
|
If you want to specify columns when specifying multiple table
|
||||||
|
|
@ -561,7 +561,7 @@ class Hybrid(TextBaseParser):
|
||||||
debug=False,
|
debug=False,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
"hybrid",
|
"network",
|
||||||
table_regions=table_regions,
|
table_regions=table_regions,
|
||||||
table_areas=table_areas,
|
table_areas=table_areas,
|
||||||
columns=columns,
|
columns=columns,
|
||||||
|
|
@ -203,7 +203,7 @@ class PlotMethods():
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Lattice flavor does not support kind='{}'".format(kind)
|
"Lattice flavor does not support kind='{}'".format(kind)
|
||||||
)
|
)
|
||||||
if table.flavor in ["stream", "hybrid"] and kind in ["line"]:
|
if table.flavor in ["stream", "network"] and kind in ["line"]:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Stream flavor does not support kind='{}'".format(kind)
|
"Stream flavor does not support kind='{}'".format(kind)
|
||||||
)
|
)
|
||||||
|
|
@ -313,7 +313,7 @@ class PlotMethods():
|
||||||
ax = prepare_plot(table, ax)
|
ax = prepare_plot(table, ax)
|
||||||
draw_text(table, ax)
|
draw_text(table, ax)
|
||||||
|
|
||||||
if table.flavor == "hybrid":
|
if table.flavor == "network":
|
||||||
for network in table.parse_details["network_searches"]:
|
for network in table.parse_details["network_searches"]:
|
||||||
most_connected_tl = network.most_connected_textline()
|
most_connected_tl = network.most_connected_textline()
|
||||||
|
|
||||||
|
|
@ -428,8 +428,8 @@ class PlotMethods():
|
||||||
return ax.get_figure()
|
return ax.get_figure()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def hybrid_table_search(table, ax=None):
|
def network_table_search(table, ax=None):
|
||||||
"""Generates a plot illustrating the steps of the hybrid table search.
|
"""Generates a plot illustrating the steps of the network table search.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
|
||||||
|
|
@ -824,8 +824,8 @@ data_stream_table_rotated = [
|
||||||
]
|
]
|
||||||
|
|
||||||
# The streaming algorithm incorrectly includes a header in the result.
|
# The streaming algorithm incorrectly includes a header in the result.
|
||||||
# Trimming the table for the test of hybrid, which doesn't include it.
|
# Trimming the table for the test of network, which doesn't include it.
|
||||||
data_hybrid_table_rotated = data_stream_table_rotated[1:]
|
data_network_table_rotated = data_stream_table_rotated[1:]
|
||||||
|
|
||||||
data_stream_two_tables_1 = [
|
data_stream_two_tables_1 = [
|
||||||
[
|
[
|
||||||
|
|
@ -1298,8 +1298,8 @@ data_stream_two_tables_1 = [
|
||||||
]
|
]
|
||||||
|
|
||||||
# The streaming algorithm incorrectly includes a header and a footer.
|
# The streaming algorithm incorrectly includes a header and a footer.
|
||||||
# Trimming the table for the test of hybrid, which doesn't include it.
|
# Trimming the table for the test of network, which doesn't include it.
|
||||||
data_hybrid_two_tables_1 = data_stream_two_tables_1[3:-1]
|
data_network_two_tables_1 = data_stream_two_tables_1[3:-1]
|
||||||
|
|
||||||
data_stream_two_tables_2 = [
|
data_stream_two_tables_2 = [
|
||||||
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
|
||||||
|
|
@ -1609,7 +1609,7 @@ data_stream_two_tables_2 = [
|
||||||
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
data_hybrid_two_tables_b_1 = [
|
data_network_two_tables_b_1 = [
|
||||||
["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
|
["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
|
||||||
["Vgvhgh", "Hj", "Hj", "Hj"],
|
["Vgvhgh", "Hj", "Hj", "Hj"],
|
||||||
["Hj", "Hj", "Hj", "Hj"],
|
["Hj", "Hj", "Hj", "Hj"],
|
||||||
|
|
@ -1619,17 +1619,17 @@ data_hybrid_two_tables_b_1 = [
|
||||||
["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
|
["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
|
||||||
]
|
]
|
||||||
|
|
||||||
data_hybrid_two_tables_b_2 = [
|
data_network_two_tables_b_2 = [
|
||||||
["Trtrt", "H", "Gh"],
|
["Trtrt", "H", "Gh"],
|
||||||
["Gh", "V", "Hv"],
|
["Gh", "V", "Hv"],
|
||||||
["Hv", "Bhjb", "hg"],
|
["Hv", "Bhjb", "hg"],
|
||||||
]
|
]
|
||||||
|
|
||||||
# The streaming algorithm incorrectly includes a header and a footer.
|
# The streaming algorithm incorrectly includes a header and a footer.
|
||||||
# Trimming the table for the test of hybrid, which doesn't include it.
|
# Trimming the table for the test of network, which doesn't include it.
|
||||||
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
|
data_network_two_tables_2 = data_stream_two_tables_2[3:-1]
|
||||||
|
|
||||||
data_hybrid_vertical_headers = [
|
data_network_vertical_headers = [
|
||||||
[
|
[
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
|
|
@ -2090,8 +2090,8 @@ data_stream_table_areas = [
|
||||||
["(each day of the payroll period)", ""],
|
["(each day of the payroll period)", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
# Hybrid doesn't recognize the footer as belonging to the table.
|
# Network doesn't recognize the footer as belonging to the table.
|
||||||
data_hybrid_table_regions = data_stream_table_areas[:-1]
|
data_network_table_regions = data_stream_table_areas[:-1]
|
||||||
|
|
||||||
data_stream_columns = [
|
data_stream_columns = [
|
||||||
[
|
[
|
||||||
|
|
@ -2613,9 +2613,9 @@ data_stream_split_text = [
|
||||||
]
|
]
|
||||||
|
|
||||||
# The stream algorithm excludes the string "Alphabetic Listing by type"
|
# The stream algorithm excludes the string "Alphabetic Listing by type"
|
||||||
data_hybrid_split_text = []
|
data_network_split_text = []
|
||||||
data_hybrid_split_text.extend(data_stream_split_text)
|
data_network_split_text.extend(data_stream_split_text)
|
||||||
data_hybrid_split_text[0] = [
|
data_network_split_text[0] = [
|
||||||
'FEB', 'RUAR', 'Y 2014 M27 (BUS)', '',
|
'FEB', 'RUAR', 'Y 2014 M27 (BUS)', '',
|
||||||
'ALPHABETIC LISTING BY T', 'YPE', '', '', '', 'ABLPDM27'
|
'ALPHABETIC LISTING BY T', 'YPE', '', '', '', 'ABLPDM27'
|
||||||
]
|
]
|
||||||
|
|
@ -2851,15 +2851,15 @@ data_stream_flag_size = [
|
||||||
],
|
],
|
||||||
]
|
]
|
||||||
|
|
||||||
# Hybrid adds more content into the header.
|
# Network adds more content into the header.
|
||||||
data_hybrid_flag_size = [
|
data_network_flag_size = [
|
||||||
['', '', '', '', '(As at end-March)', '', '', '', '', '', ''],
|
['', '', '', '', '(As at end-March)', '', '', '', '', '', ''],
|
||||||
['', '', '', '', '', '', '', '', '', '', '(` Billion)']
|
['', '', '', '', '', '', '', '', '', '', '(` Billion)']
|
||||||
]
|
]
|
||||||
data_hybrid_flag_size.extend(data_stream_flag_size)
|
data_network_flag_size.extend(data_stream_flag_size)
|
||||||
|
|
||||||
|
|
||||||
data_hybrid_strip_text = [
|
data_network_strip_text = [
|
||||||
["VinsauVerre", ""],
|
["VinsauVerre", ""],
|
||||||
["LesBlancs", "12.5CL"],
|
["LesBlancs", "12.5CL"],
|
||||||
["A.O.PCôtesduRhône", ""],
|
["A.O.PCôtesduRhône", ""],
|
||||||
|
|
@ -2900,7 +2900,7 @@ data_hybrid_strip_text = [
|
||||||
]
|
]
|
||||||
|
|
||||||
# Stream only detects part of the table
|
# Stream only detects part of the table
|
||||||
data_stream_strip_text = data_hybrid_strip_text[0:-13]
|
data_stream_strip_text = data_network_strip_text[0:-13]
|
||||||
|
|
||||||
data_stream_edge_tol = [
|
data_stream_edge_tol = [
|
||||||
["Key figures", ""],
|
["Key figures", ""],
|
||||||
|
|
@ -2940,9 +2940,9 @@ data_stream_edge_tol = [
|
||||||
["period.", ""],
|
["period.", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
# The stream algorithm ends up including a footer, which hybrid correctly
|
# The stream algorithm ends up including a footer, which network correctly
|
||||||
# skips.
|
# skips.
|
||||||
data_hybrid_edge_tol = data_stream_edge_tol[:-3]
|
data_network_edge_tol = data_stream_edge_tol[:-3]
|
||||||
|
|
||||||
data_lattice = [
|
data_lattice = [
|
||||||
[
|
[
|
||||||
|
|
|
||||||
|
Before Width: | Height: | Size: 103 KiB After Width: | Height: | Size: 103 KiB |
|
Before Width: | Height: | Size: 48 KiB After Width: | Height: | Size: 48 KiB |
|
Before Width: | Height: | Size: 88 KiB After Width: | Height: | Size: 88 KiB |
|
Before Width: | Height: | Size: 90 KiB After Width: | Height: | Size: 90 KiB |
|
Before Width: | Height: | Size: 102 KiB After Width: | Height: | Size: 102 KiB |
|
|
@ -72,22 +72,22 @@ def test_cli_stream():
|
||||||
assert format_error in result.output
|
assert format_error in result.output
|
||||||
|
|
||||||
|
|
||||||
def test_cli_hybrid():
|
def test_cli_network():
|
||||||
with TemporaryDirectory() as tempdir:
|
with TemporaryDirectory() as tempdir:
|
||||||
infile = os.path.join(testdir, "budget.pdf")
|
infile = os.path.join(testdir, "budget.pdf")
|
||||||
outfile = os.path.join(tempdir, "budget.csv")
|
outfile = os.path.join(tempdir, "budget.csv")
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
result = runner.invoke(
|
result = runner.invoke(
|
||||||
cli, ["--format", "csv", "--output", outfile, "hybrid", infile]
|
cli, ["--format", "csv", "--output", outfile, "network", infile]
|
||||||
)
|
)
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
assert result.output == "Found 1 tables\n"
|
assert result.output == "Found 1 tables\n"
|
||||||
|
|
||||||
result = runner.invoke(cli, ["--format", "csv", "hybrid", infile])
|
result = runner.invoke(cli, ["--format", "csv", "network", infile])
|
||||||
output_error = "Error: Please specify output file path using --output"
|
output_error = "Error: Please specify output file path using --output"
|
||||||
assert output_error in result.output
|
assert output_error in result.output
|
||||||
|
|
||||||
result = runner.invoke(cli, ["--output", outfile, "hybrid", infile])
|
result = runner.invoke(cli, ["--output", outfile, "network", infile])
|
||||||
format_error = "Please specify output file format using --format"
|
format_error = "Please specify output file format using --format"
|
||||||
assert format_error in result.output
|
assert format_error in result.output
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -150,32 +150,32 @@ def test_stream_layout_kwargs():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid():
|
def test_network():
|
||||||
df = pd.DataFrame(data_stream)
|
df = pd.DataFrame(data_stream)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "health.pdf")
|
filename = os.path.join(testdir, "health.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="network")
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_table_rotated():
|
def test_network_table_rotated():
|
||||||
df = pd.DataFrame(data_hybrid_table_rotated)
|
df = pd.DataFrame(data_network_table_rotated)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
filename = os.path.join(testdir, "clockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="network")
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="network")
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_two_tables_a():
|
def test_network_two_tables_a():
|
||||||
df1 = pd.DataFrame(data_hybrid_two_tables_1)
|
df1 = pd.DataFrame(data_network_two_tables_1)
|
||||||
df2 = pd.DataFrame(data_hybrid_two_tables_2)
|
df2 = pd.DataFrame(data_network_two_tables_2)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="network")
|
||||||
|
|
||||||
assert len(tables) == 2
|
assert len(tables) == 2
|
||||||
assert df1.equals(tables[0].df)
|
assert df1.equals(tables[0].df)
|
||||||
|
|
@ -183,104 +183,104 @@ def test_hybrid_two_tables_a():
|
||||||
|
|
||||||
|
|
||||||
# Reported as https://github.com/camelot-dev/camelot/issues/132
|
# Reported as https://github.com/camelot-dev/camelot/issues/132
|
||||||
def test_hybrid_two_tables_b():
|
def test_network_two_tables_b():
|
||||||
df1 = pd.DataFrame(data_hybrid_two_tables_b_1)
|
df1 = pd.DataFrame(data_network_two_tables_b_1)
|
||||||
df2 = pd.DataFrame(data_hybrid_two_tables_b_2)
|
df2 = pd.DataFrame(data_network_two_tables_b_2)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
|
filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="network")
|
||||||
|
|
||||||
assert len(tables) == 2
|
assert len(tables) == 2
|
||||||
assert df1.equals(tables[0].df)
|
assert df1.equals(tables[0].df)
|
||||||
assert df2.equals(tables[1].df)
|
assert df2.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_vertical_header():
|
def test_network_vertical_header():
|
||||||
"""Tests a complex table with a vertical text header.
|
"""Tests a complex table with a vertical text header.
|
||||||
"""
|
"""
|
||||||
df = pd.DataFrame(data_hybrid_vertical_headers)
|
df = pd.DataFrame(data_network_vertical_headers)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "vertical_header.pdf")
|
filename = os.path.join(testdir, "vertical_header.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="network")
|
||||||
assert len(tables) == 1
|
assert len(tables) == 1
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_table_regions():
|
def test_network_table_regions():
|
||||||
df = pd.DataFrame(data_hybrid_table_regions)
|
df = pd.DataFrame(data_network_table_regions)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
# The "stream" test looks for a region in ["320,460,573,335"], which
|
# The "stream" test looks for a region in ["320,460,573,335"], which
|
||||||
# should exclude the header.
|
# should exclude the header.
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="hybrid", table_regions=["320,335,573,505"]
|
filename, flavor="network", table_regions=["320,335,573,505"]
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_table_areas():
|
def test_network_table_areas():
|
||||||
df = pd.DataFrame(data_stream_table_areas)
|
df = pd.DataFrame(data_stream_table_areas)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="hybrid", table_areas=["320,500,573,335"]
|
filename, flavor="network", table_areas=["320,500,573,335"]
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_columns():
|
def test_network_columns():
|
||||||
df = pd.DataFrame(data_stream_columns)
|
df = pd.DataFrame(data_stream_columns)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "mexican_towns.pdf")
|
filename = os.path.join(testdir, "mexican_towns.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="hybrid", columns=["67,180,230,425,475"], row_tol=10
|
filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_split_text():
|
def test_network_split_text():
|
||||||
df = pd.DataFrame(data_hybrid_split_text)
|
df = pd.DataFrame(data_network_split_text)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "tabula/m27.pdf")
|
filename = os.path.join(testdir, "tabula/m27.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename,
|
filename,
|
||||||
flavor="hybrid",
|
flavor="network",
|
||||||
columns=["72,95,209,327,442,529,566,606,683"],
|
columns=["72,95,209,327,442,529,566,606,683"],
|
||||||
split_text=True,
|
split_text=True,
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_flag_size():
|
def test_network_flag_size():
|
||||||
df = pd.DataFrame(data_hybrid_flag_size)
|
df = pd.DataFrame(data_network_flag_size)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "superscript.pdf")
|
filename = os.path.join(testdir, "superscript.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid", flag_size=True)
|
tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_strip_text():
|
def test_network_strip_text():
|
||||||
df = pd.DataFrame(data_hybrid_strip_text)
|
df = pd.DataFrame(data_network_strip_text)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid", strip_text=" ,\n")
|
tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_edge_tol():
|
def test_network_edge_tol():
|
||||||
df = pd.DataFrame(data_hybrid_edge_tol)
|
df = pd.DataFrame(data_network_edge_tol)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "edge_tol.pdf")
|
filename = os.path.join(testdir, "edge_tol.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)
|
tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_layout_kwargs():
|
def test_network_layout_kwargs():
|
||||||
df = pd.DataFrame(data_stream_layout_kwargs)
|
df = pd.DataFrame(data_stream_layout_kwargs)
|
||||||
|
|
||||||
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
filename = os.path.join(testdir, "detect_vertical_false.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, flavor="hybrid", layout_kwargs={"detect_vertical": False}
|
filename, flavor="network", layout_kwargs={"detect_vertical": False}
|
||||||
)
|
)
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ filename = os.path.join(testdir, 'foo.pdf')
|
||||||
|
|
||||||
def test_unknown_flavor():
|
def test_unknown_flavor():
|
||||||
message = ("Unknown flavor specified."
|
message = ("Unknown flavor specified."
|
||||||
" Use either 'lattice', 'stream', or 'hybrid'")
|
" Use either 'lattice', 'stream', or 'network'")
|
||||||
with pytest.raises(NotImplementedError, match=message):
|
with pytest.raises(NotImplementedError, match=message):
|
||||||
camelot.read_pdf(filename, flavor='chocolate')
|
camelot.read_pdf(filename, flavor='chocolate')
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -62,9 +62,9 @@ def test_stream_grid_plot():
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
||||||
def test_hybrid_grid_plot():
|
def test_network_grid_plot():
|
||||||
filename = os.path.join(testdir, "foo.pdf")
|
filename = os.path.join(testdir, "foo.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
tables = camelot.read_pdf(filename, flavor="network")
|
||||||
return unit_test_stable_plot(tables[0], 'grid')
|
return unit_test_stable_plot(tables[0], 'grid')
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -86,9 +86,9 @@ def test_stream_contour_plot():
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
||||||
def test_hybrid_contour_plot():
|
def test_network_contour_plot():
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, flavor='hybrid')
|
tables = camelot.read_pdf(filename, flavor='network')
|
||||||
return unit_test_stable_plot(tables[0], 'contour')
|
return unit_test_stable_plot(tables[0], 'contour')
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -118,18 +118,18 @@ def test_stream_textedge_plot():
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
||||||
def test_hybrid_textedge_plot():
|
def test_network_textedge_plot():
|
||||||
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
filename = os.path.join(testdir, "tabula/12s0324.pdf")
|
||||||
tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
|
tables = camelot.read_pdf(filename, debug=True, flavor='network')
|
||||||
return unit_test_stable_plot(tables[0], 'textedge')
|
return unit_test_stable_plot(tables[0], 'textedge')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
||||||
def test_hybrid_table_regions_textedge_plot():
|
def test_network_table_regions_textedge_plot():
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, debug=True, flavor="hybrid",
|
filename, debug=True, flavor="network",
|
||||||
table_regions=["320,505,573,330"]
|
table_regions=["320,505,573,330"]
|
||||||
)
|
)
|
||||||
return unit_test_stable_plot(tables[0], 'textedge')
|
return unit_test_stable_plot(tables[0], 'textedge')
|
||||||
|
|
@ -137,10 +137,10 @@ def test_hybrid_table_regions_textedge_plot():
|
||||||
|
|
||||||
@pytest.mark.mpl_image_compare(
|
@pytest.mark.mpl_image_compare(
|
||||||
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
|
||||||
def test_hybrid_table_areas_text_plot():
|
def test_network_table_areas_text_plot():
|
||||||
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
filename = os.path.join(testdir, "tabula/us-007.pdf")
|
||||||
tables = camelot.read_pdf(
|
tables = camelot.read_pdf(
|
||||||
filename, debug=True, flavor="hybrid",
|
filename, debug=True, flavor="network",
|
||||||
table_areas=["320,500,573,335"]
|
table_areas=["320,500,573,335"]
|
||||||
)
|
)
|
||||||
return unit_test_stable_plot(tables[0], 'text')
|
return unit_test_stable_plot(tables[0], 'text')
|
||||||
|
|
|
||||||