Rename WIP parser to "network"; the actual Hybrid parser is still to come

pull/153/head
Frh 2020-05-02 16:14:03 -07:00
parent c7ab3a4c32
commit 6711f877bf
18 changed files with 120 additions and 118 deletions

View File

@ -313,7 +313,7 @@ def stream(c, *args, **kwargs):
tables.export(output, f=f, compress=compress)
@cli.command("hybrid")
@cli.command("network")
@click.option(
"-R",
"--table_regions",
@ -365,7 +365,7 @@ def stream(c, *args, **kwargs):
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def hybrid(c, *args, **kwargs):
def network(c, *args, **kwargs):
"""Use spaces between text to parse the table."""
conf = c.config
pages = conf.pop("pages")
@ -396,7 +396,7 @@ def hybrid(c, *args, **kwargs):
"Please specify output file format using --format")
tables = read_pdf(
filepath, pages=pages, flavor="hybrid", suppress_stdout=quiet, **kwargs
filepath, pages=pages, flavor="network", suppress_stdout=quiet, **kwargs
)
click.echo("Found {} tables".format(tables.n))
if plot_type is not None:

View File

@ -7,7 +7,7 @@ import logging
from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice, Hybrid
from .parsers import Stream, Lattice, Network
from .utils import (
build_file_path_in_temp_dir,
get_page_layout,
@ -22,7 +22,7 @@ logger = logging.getLogger("camelot")
PARSERS = {
"lattice": Lattice,
"stream": Stream,
"hybrid": Hybrid,
"network": Network,
}
@ -177,7 +177,7 @@ class PDFHandler():
Parameters
----------
flavor : str (default: 'lattice')
The parsing method to use ('lattice', 'stream', or 'hybrid').
The parsing method to use ('lattice', 'stream', or 'network').
Lattice is used by default.
suppress_stdout : str (default: False)
Suppress logs and warnings.

View File

@ -99,10 +99,10 @@ def read_pdf(
"""
layout_kwargs = layout_kwargs or {}
if flavor not in ["lattice", "stream", "hybrid"]:
if flavor not in ["lattice", "stream", "network"]:
raise NotImplementedError(
"Unknown flavor specified."
" Use either 'lattice', 'stream', or 'hybrid'"
" Use either 'lattice', 'stream', or 'network'"
)
with warnings.catch_warnings():

View File

@ -2,4 +2,4 @@
from .stream import Stream
from .lattice import Lattice
from .hybrid import Hybrid
from .network import Network

View File

@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
"""Implementation of hybrid table parser."""
"""Implementation of network table parser."""
from __future__ import division
@ -391,7 +391,7 @@ class TextNetworks(TextAlignments):
return gaps_hv
def search_table_body(self, gaps_hv, parse_details=None):
""" Build a candidate bbox for the body of a table using hybrid algo
""" Build a candidate bbox for the body of a table using network algo
Seed the process with the textline with the highest alignment
score, then expand the bbox with textlines within threshold.
@ -496,7 +496,7 @@ class TextNetworks(TextAlignments):
if textline in new_tls:
del tls_search_space[i]
if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
if len(tls_in_bbox) >= MINIMUM_TEXTLINES_IN_TABLE:
return bbox
return None
@ -508,8 +508,8 @@ class TextNetworks(TextAlignments):
self._compute_alignment_counts()
class Hybrid(TextBaseParser):
"""Hybrid method of parsing looks for spaces between text
class Network(TextBaseParser):
"""Network method of parsing looks for spaces between text
to parse the table.
If you want to specify columns when specifying multiple table
@ -561,7 +561,7 @@ class Hybrid(TextBaseParser):
debug=False,
**kwargs):
super().__init__(
"hybrid",
"network",
table_regions=table_regions,
table_areas=table_areas,
columns=columns,

View File

@ -203,7 +203,7 @@ class PlotMethods():
raise NotImplementedError(
"Lattice flavor does not support kind='{}'".format(kind)
)
if table.flavor in ["stream", "hybrid"] and kind in ["line"]:
if table.flavor in ["stream", "network"] and kind in ["line"]:
raise NotImplementedError(
"Stream flavor does not support kind='{}'".format(kind)
)
@ -313,7 +313,7 @@ class PlotMethods():
ax = prepare_plot(table, ax)
draw_text(table, ax)
if table.flavor == "hybrid":
if table.flavor == "network":
for network in table.parse_details["network_searches"]:
most_connected_tl = network.most_connected_textline()
@ -428,8 +428,8 @@ class PlotMethods():
return ax.get_figure()
@staticmethod
def hybrid_table_search(table, ax=None):
"""Generates a plot illustrating the steps of the hybrid table search.
def network_table_search(table, ax=None):
"""Generates a plot illustrating the steps of the network table search.
Parameters
----------

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -824,8 +824,8 @@ data_stream_table_rotated = [
]
# The streaming algorithm incorrectly includes a header in the result.
# Trimming the table for the test of hybrid, which doesn't include it.
data_hybrid_table_rotated = data_stream_table_rotated[1:]
# Trimming the table for the test of network, which doesn't include it.
data_network_table_rotated = data_stream_table_rotated[1:]
data_stream_two_tables_1 = [
[
@ -1298,8 +1298,8 @@ data_stream_two_tables_1 = [
]
# The streaming algorithm incorrectly includes a header and a footer.
# Trimming the table for the test of hybrid, which doesn't include it.
data_hybrid_two_tables_1 = data_stream_two_tables_1[3:-1]
# Trimming the table for the test of network, which doesn't include it.
data_network_two_tables_1 = data_stream_two_tables_1[3:-1]
data_stream_two_tables_2 = [
["Table 325. Arrests by Race: 2009", "", "", "", "", ""],
@ -1609,7 +1609,7 @@ data_stream_two_tables_2 = [
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
]
data_hybrid_two_tables_b_1 = [
data_network_two_tables_b_1 = [
["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
["Vgvhgh", "Hj", "Hj", "Hj"],
["Hj", "Hj", "Hj", "Hj"],
@ -1619,17 +1619,17 @@ data_hybrid_two_tables_b_1 = [
["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
]
data_hybrid_two_tables_b_2 = [
data_network_two_tables_b_2 = [
["Trtrt", "H", "Gh"],
["Gh", "V", "Hv"],
["Hv", "Bhjb", "hg"],
]
# The streaming algorithm incorrectly includes a header and a footer.
# Trimming the table for the test of hybrid, which doesn't include it.
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
# Trimming the table for the test of network, which doesn't include it.
data_network_two_tables_2 = data_stream_two_tables_2[3:-1]
data_hybrid_vertical_headers = [
data_network_vertical_headers = [
[
"",
"",
@ -2090,8 +2090,8 @@ data_stream_table_areas = [
["(each day of the payroll period)", ""],
]
# Hybrid doesn't recognize the footer as belonging to the table.
data_hybrid_table_regions = data_stream_table_areas[:-1]
# Network doesn't recognize the footer as belonging to the table.
data_network_table_regions = data_stream_table_areas[:-1]
data_stream_columns = [
[
@ -2613,9 +2613,9 @@ data_stream_split_text = [
]
# The stream algorithm excludes the string "Alphabetic Listing by type"
data_hybrid_split_text = []
data_hybrid_split_text.extend(data_stream_split_text)
data_hybrid_split_text[0] = [
data_network_split_text = []
data_network_split_text.extend(data_stream_split_text)
data_network_split_text[0] = [
'FEB', 'RUAR', 'Y 2014 M27 (BUS)', '',
'ALPHABETIC LISTING BY T', 'YPE', '', '', '', 'ABLPDM27'
]
@ -2851,15 +2851,15 @@ data_stream_flag_size = [
],
]
# Hybrid adds more content into the header.
data_hybrid_flag_size = [
# Network adds more content into the header.
data_network_flag_size = [
['', '', '', '', '(As at end-March)', '', '', '', '', '', ''],
['', '', '', '', '', '', '', '', '', '', '(` Billion)']
]
data_hybrid_flag_size.extend(data_stream_flag_size)
data_network_flag_size.extend(data_stream_flag_size)
data_hybrid_strip_text = [
data_network_strip_text = [
["VinsauVerre", ""],
["LesBlancs", "12.5CL"],
["A.O.PCôtesduRhône", ""],
@ -2900,7 +2900,7 @@ data_hybrid_strip_text = [
]
# Stream only detects part of the table
data_stream_strip_text = data_hybrid_strip_text[0:-13]
data_stream_strip_text = data_network_strip_text[0:-13]
data_stream_edge_tol = [
["Key figures", ""],
@ -2940,9 +2940,9 @@ data_stream_edge_tol = [
["period.", ""],
]
# The stream algorithm ends up including a footer, which hybrid correctly
# The stream algorithm ends up including a footer, which network correctly
# skips.
data_hybrid_edge_tol = data_stream_edge_tol[:-3]
data_network_edge_tol = data_stream_edge_tol[:-3]
data_lattice = [
[

View File

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 103 KiB

View File

Before

Width:  |  Height:  |  Size: 48 KiB

After

Width:  |  Height:  |  Size: 48 KiB

View File

Before

Width:  |  Height:  |  Size: 88 KiB

After

Width:  |  Height:  |  Size: 88 KiB

View File

Before

Width:  |  Height:  |  Size: 102 KiB

After

Width:  |  Height:  |  Size: 102 KiB

View File

@ -72,22 +72,22 @@ def test_cli_stream():
assert format_error in result.output
def test_cli_hybrid():
def test_cli_network():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, "budget.pdf")
outfile = os.path.join(tempdir, "budget.csv")
runner = CliRunner()
result = runner.invoke(
cli, ["--format", "csv", "--output", outfile, "hybrid", infile]
cli, ["--format", "csv", "--output", outfile, "network", infile]
)
assert result.exit_code == 0
assert result.output == "Found 1 tables\n"
result = runner.invoke(cli, ["--format", "csv", "hybrid", infile])
result = runner.invoke(cli, ["--format", "csv", "network", infile])
output_error = "Error: Please specify output file path using --output"
assert output_error in result.output
result = runner.invoke(cli, ["--output", outfile, "hybrid", infile])
result = runner.invoke(cli, ["--output", outfile, "network", infile])
format_error = "Please specify output file format using --format"
assert format_error in result.output

View File

@ -150,32 +150,32 @@ def test_stream_layout_kwargs():
assert_frame_equal(df, tables[0].df)
def test_hybrid():
def test_network():
df = pd.DataFrame(data_stream)
filename = os.path.join(testdir, "health.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(filename, flavor="network")
assert_frame_equal(df, tables[0].df)
def test_hybrid_table_rotated():
df = pd.DataFrame(data_hybrid_table_rotated)
def test_network_table_rotated():
df = pd.DataFrame(data_network_table_rotated)
filename = os.path.join(testdir, "clockwise_table_2.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(filename, flavor="network")
assert_frame_equal(df, tables[0].df)
filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(filename, flavor="network")
assert_frame_equal(df, tables[0].df)
def test_hybrid_two_tables_a():
df1 = pd.DataFrame(data_hybrid_two_tables_1)
df2 = pd.DataFrame(data_hybrid_two_tables_2)
def test_network_two_tables_a():
df1 = pd.DataFrame(data_network_two_tables_1)
df2 = pd.DataFrame(data_network_two_tables_2)
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(filename, flavor="network")
assert len(tables) == 2
assert df1.equals(tables[0].df)
@ -183,104 +183,104 @@ def test_hybrid_two_tables_a():
# Reported as https://github.com/camelot-dev/camelot/issues/132
def test_hybrid_two_tables_b():
df1 = pd.DataFrame(data_hybrid_two_tables_b_1)
df2 = pd.DataFrame(data_hybrid_two_tables_b_2)
def test_network_two_tables_b():
df1 = pd.DataFrame(data_network_two_tables_b_1)
df2 = pd.DataFrame(data_network_two_tables_b_2)
filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(filename, flavor="network")
assert len(tables) == 2
assert df1.equals(tables[0].df)
assert df2.equals(tables[1].df)
def test_hybrid_vertical_header():
def test_network_vertical_header():
"""Tests a complex table with a vertical text header.
"""
df = pd.DataFrame(data_hybrid_vertical_headers)
df = pd.DataFrame(data_network_vertical_headers)
filename = os.path.join(testdir, "vertical_header.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(filename, flavor="network")
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)
def test_hybrid_table_regions():
df = pd.DataFrame(data_hybrid_table_regions)
def test_network_table_regions():
df = pd.DataFrame(data_network_table_regions)
filename = os.path.join(testdir, "tabula/us-007.pdf")
# The "stream" test looks for a region in ["320,460,573,335"], which
# should exclude the header.
tables = camelot.read_pdf(
filename, flavor="hybrid", table_regions=["320,335,573,505"]
filename, flavor="network", table_regions=["320,335,573,505"]
)
assert_frame_equal(df, tables[0].df)
def test_hybrid_table_areas():
def test_network_table_areas():
df = pd.DataFrame(data_stream_table_areas)
filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(
filename, flavor="hybrid", table_areas=["320,500,573,335"]
filename, flavor="network", table_areas=["320,500,573,335"]
)
assert_frame_equal(df, tables[0].df)
def test_hybrid_columns():
def test_network_columns():
df = pd.DataFrame(data_stream_columns)
filename = os.path.join(testdir, "mexican_towns.pdf")
tables = camelot.read_pdf(
filename, flavor="hybrid", columns=["67,180,230,425,475"], row_tol=10
filename, flavor="network", columns=["67,180,230,425,475"], row_tol=10
)
assert_frame_equal(df, tables[0].df)
def test_hybrid_split_text():
df = pd.DataFrame(data_hybrid_split_text)
def test_network_split_text():
df = pd.DataFrame(data_network_split_text)
filename = os.path.join(testdir, "tabula/m27.pdf")
tables = camelot.read_pdf(
filename,
flavor="hybrid",
flavor="network",
columns=["72,95,209,327,442,529,566,606,683"],
split_text=True,
)
assert_frame_equal(df, tables[0].df)
def test_hybrid_flag_size():
df = pd.DataFrame(data_hybrid_flag_size)
def test_network_flag_size():
df = pd.DataFrame(data_network_flag_size)
filename = os.path.join(testdir, "superscript.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid", flag_size=True)
tables = camelot.read_pdf(filename, flavor="network", flag_size=True)
assert_frame_equal(df, tables[0].df)
def test_hybrid_strip_text():
df = pd.DataFrame(data_hybrid_strip_text)
def test_network_strip_text():
df = pd.DataFrame(data_network_strip_text)
filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid", strip_text=" ,\n")
tables = camelot.read_pdf(filename, flavor="network", strip_text=" ,\n")
assert_frame_equal(df, tables[0].df)
def test_hybrid_edge_tol():
df = pd.DataFrame(data_hybrid_edge_tol)
def test_network_edge_tol():
df = pd.DataFrame(data_network_edge_tol)
filename = os.path.join(testdir, "edge_tol.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid", edge_tol=500)
tables = camelot.read_pdf(filename, flavor="network", edge_tol=500)
assert_frame_equal(df, tables[0].df)
def test_hybrid_layout_kwargs():
def test_network_layout_kwargs():
df = pd.DataFrame(data_stream_layout_kwargs)
filename = os.path.join(testdir, "detect_vertical_false.pdf")
tables = camelot.read_pdf(
filename, flavor="hybrid", layout_kwargs={"detect_vertical": False}
filename, flavor="network", layout_kwargs={"detect_vertical": False}
)
assert_frame_equal(df, tables[0].df)

View File

@ -15,7 +15,7 @@ filename = os.path.join(testdir, 'foo.pdf')
def test_unknown_flavor():
message = ("Unknown flavor specified."
" Use either 'lattice', 'stream', or 'hybrid'")
" Use either 'lattice', 'stream', or 'network'")
with pytest.raises(NotImplementedError, match=message):
camelot.read_pdf(filename, flavor='chocolate')

View File

@ -62,9 +62,9 @@ def test_stream_grid_plot():
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
def test_hybrid_grid_plot():
def test_network_grid_plot():
filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(filename, flavor="network")
return unit_test_stable_plot(tables[0], 'grid')
@ -86,9 +86,9 @@ def test_stream_contour_plot():
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
def test_hybrid_contour_plot():
def test_network_contour_plot():
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor='hybrid')
tables = camelot.read_pdf(filename, flavor='network')
return unit_test_stable_plot(tables[0], 'contour')
@ -118,18 +118,18 @@ def test_stream_textedge_plot():
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
def test_hybrid_textedge_plot():
def test_network_textedge_plot():
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
tables = camelot.read_pdf(filename, debug=True, flavor='network')
return unit_test_stable_plot(tables[0], 'textedge')
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
def test_hybrid_table_regions_textedge_plot():
def test_network_table_regions_textedge_plot():
filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(
filename, debug=True, flavor="hybrid",
filename, debug=True, flavor="network",
table_regions=["320,505,573,330"]
)
return unit_test_stable_plot(tables[0], 'textedge')
@ -137,10 +137,10 @@ def test_hybrid_table_regions_textedge_plot():
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True, tolerance=TOLERANCE)
def test_hybrid_table_areas_text_plot():
def test_network_table_areas_text_plot():
filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(
filename, debug=True, flavor="hybrid",
filename, debug=True, flavor="network",
table_areas=["320,500,573,335"]
)
return unit_test_stable_plot(tables[0], 'text')