Initial Hybrid parser, for now identical to Stream

pull/153/head
Frh 2020-04-19 16:27:01 -07:00
parent 58823e57e9
commit d520a77bb7
15 changed files with 726 additions and 19 deletions

View File

@ -31,7 +31,8 @@ pass_config = click.make_pass_decorator(Config)
@click.group(name="camelot")
@click.version_option(version=__version__)
@click.option("-q", "--quiet", is_flag=False, help="Suppress logs and warnings.")
@click.option("-q", "--quiet", is_flag=False,
help="Suppress logs and warnings.")
@click.option(
"-p",
"--pages",
@ -98,7 +99,8 @@ def cli(ctx, *args, **kwargs):
" where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
"-back", "--process_background", is_flag=True, help="Process background lines."
"-back", "--process_background", is_flag=True,
help="Process background lines."
)
@click.option(
"-scale",
@ -127,7 +129,8 @@ def cli(ctx, *args, **kwargs):
"-l",
"--line_tol",
default=2,
help="Tolerance parameter used to merge close vertical" " and horizontal lines.",
help="Tolerance parameter used to merge close vertical"
" and horizontal lines.",
)
@click.option(
"-j",
@ -197,12 +200,15 @@ def lattice(c, *args, **kwargs):
raise ImportError("matplotlib is required for plotting.")
else:
if output is None:
raise click.UsageError("Please specify output file path using --output")
raise click.UsageError(
"Please specify output file path using --output")
if f is None:
raise click.UsageError("Please specify output file format using --format")
raise click.UsageError(
"Please specify output file format using --format")
tables = read_pdf(
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet, **kwargs
filepath, pages=pages, flavor="lattice", suppress_stdout=quiet,
**kwargs
)
click.echo("Found {} tables".format(tables.n))
if plot_type is not None:
@ -247,7 +253,8 @@ def lattice(c, *args, **kwargs):
"-r",
"--row_tol",
default=2,
help="Tolerance parameter" " used to combine text vertically, to generate rows.",
help="Tolerance parameter"
" used to combine text vertically, to generate rows.",
)
@click.option(
"-c",
@ -288,9 +295,11 @@ def stream(c, *args, **kwargs):
raise ImportError("matplotlib is required for plotting.")
else:
if output is None:
raise click.UsageError("Please specify output file path using --output")
raise click.UsageError(
"Please specify output file path using --output")
if f is None:
raise click.UsageError("Please specify output file format using --format")
raise click.UsageError(
"Please specify output file format using --format")
tables = read_pdf(
filepath, pages=pages, flavor="stream", suppress_stdout=quiet, **kwargs
@ -302,3 +311,97 @@ def stream(c, *args, **kwargs):
plt.show()
else:
tables.export(output, f=f, compress=compress)
@cli.command("hybrid")
@click.option(
    "-R",
    "--table_regions",
    default=[],
    multiple=True,
    help="Page regions to analyze. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-T",
    "--table_areas",
    default=[],
    multiple=True,
    help="Table areas to process. Example: x1,y1,x2,y2"
    " where x1, y1 -> left-top and x2, y2 -> right-bottom.",
)
@click.option(
    "-C",
    "--columns",
    default=[],
    multiple=True,
    help="X coordinates of column separators.",
)
@click.option(
    "-e",
    "--edge_tol",
    default=50,
    help="Tolerance parameter for extending textedges vertically.",
)
@click.option(
    "-r",
    "--row_tol",
    default=2,
    help="Tolerance parameter"
    " used to combine text vertically, to generate rows.",
)
@click.option(
    "-c",
    "--column_tol",
    default=0,
    help="Tolerance parameter"
    " used to combine text horizontally, to generate columns.",
)
@click.option(
    "-plot",
    "--plot_type",
    type=click.Choice(["text", "grid", "contour", "textedge"]),
    help="Plot elements found on PDF page for visual debugging.",
)
@click.argument("filepath", type=click.Path(exists=True))
@pass_config
def hybrid(c, *args, **kwargs):
    """Use spaces between text to parse the table."""
    # NOTE: the docstring above doubles as the command's help text.

    # Group-level settings are popped off the shared config; whatever is
    # left in it is forwarded to read_pdf together with the sub-command
    # options.
    settings = c.config
    pages = settings.pop("pages")
    output = settings.pop("output")
    f = settings.pop("format")
    compress = settings.pop("zip")
    quiet = settings.pop("quiet")
    plot_type = kwargs.pop("plot_type")
    filepath = kwargs.pop("filepath")
    kwargs.update(settings)

    # Repeatable options are turned into lists, and an option that was
    # never given is passed on as None rather than as an empty list.
    for option in ("table_regions", "table_areas", "columns"):
        kwargs[option] = list(kwargs[option]) or None

    plotting = plot_type is not None
    if plotting:
        if not _HAS_MPL:
            raise ImportError("matplotlib is required for plotting.")
    elif output is None:
        raise click.UsageError(
            "Please specify output file path using --output")
    elif f is None:
        raise click.UsageError(
            "Please specify output file format using --format")

    tables = read_pdf(
        filepath,
        pages=pages,
        flavor="hybrid",
        suppress_stdout=quiet,
        **kwargs
    )
    click.echo("Found {} tables".format(tables.n))

    if not plotting:
        tables.export(output, f=f, compress=compress)
        return
    for table in tables:
        plot(table, kind=plot_type)
        plt.show()

View File

@ -379,6 +379,8 @@ class Table(object):
self._image = None
self._image_path = None # Temporary file to hold an image of the pdf
self._text = [] # List of text box coordinates
def __repr__(self):
return "<{} shape={}>".format(self.__class__.__name__, self.shape)
@ -432,11 +434,11 @@ class Table(object):
self.pdf_size = (parser.pdf_width, parser.pdf_height)
_text = []
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
_text.extend(
[(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text])
self._text = _text
def get_pdf_image(self):
"""Compute pdf image and cache it
"""

View File

@ -7,7 +7,7 @@ import logging
from PyPDF2 import PdfFileReader, PdfFileWriter
from .core import TableList
from .parsers import Stream, Lattice
from .parsers import Stream, Lattice, Hybrid
from .utils import (
build_file_path_in_temp_dir,
get_page_layout,
@ -21,7 +21,8 @@ logger = logging.getLogger("camelot")
PARSERS = {
"lattice": Lattice,
"stream": Stream
"stream": Stream,
"hybrid": Hybrid,
}
@ -173,7 +174,7 @@ class PDFHandler():
Parameters
----------
flavor : str (default: 'lattice')
The parsing method to use ('lattice' or 'stream').
The parsing method to use ('lattice', 'stream', or 'hybrid').
Lattice is used by default.
suppress_stdout : str (default: False)
Suppress logs and warnings.

View File

@ -99,9 +99,10 @@ def read_pdf(
"""
layout_kwargs = layout_kwargs or {}
if flavor not in ["lattice", "stream"]:
if flavor not in ["lattice", "stream", "hybrid"]:
raise NotImplementedError(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
"Unknown flavor specified."
" Use either 'lattice', 'stream', or 'hybrid'"
)
with warnings.catch_warnings():

View File

@ -2,3 +2,4 @@
from .stream import Stream
from .lattice import Lattice
from .hybrid import Hybrid

View File

@ -0,0 +1,441 @@
# -*- coding: utf-8 -*-
from __future__ import division
import warnings
import numpy as np
from .base import BaseParser
from ..core import TextEdges
from ..utils import (text_in_bbox, text_in_bbox_per_axis)
class Hybrid(BaseParser):
"""Hybrid method of parsing looks for spaces between text
to parse the table.
If you want to specify columns when specifying multiple table
areas, make sure that the length of both lists are equal.
Parameters
----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
columns : list, optional (default: None)
List of column x-coordinates strings where the coordinates
are comma-separated.
split_text : bool, optional (default: False)
Split text that spans across multiple cells.
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
row_tol : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
to generate rows.
column_tol : int, optional (default: 0)
Tolerance parameter used to combine text horizontally,
to generate columns.
"""
def __init__(
    self,
    table_regions=None,
    table_areas=None,
    columns=None,
    flag_size=False,
    split_text=False,
    strip_text="",
    edge_tol=50,
    row_tol=2,
    column_tol=0,
    **kwargs
):
    """Store the parsing options and validate the column specification.

    The region/area and text-handling options are handed to the base
    parser under the "hybrid" flavor name; the column list and the three
    tolerances are kept on the instance. Extra keyword arguments are
    accepted and ignored.

    Raises
    ------
    ValueError
        If both `table_areas` and `columns` are given with different
        lengths.
    """
    base_options = {
        "table_regions": table_regions,
        "table_areas": table_areas,
        "split_text": split_text,
        "strip_text": strip_text,
        "flag_size": flag_size,
    }
    super().__init__("hybrid", **base_options)

    self.columns = columns
    self._validate_columns()

    self.edge_tol = edge_tol
    self.row_tol = row_tol
    self.column_tol = column_tol
@staticmethod
def _text_bbox(t_bbox):
"""Returns bounding box for the text present on a page.
Parameters
----------
t_bbox : dict
Dict with two keys 'horizontal' and 'vertical' with lists of
LTTextLineHorizontals and LTTextLineVerticals respectively.
Returns
-------
text_bbox : tuple
Tuple (x0, y0, x1, y1) in pdf coordinate space.
"""
xmin = min(t.x0 for direction in t_bbox for t in t_bbox[direction])
ymin = min(t.y0 for direction in t_bbox for t in t_bbox[direction])
xmax = max(t.x1 for direction in t_bbox for t in t_bbox[direction])
ymax = max(t.y1 for direction in t_bbox for t in t_bbox[direction])
text_bbox = (xmin, ymin, xmax, ymax)
return text_bbox
@staticmethod
def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically
within a tolerance.
Parameters
----------
text : list
List of PDFMiner text objects.
row_tol : int, optional (default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = None
rows = []
temp = []
non_empty_text = [t for t in text if t.get_text().strip()]
for t in non_empty_text:
# is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright \
# for obj in t._objs
# if type(obj) is LTChar]):
if row_y is None:
row_y = t.y0
elif not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
# We update the row's bottom as we go, to be forgiving if there
# is a gradual change across multiple columns.
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
return rows
@staticmethod
def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap
or lie within a tolerance.
Parameters
----------
l : list
List of column x-coordinate tuples.
column_tol : int, optional (default: 0)
Returns
-------
merged : list
List of merged column x-coordinate tuples.
"""
merged = []
for higher in l:
if not merged:
merged.append(higher)
else:
lower = merged[-1]
if column_tol >= 0:
if higher[0] <= lower[1] or np.isclose(
higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
elif column_tol < 0:
if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1],
atol=abs(column_tol)):
merged.append(higher)
else:
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
return merged
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous. For the row to "touch"
we split the existing gap between them in half.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row y-coordinate tuples.
"""
row_boundaries = [
[
max(t.y1 for t in r),
min(t.y0 for t in r)
]
for r in rows_grouped
]
for i in range(0, len(row_boundaries)-1):
top_row = row_boundaries[i]
bottom_row = row_boundaries[i+1]
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
row_boundaries[0][0] = text_y_max
row_boundaries[-1][1] = text_y_min
return row_boundaries
@staticmethod
def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text : list
List of PDFMiner text objects.
ytol : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
if text:
text = Hybrid._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text]
new_cols = [
(t.x0, t.x1)
for r in text if len(r) == max(elements)
for t in r
]
cols.extend(Hybrid._merge_columns(sorted(new_cols)))
return cols
@staticmethod
def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text_x_min : int
text_y_max : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
return cols
def _validate_columns(self):
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns"
" should be equal")
def _nurminen_table_detection(self, textlines):
    """A general implementation of the table detection algorithm
    described by Anssi Nurminen's master's thesis.
    Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3  # noqa

    Assumes that tables are situated relatively far apart
    vertically.

    `textlines` is sorted in place, and the relevant text edges are
    appended to ``self.textedges``. When no table area is found, the
    whole page (0, 0, pdf_width, pdf_height) is returned as the only
    area.
    """
    # TODO: add support for arabic text #141
    # Reading order: descending y0 first, then ascending x0.
    textlines.sort(key=lambda line: (-line.y0, line.x0))

    edge_finder = TextEdges(edge_tol=self.edge_tol)
    # Build the text edges, then keep only the relevant ones.
    edge_finder.generate(textlines)
    relevant = edge_finder.get_relevant()
    self.textedges.extend(relevant)

    # Guess table areas from the text lines and the relevant edges.
    detected = edge_finder.get_table_areas(textlines, relevant)
    if detected:
        return detected
    return {(0, 0, self.pdf_width, self.pdf_height): None}
def _generate_table_bbox(self):
self.textedges = []
if self.table_areas is None:
hor_text = self.horizontal_text
if self.table_regions is not None:
# filter horizontal text
hor_text = []
for region in self.table_regions:
x1, y1, x2, y2 = region.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
region_text = text_in_bbox(
(x1, y2, x2, y1), self.horizontal_text)
hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text)
else:
table_bbox = {}
for area in self.table_areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None
self.table_bbox = table_bbox
def _generate_columns_and_rows(self, table_idx, tk):
    """Work out the column and row boundaries for one table area.

    Parameters
    ----------
    table_idx : int
        Index of the table; selects the matching entry of
        ``self.columns`` and is used (1-based) in the warning text.
    tk : tuple
        Table bounding box used to select the text to consider.

    Returns
    -------
    cols, rows : tuple
        Lists of continuous column and row boundaries.
    """
    # Select the elements which lie within the table bbox.
    self.t_bbox = text_in_bbox_per_axis(
        tk, self.horizontal_text, self.vertical_text
    )
    text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(
        self.t_bbox
    )

    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol
    )
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    row_sizes = [len(row) for row in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # User-supplied separators; the text extent supplies the two
        # outer boundaries.
        separators = [
            float(x) for x in self.columns[table_idx].split(",")
        ]
        edges = [text_x_min] + separators + [text_x_max]
        cols = list(zip(edges, edges[1:]))
        return cols, rows

    # Guess the number of columns as the most common row size.
    ncols = max(set(row_sizes), key=row_sizes.count)
    if ncols == 1:
        # A mode of 1 usually means there is no table, but the counts
        # can be skewed: retry with the single-element rows removed.
        row_sizes = [size for size in row_sizes if size != 1]
        if row_sizes:
            ncols = max(set(row_sizes), key=row_sizes.count)
        else:
            warnings.warn(
                "No tables found in table area {}".format(table_idx + 1)
            )

    cols = [
        (t.x0, t.x1)
        for row in rows_grouped
        if len(row) == ncols
        for t in row
    ]
    cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)

    all_text = [
        t for direction in self.t_bbox for t in self.t_bbox[direction]
    ]
    # Text sitting strictly inside a gap between two detected columns...
    stray_text = []
    for prev_col, next_col in zip(cols, cols[1:]):
        left, right = prev_col[1], next_col[0]
        stray_text.extend(
            t for t in all_text if t.x0 > left and t.x1 < right
        )
    # ...and text entirely outside the first or last column.
    stray_text.extend(
        t
        for t in all_text
        if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
    )

    cols = self._add_columns(cols, stray_text, self.row_tol)
    cols = self._join_columns(cols, text_x_min, text_x_max)
    return cols, rows
def _generate_table(self, table_idx, cols, rows, **kwargs):
table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges()
table.record_parse_metadata(self)
# for plotting
table._bbox = self.table_bbox
table._segments = None
table._textedges = self.textedges
return table
def extract_tables(self, filename):
    """Return the tables found on the page, or [] if it has no text.

    Table areas are visited in descending order of the second
    coordinate of their bbox, and each table's ``_bbox`` is set to its
    own area. `filename` is not used here.
    """
    if self._document_has_no_text():
        return []

    # Identify plausible areas within the doc where tables lie and
    # populate the table_bbox keys with these areas.
    self._generate_table_bbox()

    ordered_areas = sorted(
        self.table_bbox, key=lambda area: area[1], reverse=True
    )
    found = []
    for table_idx, area in enumerate(ordered_areas):
        cols, rows = self._generate_columns_and_rows(table_idx, area)
        table = self._generate_table(table_idx, cols, rows)
        table._bbox = area
        found.append(table)
    return found

View File

@ -252,7 +252,6 @@ class Lattice(BaseParser):
table_bbox, vertical_segments, horizontal_segments, pdf_scalers
)
def _generate_columns_and_rows(self, tk):
# select elements which lie within table_bbox
v_s, h_s = segments_in_bbox(

View File

@ -37,7 +37,7 @@ class PlotMethods(object):
raise NotImplementedError(
"Lattice flavor does not support kind='{}'".format(kind)
)
elif table.flavor == "stream" and kind in ["line"]:
elif table.flavor in ["stream", "hybrid"] and kind in ["line"]:
raise NotImplementedError(
"Stream flavor does not support kind='{}'".format(kind)
)

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 113 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 113 KiB

View File

@ -72,6 +72,26 @@ def test_cli_stream():
assert format_error in result.output
def test_cli_hybrid():
    """The hybrid sub-command exports a table and reports missing options."""
    with TemporaryDirectory() as tempdir:
        source_pdf = os.path.join(testdir, "budget.pdf")
        target_csv = os.path.join(tempdir, "budget.csv")
        runner = CliRunner()

        # Happy path: both --format and --output are given.
        outcome = runner.invoke(
            cli,
            ["--format", "csv", "--output", target_csv, "hybrid", source_pdf],
        )
        assert outcome.exit_code == 0
        assert outcome.output == "Found 1 tables\n"

        # Missing --output.
        outcome = runner.invoke(
            cli, ["--format", "csv", "hybrid", source_pdf]
        )
        assert (
            "Error: Please specify output file path using --output"
            in outcome.output
        )

        # Missing --format.
        outcome = runner.invoke(
            cli, ["--output", target_csv, "hybrid", source_pdf]
        )
        assert (
            "Please specify output file format using --format"
            in outcome.output
        )
def test_cli_password():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, "health_protected.pdf")

View File

@ -148,6 +148,115 @@ def test_stream_layout_kwargs():
assert_frame_equal(df, tables[0].df)
def test_hybrid():
    """Default hybrid parsing of health.pdf matches the stream fixture."""
    expected = pd.DataFrame(data_stream)
    pdf_path = os.path.join(testdir, "health.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_table_rotated():
    """Clockwise and anticlockwise rotated tables give the same frame."""
    expected = pd.DataFrame(data_stream_table_rotated)
    for pdf_name in ("clockwise_table_2.pdf", "anticlockwise_table_2.pdf"):
        pdf_path = os.path.join(testdir, pdf_name)
        parsed = camelot.read_pdf(pdf_path, flavor="hybrid")
        assert_frame_equal(expected, parsed[0].df)
def test_hybrid_two_tables():
    """Two tables are found on the page, in the expected order."""
    first_expected = pd.DataFrame(data_stream_two_tables_1)
    second_expected = pd.DataFrame(data_stream_two_tables_2)

    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")

    assert len(parsed) == 2
    assert first_expected.equals(parsed[0].df)
    assert second_expected.equals(parsed[1].df)
def test_hybrid_table_regions():
    """Restricting detection to a page region yields the expected table."""
    expected = pd.DataFrame(data_stream_table_areas)
    pdf_path = os.path.join(testdir, "tabula/us-007.pdf")
    region = "320,460,573,335"
    parsed = camelot.read_pdf(
        pdf_path, flavor="hybrid", table_regions=[region]
    )
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_table_areas():
    """An explicit table area yields the expected table."""
    expected = pd.DataFrame(data_stream_table_areas)
    pdf_path = os.path.join(testdir, "tabula/us-007.pdf")
    area = "320,500,573,335"
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", table_areas=[area])
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_columns():
    """User-supplied column separators are honoured."""
    expected = pd.DataFrame(data_stream_columns)
    pdf_path = os.path.join(testdir, "mexican_towns.pdf")
    separators = "67,180,230,425,475"
    parsed = camelot.read_pdf(
        pdf_path, flavor="hybrid", columns=[separators], row_tol=10
    )
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_split_text():
    """split_text=True with explicit columns gives the expected frame."""
    expected = pd.DataFrame(data_stream_split_text)
    pdf_path = os.path.join(testdir, "tabula/m27.pdf")
    options = {
        "flavor": "hybrid",
        "columns": ["72,95,209,327,442,529,566,606,683"],
        "split_text": True,
    }
    parsed = camelot.read_pdf(pdf_path, **options)
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_flag_size():
    """flag_size=True output matches the stream fixture."""
    expected = pd.DataFrame(data_stream_flag_size)
    pdf_path = os.path.join(testdir, "superscript.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", flag_size=True)
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_strip_text():
    """strip_text output matches the stream fixture."""
    expected = pd.DataFrame(data_stream_strip_text)
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", strip_text=" ,\n")
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_edge_tol():
    """A large edge_tol gives the expected frame for edge_tol.pdf."""
    expected = pd.DataFrame(data_stream_edge_tol)
    pdf_path = os.path.join(testdir, "edge_tol.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid", edge_tol=500)
    assert_frame_equal(expected, parsed[0].df)
def test_hybrid_layout_kwargs():
    """layout_kwargs are accepted by the hybrid flavor."""
    expected = pd.DataFrame(data_stream_layout_kwargs)
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
    layout_options = {"detect_vertical": False}
    parsed = camelot.read_pdf(
        pdf_path, flavor="hybrid", layout_kwargs=layout_options
    )
    assert_frame_equal(expected, parsed[0].df)
def test_lattice():
df = pd.DataFrame(data_lattice)

View File

@ -55,6 +55,16 @@ def test_stream_grid_plot():
return camelot.plot(tables[0], kind='grid')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
                    reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_hybrid_grid_plot():
    """Grid plot of the first hybrid table in foo.pdf."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor="hybrid")[0]
    return camelot.plot(first_table, kind='grid')
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_lattice_contour_plot():
@ -73,6 +83,16 @@ def test_stream_contour_plot():
return camelot.plot(tables[0], kind='contour')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
                    reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_hybrid_contour_plot():
    """Contour plot of the first hybrid table in tabula/12s0324.pdf."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='hybrid')[0]
    return camelot.plot(first_table, kind='contour')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
@ -97,7 +117,17 @@ def test_joint_plot():
reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
baseline_dir="files/baseline_plots", remove_text=True)
def test_textedge_plot():
def test_stream_textedge_plot():
filename = os.path.join(testdir, "tabula/12s0324.pdf")
tables = camelot.read_pdf(filename, flavor='stream')
return camelot.plot(tables[0], kind='textedge')
@pytest.mark.skipif(LEGACY_MATPLOTLIB,
                    reason="depends on a recent version of MatPlotLib")
@pytest.mark.mpl_image_compare(
    baseline_dir="files/baseline_plots", remove_text=True)
def test_hybrid_textedge_plot():
    """Textedge plot of the first hybrid table in tabula/12s0324.pdf."""
    pdf_path = os.path.join(testdir, "tabula/12s0324.pdf")
    first_table = camelot.read_pdf(pdf_path, flavor='hybrid')[0]
    return camelot.plot(first_table, kind='textedge')