Plot improvements; address issue #132

The plot methods now take an optional `ax` (matplotlib axes) parameter,
giving notebooks more flexibility over where the plot is drawn.
The header heuristic in the hybrid parser no longer accepts, as headers,
textlines which span the entire table.
Added unit test for issue #132

Fixes https://github.com/camelot-dev/camelot/issues/132
pull/153/head
Frh 2020-04-25 20:51:00 -07:00
parent 84ec5c6acd
commit 016776939e
5 changed files with 99 additions and 50 deletions

View File

@ -5,6 +5,7 @@ from __future__ import division
import numpy as np
import copy
import math
from .base import TextBaseParser
from ..core import (
@ -120,7 +121,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
)
)
)
if max_spread <= MAX_COL_SPREAD_IN_HEADER:
# Accept textlines that cross column boundaries, as long as they
# span no more than MAX_COL_SPREAD_IN_HEADER columns and no more
# than half the number of columns (rounded up).
# This is to avoid picking up unrelated paragraphs.
if max_spread <= min(
MAX_COL_SPREAD_IN_HEADER,
math.ceil(len(col_anchors) / 2)
):
# Combined, the elements we've identified don't cross more
# than the authorized number of columns.
# We're trying to avoid

View File

@ -65,9 +65,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes
ax : matplotlib.axes.Axes (optional)
to_pdf_scale : bool
to_pdf_scale : bool (optional)
"""
img = table.get_pdf_image()
@ -83,6 +83,7 @@ def draw_parse_constraints(table, ax):
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
ax : matplotlib.axes.Axes
@ -110,8 +111,33 @@ def draw_parse_constraints(table, ax):
)
def prepare_plot(table, ax=None, to_pdf_scale=True):
    """Set up the axes for a plot and draw the components shared by all plots.

    Parameters
    ----------
    table : camelot.core.Table
    ax : matplotlib.axes.Axes (optional)
        Axes to draw on. When omitted, a new figure is created with a
        single equal-aspect subplot.
    to_pdf_scale : bool (optional)
        Forwarded unchanged to draw_pdf.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axes that were drawn on (the one passed in, or the new one).

    """
    if ax is not None:
        axes = ax
    else:
        # No axes supplied by the caller: create a fresh figure for the plot.
        axes = plt.figure().add_subplot(111, aspect="equal")
    draw_pdf(table, axes, to_pdf_scale)
    draw_parse_constraints(table, axes)
    return axes
class PlotMethods(object):
def __call__(self, table, kind="text", filename=None):
def __call__(self, table, kind="text", filename=None, ax=None):
"""Plot elements found on PDF page based on kind
specified, useful for debugging and playing with different
parameters to get the best output.
@ -144,26 +170,24 @@ class PlotMethods(object):
)
plot_method = getattr(self, kind)
return plot_method(table)
return plot_method(table, ax)
@staticmethod
def text(table):
def text(table, ax=None):
"""Generates a plot for all text elements present
on the PDF page.
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
fig : matplotlib.fig.Figure
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
ax = prepare_plot(table, ax)
xs, ys = [], []
for t in table._text:
xs.extend([t[0], t[2]])
@ -178,26 +202,24 @@ class PlotMethods(object):
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
return fig
return ax.get_figure()
@staticmethod
def grid(table):
def grid(table, ax=None):
"""Generates a plot for the detected table grids
on the PDF page.
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
fig : matplotlib.fig.Figure
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
ax = prepare_plot(table, ax)
for row in table.cells:
for cell in row:
if cell.left:
@ -208,27 +230,25 @@ class PlotMethods(object):
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
if cell.bottom:
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
return fig
return ax.get_figure()
@staticmethod
def contour(table):
def contour(table, ax=None):
"""Generates a plot for all table boundaries present
on the PDF page.
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
fig : matplotlib.fig.Figure
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
_FOR_LATTICE = table.flavor == "lattice"
draw_pdf(table, ax, to_pdf_scale=not _FOR_LATTICE)
draw_parse_constraints(table, ax)
ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
if _FOR_LATTICE:
table_bbox = table._bbox_unscaled
@ -260,25 +280,23 @@ class PlotMethods(object):
ys.extend([t[1], t[3]])
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
return fig
return ax.get_figure()
@staticmethod
def textedge(table):
def textedge(table, ax=None):
"""Generates a plot for relevant textedges.
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
fig : matplotlib.fig.Figure
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
ax = prepare_plot(table, ax)
xs, ys = [], []
for t in table._text:
xs.extend([t[0], t[2]])
@ -352,26 +370,24 @@ class PlotMethods(object):
else:
for te in table._textedges:
ax.plot([te.coord, te.coord], [te.y0, te.y1])
return fig
return ax.get_figure()
@staticmethod
def joint(table):
def joint(table, ax=None):
"""Generates a plot for all line intersections present
on the PDF page.
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
fig : matplotlib.fig.Figure
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax, to_pdf_scale=False)
draw_parse_constraints(table, ax)
ax = prepare_plot(table, ax, to_pdf_scale=False)
table_bbox = table._bbox_unscaled
x_coord = []
y_coord = []
@ -380,53 +396,48 @@ class PlotMethods(object):
x_coord.append(coord[0])
y_coord.append(coord[1])
ax.plot(x_coord, y_coord, "ro")
return fig
return ax.get_figure()
@staticmethod
def line(table):
def line(table, ax=None):
"""Generates a plot for all line segments present
on the PDF page.
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
fig : matplotlib.fig.Figure
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
ax = prepare_plot(table, ax)
vertical, horizontal = table._segments
for v in vertical:
ax.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal:
ax.plot([h[0], h[2]], [h[1], h[3]])
return fig
return ax.get_figure()
@staticmethod
def hybrid_table_search(table):
def hybrid_table_search(table, ax=None):
"""Generates a plot illustrating the steps of the hybrid table search.
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns
-------
fig : matplotlib.fig.Figure
"""
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
ax = prepare_plot(table, ax)
if table.parse_details is None:
return fig
return ax.get_figure()
parse_details = table.parse_details
for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
max_h_gap = bbox_search["max_h_gap"]
@ -476,4 +487,4 @@ class PlotMethods(object):
label_pos="bottom,left"
)
return fig
return ax.get_figure()

View File

@ -1609,6 +1609,22 @@ data_stream_two_tables_2 = [
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
]
# Expected rows of the first table (tables[0]) read with flavor="hybrid"
# from camelot-issue-132-multiple-tables.pdf; compared against the parsed
# DataFrame in test_hybrid_two_tables_b.
data_hybrid_two_tables_b_1 = [
["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
["Vgvhgh", "Hj", "Hj", "Hj"],
["Hj", "Hj", "Hj", "Hj"],
["Hj", "Hj", "J", "Hj"],
["V", "C", "D", "Gfhj"],
["Hjb", "B", "Jhbh", "Hj"],
["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
]
# Expected rows of the second table (tables[1]) read with flavor="hybrid"
# from camelot-issue-132-multiple-tables.pdf; compared against the parsed
# DataFrame in test_hybrid_two_tables_b.
data_hybrid_two_tables_b_2 = [
["Trtrt", "H", "Gh"],
["Gh", "V", "Hv"],
["Hv", "Bhjb", "hg"],
]
# The streaming algorithm incorrectly includes a header and a footer.
# Trim the stream expectation for the hybrid test, whose output doesn't
# include them: the slice drops the first three rows and the last row.
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]

View File

@ -169,7 +169,7 @@ def test_hybrid_table_rotated():
assert_frame_equal(df, tables[0].df)
def test_hybrid_two_tables():
def test_hybrid_two_tables_a():
df1 = pd.DataFrame(data_hybrid_two_tables_1)
df2 = pd.DataFrame(data_hybrid_two_tables_2)
@ -181,6 +181,19 @@ def test_hybrid_two_tables():
assert df2.equals(tables[1].df)
# Reported as https://github.com/camelot-dev/camelot/issues/132
def test_hybrid_two_tables_b():
    """Check that the hybrid flavor finds both tables in the issue-132 PDF."""
    pdf_path = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
    expected_first = pd.DataFrame(data_hybrid_two_tables_b_1)
    expected_second = pd.DataFrame(data_hybrid_two_tables_b_2)

    tables = camelot.read_pdf(pdf_path, flavor="hybrid")

    # Exactly two tables, each matching its expected contents in order.
    assert len(tables) == 2
    assert expected_first.equals(tables[0].df)
    assert expected_second.equals(tables[1].df)
def test_hybrid_table_regions():
df = pd.DataFrame(data_hybrid_table_regions)