Plot improvements, address 132
Plot takes an optional axes parameter, allowing notebooks more flexibility. The header heuristic in hybrid won't include headers which span the entire table. Added a unit test for issue #132. Fixes https://github.com/camelot-dev/camelot/issues/132
pull/153/head
parent
dbaab66e43
commit
81de841ca0
|
|
@ -5,6 +5,7 @@ from __future__ import division
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import copy
|
import copy
|
||||||
|
import math
|
||||||
|
|
||||||
from .base import TextBaseParser
|
from .base import TextBaseParser
|
||||||
from ..core import (
|
from ..core import (
|
||||||
|
|
@ -120,7 +121,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if max_spread <= MAX_COL_SPREAD_IN_HEADER:
|
|
||||||
|
# Accept textlines that cross columns boundaries, as long as they
|
||||||
|
# cross less than MAX_COL_SPREAD_IN_HEADER, and half the number of
|
||||||
|
# columns.
|
||||||
|
# This is to avoid picking unrelated paragraphs.
|
||||||
|
if max_spread <= min(
|
||||||
|
MAX_COL_SPREAD_IN_HEADER,
|
||||||
|
math.ceil(len(col_anchors) / 2)
|
||||||
|
):
|
||||||
# Combined, the elements we've identified don't cross more
|
# Combined, the elements we've identified don't cross more
|
||||||
# than the authorized number of columns.
|
# than the authorized number of columns.
|
||||||
# We're trying to avoid
|
# We're trying to avoid
|
||||||
|
|
|
||||||
|
|
@ -65,9 +65,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
|
||||||
ax : matplotlib.axes.Axes
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
to_pdf_scale : bool
|
to_pdf_scale : bool (optional)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
img = table.get_pdf_image()
|
img = table.get_pdf_image()
|
||||||
|
|
@ -83,6 +83,7 @@ def draw_parse_constraints(table, ax):
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
ax : matplotlib.axes.Axes
|
ax : matplotlib.axes.Axes
|
||||||
|
|
||||||
|
|
@ -110,8 +111,33 @@ def draw_parse_constraints(table, ax):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_plot(table, ax=None, to_pdf_scale=True):
|
||||||
|
"""Initialize plot and draw common components
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
to_pdf_scale :
|
||||||
|
|
||||||
|
ax : matplotlib.axes.Axes
|
||||||
|
|
||||||
|
to_pdf_scale : bool (optional)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
ax : matplotlib.axes.Axes
|
||||||
|
"""
|
||||||
|
if ax is None:
|
||||||
|
fig = plt.figure()
|
||||||
|
ax = fig.add_subplot(111, aspect="equal")
|
||||||
|
draw_pdf(table, ax, to_pdf_scale)
|
||||||
|
draw_parse_constraints(table, ax)
|
||||||
|
return ax
|
||||||
|
|
||||||
|
|
||||||
class PlotMethods(object):
|
class PlotMethods(object):
|
||||||
def __call__(self, table, kind="text", filename=None):
|
def __call__(self, table, kind="text", filename=None, ax=None):
|
||||||
"""Plot elements found on PDF page based on kind
|
"""Plot elements found on PDF page based on kind
|
||||||
specified, useful for debugging and playing with different
|
specified, useful for debugging and playing with different
|
||||||
parameters to get the best output.
|
parameters to get the best output.
|
||||||
|
|
@ -144,26 +170,24 @@ class PlotMethods(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
plot_method = getattr(self, kind)
|
plot_method = getattr(self, kind)
|
||||||
return plot_method(table)
|
return plot_method(table, ax)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def text(table):
|
def text(table, ax=None):
|
||||||
"""Generates a plot for all text elements present
|
"""Generates a plot for all text elements present
|
||||||
on the PDF page.
|
on the PDF page.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
ax = prepare_plot(table, ax)
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
|
||||||
draw_pdf(table, ax)
|
|
||||||
draw_parse_constraints(table, ax)
|
|
||||||
xs, ys = [], []
|
xs, ys = [], []
|
||||||
for t in table._text:
|
for t in table._text:
|
||||||
xs.extend([t[0], t[2]])
|
xs.extend([t[0], t[2]])
|
||||||
|
|
@ -178,26 +202,24 @@ class PlotMethods(object):
|
||||||
)
|
)
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
return fig
|
return ax.get_figure()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def grid(table):
|
def grid(table, ax=None):
|
||||||
"""Generates a plot for the detected table grids
|
"""Generates a plot for the detected table grids
|
||||||
on the PDF page.
|
on the PDF page.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
ax = prepare_plot(table, ax)
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
|
||||||
draw_pdf(table, ax)
|
|
||||||
draw_parse_constraints(table, ax)
|
|
||||||
for row in table.cells:
|
for row in table.cells:
|
||||||
for cell in row:
|
for cell in row:
|
||||||
if cell.left:
|
if cell.left:
|
||||||
|
|
@ -208,27 +230,25 @@ class PlotMethods(object):
|
||||||
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
||||||
if cell.bottom:
|
if cell.bottom:
|
||||||
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
||||||
return fig
|
return ax.get_figure()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def contour(table):
|
def contour(table, ax=None):
|
||||||
"""Generates a plot for all table boundaries present
|
"""Generates a plot for all table boundaries present
|
||||||
on the PDF page.
|
on the PDF page.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
|
||||||
_FOR_LATTICE = table.flavor == "lattice"
|
_FOR_LATTICE = table.flavor == "lattice"
|
||||||
draw_pdf(table, ax, to_pdf_scale=not _FOR_LATTICE)
|
ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
|
||||||
draw_parse_constraints(table, ax)
|
|
||||||
|
|
||||||
if _FOR_LATTICE:
|
if _FOR_LATTICE:
|
||||||
table_bbox = table._bbox_unscaled
|
table_bbox = table._bbox_unscaled
|
||||||
|
|
@ -260,25 +280,23 @@ class PlotMethods(object):
|
||||||
ys.extend([t[1], t[3]])
|
ys.extend([t[1], t[3]])
|
||||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
return fig
|
return ax.get_figure()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def textedge(table):
|
def textedge(table, ax=None):
|
||||||
"""Generates a plot for relevant textedges.
|
"""Generates a plot for relevant textedges.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
ax = prepare_plot(table, ax)
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
|
||||||
draw_pdf(table, ax)
|
|
||||||
draw_parse_constraints(table, ax)
|
|
||||||
xs, ys = [], []
|
xs, ys = [], []
|
||||||
for t in table._text:
|
for t in table._text:
|
||||||
xs.extend([t[0], t[2]])
|
xs.extend([t[0], t[2]])
|
||||||
|
|
@ -352,26 +370,24 @@ class PlotMethods(object):
|
||||||
else:
|
else:
|
||||||
for te in table._textedges:
|
for te in table._textedges:
|
||||||
ax.plot([te.coord, te.coord], [te.y0, te.y1])
|
ax.plot([te.coord, te.coord], [te.y0, te.y1])
|
||||||
return fig
|
return ax.get_figure()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def joint(table):
|
def joint(table, ax=None):
|
||||||
"""Generates a plot for all line intersections present
|
"""Generates a plot for all line intersections present
|
||||||
on the PDF page.
|
on the PDF page.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
ax = prepare_plot(table, ax, to_pdf_scale=False)
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
|
||||||
draw_pdf(table, ax, to_pdf_scale=False)
|
|
||||||
draw_parse_constraints(table, ax)
|
|
||||||
table_bbox = table._bbox_unscaled
|
table_bbox = table._bbox_unscaled
|
||||||
x_coord = []
|
x_coord = []
|
||||||
y_coord = []
|
y_coord = []
|
||||||
|
|
@ -380,53 +396,48 @@ class PlotMethods(object):
|
||||||
x_coord.append(coord[0])
|
x_coord.append(coord[0])
|
||||||
y_coord.append(coord[1])
|
y_coord.append(coord[1])
|
||||||
ax.plot(x_coord, y_coord, "ro")
|
ax.plot(x_coord, y_coord, "ro")
|
||||||
return fig
|
return ax.get_figure()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def line(table):
|
def line(table, ax=None):
|
||||||
"""Generates a plot for all line segments present
|
"""Generates a plot for all line segments present
|
||||||
on the PDF page.
|
on the PDF page.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
ax = prepare_plot(table, ax)
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
|
||||||
draw_pdf(table, ax)
|
|
||||||
draw_parse_constraints(table, ax)
|
|
||||||
vertical, horizontal = table._segments
|
vertical, horizontal = table._segments
|
||||||
for v in vertical:
|
for v in vertical:
|
||||||
ax.plot([v[0], v[2]], [v[1], v[3]])
|
ax.plot([v[0], v[2]], [v[1], v[3]])
|
||||||
for h in horizontal:
|
for h in horizontal:
|
||||||
ax.plot([h[0], h[2]], [h[1], h[3]])
|
ax.plot([h[0], h[2]], [h[1], h[3]])
|
||||||
return fig
|
return ax.get_figure()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def hybrid_table_search(table):
|
def hybrid_table_search(table, ax=None):
|
||||||
"""Generates a plot illustrating the steps of the hybrid table search.
|
"""Generates a plot illustrating the steps of the hybrid table search.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
table : camelot.core.Table
|
table : camelot.core.Table
|
||||||
|
ax : matplotlib.axes.Axes (optional)
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
fig : matplotlib.fig.Figure
|
fig : matplotlib.fig.Figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
fig = plt.figure()
|
ax = prepare_plot(table, ax)
|
||||||
ax = fig.add_subplot(111, aspect="equal")
|
|
||||||
draw_pdf(table, ax)
|
|
||||||
draw_parse_constraints(table, ax)
|
|
||||||
|
|
||||||
if table.parse_details is None:
|
if table.parse_details is None:
|
||||||
return fig
|
return ax.get_figure()
|
||||||
parse_details = table.parse_details
|
parse_details = table.parse_details
|
||||||
for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
|
for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
|
||||||
max_h_gap = bbox_search["max_h_gap"]
|
max_h_gap = bbox_search["max_h_gap"]
|
||||||
|
|
@ -476,4 +487,4 @@ class PlotMethods(object):
|
||||||
label_pos="bottom,left"
|
label_pos="bottom,left"
|
||||||
)
|
)
|
||||||
|
|
||||||
return fig
|
return ax.get_figure()
|
||||||
|
|
|
||||||
|
|
@ -1609,6 +1609,22 @@ data_stream_two_tables_2 = [
|
||||||
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
data_hybrid_two_tables_b_1 = [
|
||||||
|
["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
|
||||||
|
["Vgvhgh", "Hj", "Hj", "Hj"],
|
||||||
|
["Hj", "Hj", "Hj", "Hj"],
|
||||||
|
["Hj", "Hj", "J", "Hj"],
|
||||||
|
["V", "C", "D", "Gfhj"],
|
||||||
|
["Hjb", "B", "Jhbh", "Hj"],
|
||||||
|
["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
|
||||||
|
]
|
||||||
|
|
||||||
|
data_hybrid_two_tables_b_2 = [
|
||||||
|
["Trtrt", "H", "Gh"],
|
||||||
|
["Gh", "V", "Hv"],
|
||||||
|
["Hv", "Bhjb", "hg"],
|
||||||
|
]
|
||||||
|
|
||||||
# The streaming algorithm incorrectly includes a header and a footer.
|
# The streaming algorithm incorrectly includes a header and a footer.
|
||||||
# Trimming the table for the test of hybrid, which doesn't include it.
|
# Trimming the table for the test of hybrid, which doesn't include it.
|
||||||
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
|
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -175,7 +175,7 @@ def test_hybrid_table_rotated():
|
||||||
assert_frame_equal(df, tables[0].df)
|
assert_frame_equal(df, tables[0].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_two_tables():
|
def test_hybrid_two_tables_a():
|
||||||
df1 = pd.DataFrame(data_hybrid_two_tables_1)
|
df1 = pd.DataFrame(data_hybrid_two_tables_1)
|
||||||
df2 = pd.DataFrame(data_hybrid_two_tables_2)
|
df2 = pd.DataFrame(data_hybrid_two_tables_2)
|
||||||
|
|
||||||
|
|
@ -187,6 +187,19 @@ def test_hybrid_two_tables():
|
||||||
assert df2.equals(tables[1].df)
|
assert df2.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
|
# Reported as https://github.com/camelot-dev/camelot/issues/132
|
||||||
|
def test_hybrid_two_tables_b():
|
||||||
|
df1 = pd.DataFrame(data_hybrid_two_tables_b_1)
|
||||||
|
df2 = pd.DataFrame(data_hybrid_two_tables_b_2)
|
||||||
|
|
||||||
|
filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
|
||||||
|
tables = camelot.read_pdf(filename, flavor="hybrid")
|
||||||
|
|
||||||
|
assert len(tables) == 2
|
||||||
|
assert df1.equals(tables[0].df)
|
||||||
|
assert df2.equals(tables[1].df)
|
||||||
|
|
||||||
|
|
||||||
def test_hybrid_table_regions():
|
def test_hybrid_table_regions():
|
||||||
df = pd.DataFrame(data_hybrid_table_regions)
|
df = pd.DataFrame(data_hybrid_table_regions)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue