Plot improvements, address issue #132
Plot takes an optional axes parameter, allowing notebooks more flexibility. The header heuristic in hybrid won't include headers which span the entire table. Added a unit test for issue #132. Fixes https://github.com/camelot-dev/camelot/issues/132
pull/153/head
parent
dbaab66e43
commit
81de841ca0
|
|
@ -5,6 +5,7 @@ from __future__ import division
|
|||
|
||||
import numpy as np
|
||||
import copy
|
||||
import math
|
||||
|
||||
from .base import TextBaseParser
|
||||
from ..core import (
|
||||
|
|
@ -120,7 +121,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
|||
)
|
||||
)
|
||||
)
|
||||
if max_spread <= MAX_COL_SPREAD_IN_HEADER:
|
||||
|
||||
# Accept textlines that cross columns boundaries, as long as they
|
||||
# cross less than MAX_COL_SPREAD_IN_HEADER, and half the number of
|
||||
# columns.
|
||||
# This is to avoid picking unrelated paragraphs.
|
||||
if max_spread <= min(
|
||||
MAX_COL_SPREAD_IN_HEADER,
|
||||
math.ceil(len(col_anchors) / 2)
|
||||
):
|
||||
# Combined, the elements we've identified don't cross more
|
||||
# than the authorized number of columns.
|
||||
# We're trying to avoid
|
||||
|
|
|
|||
|
|
@ -65,9 +65,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
|
|||
----------
|
||||
table : camelot.core.Table
|
||||
|
||||
ax : matplotlib.axes.Axes
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
to_pdf_scale : bool
|
||||
to_pdf_scale : bool (optional)
|
||||
|
||||
"""
|
||||
img = table.get_pdf_image()
|
||||
|
|
@ -83,6 +83,7 @@ def draw_parse_constraints(table, ax):
|
|||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
ax : matplotlib.axes.Axes
|
||||
|
||||
|
|
@ -110,8 +111,33 @@ def draw_parse_constraints(table, ax):
|
|||
)
|
||||
|
||||
|
||||
def prepare_plot(table, ax=None, to_pdf_scale=True):
|
||||
"""Initialize plot and draw common components
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
to_pdf_scale :
|
||||
|
||||
ax : matplotlib.axes.Axes
|
||||
|
||||
to_pdf_scale : bool (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
ax : matplotlib.axes.Axes
|
||||
"""
|
||||
if ax is None:
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax, to_pdf_scale)
|
||||
draw_parse_constraints(table, ax)
|
||||
return ax
|
||||
|
||||
|
||||
class PlotMethods(object):
|
||||
def __call__(self, table, kind="text", filename=None):
|
||||
def __call__(self, table, kind="text", filename=None, ax=None):
|
||||
"""Plot elements found on PDF page based on kind
|
||||
specified, useful for debugging and playing with different
|
||||
parameters to get the best output.
|
||||
|
|
@ -144,26 +170,24 @@ class PlotMethods(object):
|
|||
)
|
||||
|
||||
plot_method = getattr(self, kind)
|
||||
return plot_method(table)
|
||||
return plot_method(table, ax)
|
||||
|
||||
@staticmethod
|
||||
def text(table):
|
||||
def text(table, ax=None):
|
||||
"""Generates a plot for all text elements present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax)
|
||||
draw_parse_constraints(table, ax)
|
||||
ax = prepare_plot(table, ax)
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
|
|
@ -178,26 +202,24 @@ class PlotMethods(object):
|
|||
)
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
def grid(table):
|
||||
def grid(table, ax=None):
|
||||
"""Generates a plot for the detected table grids
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax)
|
||||
draw_parse_constraints(table, ax)
|
||||
ax = prepare_plot(table, ax)
|
||||
for row in table.cells:
|
||||
for cell in row:
|
||||
if cell.left:
|
||||
|
|
@ -208,27 +230,25 @@ class PlotMethods(object):
|
|||
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
||||
if cell.bottom:
|
||||
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
def contour(table):
|
||||
def contour(table, ax=None):
|
||||
"""Generates a plot for all table boundaries present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
_FOR_LATTICE = table.flavor == "lattice"
|
||||
draw_pdf(table, ax, to_pdf_scale=not _FOR_LATTICE)
|
||||
draw_parse_constraints(table, ax)
|
||||
ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
|
||||
|
||||
if _FOR_LATTICE:
|
||||
table_bbox = table._bbox_unscaled
|
||||
|
|
@ -260,25 +280,23 @@ class PlotMethods(object):
|
|||
ys.extend([t[1], t[3]])
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
def textedge(table):
|
||||
def textedge(table, ax=None):
|
||||
"""Generates a plot for relevant textedges.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax)
|
||||
draw_parse_constraints(table, ax)
|
||||
ax = prepare_plot(table, ax)
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
|
|
@ -352,26 +370,24 @@ class PlotMethods(object):
|
|||
else:
|
||||
for te in table._textedges:
|
||||
ax.plot([te.coord, te.coord], [te.y0, te.y1])
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
def joint(table):
|
||||
def joint(table, ax=None):
|
||||
"""Generates a plot for all line intersections present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax, to_pdf_scale=False)
|
||||
draw_parse_constraints(table, ax)
|
||||
ax = prepare_plot(table, ax, to_pdf_scale=False)
|
||||
table_bbox = table._bbox_unscaled
|
||||
x_coord = []
|
||||
y_coord = []
|
||||
|
|
@ -380,53 +396,48 @@ class PlotMethods(object):
|
|||
x_coord.append(coord[0])
|
||||
y_coord.append(coord[1])
|
||||
ax.plot(x_coord, y_coord, "ro")
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
def line(table):
|
||||
def line(table, ax=None):
|
||||
"""Generates a plot for all line segments present
|
||||
on the PDF page.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax)
|
||||
draw_parse_constraints(table, ax)
|
||||
ax = prepare_plot(table, ax)
|
||||
vertical, horizontal = table._segments
|
||||
for v in vertical:
|
||||
ax.plot([v[0], v[2]], [v[1], v[3]])
|
||||
for h in horizontal:
|
||||
ax.plot([h[0], h[2]], [h[1], h[3]])
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
def hybrid_table_search(table):
|
||||
def hybrid_table_search(table, ax=None):
|
||||
"""Generates a plot illustrating the steps of the hybrid table search.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
Returns
|
||||
-------
|
||||
fig : matplotlib.fig.Figure
|
||||
|
||||
"""
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111, aspect="equal")
|
||||
draw_pdf(table, ax)
|
||||
draw_parse_constraints(table, ax)
|
||||
|
||||
ax = prepare_plot(table, ax)
|
||||
if table.parse_details is None:
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
parse_details = table.parse_details
|
||||
for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
|
||||
max_h_gap = bbox_search["max_h_gap"]
|
||||
|
|
@ -476,4 +487,4 @@ class PlotMethods(object):
|
|||
label_pos="bottom,left"
|
||||
)
|
||||
|
||||
return fig
|
||||
return ax.get_figure()
|
||||
|
|
|
|||
|
|
@ -1609,6 +1609,22 @@ data_stream_two_tables_2 = [
|
|||
["1 Except forcible rape and prostitution.", "", "", "", "", ""],
|
||||
]
|
||||
|
||||
data_hybrid_two_tables_b_1 = [
|
||||
["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
|
||||
["Vgvhgh", "Hj", "Hj", "Hj"],
|
||||
["Hj", "Hj", "Hj", "Hj"],
|
||||
["Hj", "Hj", "J", "Hj"],
|
||||
["V", "C", "D", "Gfhj"],
|
||||
["Hjb", "B", "Jhbh", "Hj"],
|
||||
["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
|
||||
]
|
||||
|
||||
data_hybrid_two_tables_b_2 = [
|
||||
["Trtrt", "H", "Gh"],
|
||||
["Gh", "V", "Hv"],
|
||||
["Hv", "Bhjb", "hg"],
|
||||
]
|
||||
|
||||
# The streaming algorithm incorrectly includes a header and a footer.
|
||||
# Trimming the table for the test of hybrid, which doesn't include it.
|
||||
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -175,7 +175,7 @@ def test_hybrid_table_rotated():
|
|||
assert_frame_equal(df, tables[0].df)
|
||||
|
||||
|
||||
def test_hybrid_two_tables():
|
||||
def test_hybrid_two_tables_a():
|
||||
df1 = pd.DataFrame(data_hybrid_two_tables_1)
|
||||
df2 = pd.DataFrame(data_hybrid_two_tables_2)
|
||||
|
||||
|
|
@ -187,6 +187,19 @@ def test_hybrid_two_tables():
|
|||
assert df2.equals(tables[1].df)
|
||||
|
||||
|
||||
# Reported as https://github.com/camelot-dev/camelot/issues/132
|
||||
def test_hybrid_two_tables_b():
|
||||
df1 = pd.DataFrame(data_hybrid_two_tables_b_1)
|
||||
df2 = pd.DataFrame(data_hybrid_two_tables_b_2)
|
||||
|
||||
filename = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
|
||||
tables = camelot.read_pdf(filename, flavor="hybrid")
|
||||
|
||||
assert len(tables) == 2
|
||||
assert df1.equals(tables[0].df)
|
||||
assert df2.equals(tables[1].df)
|
||||
|
||||
|
||||
def test_hybrid_table_regions():
|
||||
df = pd.DataFrame(data_hybrid_table_regions)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue