Plot improvements, address issue #132

Plot takes an optional axes parameter, allowing notebooks more
flexibility.
Header heuristic in hybrid won't include headers which span the
entire table.
Added unit test for issue #132

Fixes https://github.com/camelot-dev/camelot/issues/132
pull/153/head
Frh 2020-04-25 20:51:00 -07:00
parent dbaab66e43
commit 81de841ca0
5 changed files with 99 additions and 50 deletions

View File

@ -5,6 +5,7 @@ from __future__ import division
import numpy as np import numpy as np
import copy import copy
import math
from .base import TextBaseParser from .base import TextBaseParser
from ..core import ( from ..core import (
@ -120,7 +121,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
) )
) )
) )
if max_spread <= MAX_COL_SPREAD_IN_HEADER:
# Accept textlines that cross column boundaries, as long as they
# cross at most MAX_COL_SPREAD_IN_HEADER of them and at most half
# the number of columns (rounded up).
# This is to avoid picking up unrelated paragraphs.
if max_spread <= min(
MAX_COL_SPREAD_IN_HEADER,
math.ceil(len(col_anchors) / 2)
):
# Combined, the elements we've identified don't cross more # Combined, the elements we've identified don't cross more
# than the authorized number of columns. # than the authorized number of columns.
# We're trying to avoid # We're trying to avoid

View File

@ -65,9 +65,9 @@ def draw_pdf(table, ax, to_pdf_scale=True):
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes ax : matplotlib.axes.Axes (optional)
to_pdf_scale : bool to_pdf_scale : bool (optional)
""" """
img = table.get_pdf_image() img = table.get_pdf_image()
@ -83,6 +83,7 @@ def draw_parse_constraints(table, ax):
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
ax : matplotlib.axes.Axes ax : matplotlib.axes.Axes
@ -110,8 +111,33 @@ def draw_parse_constraints(table, ax):
) )
def prepare_plot(table, ax=None, to_pdf_scale=True):
    """Set up the axes and draw the components shared by every plot.

    Parameters
    ----------
    table : camelot.core.Table
    ax : matplotlib.axes.Axes (optional)
        Axes to draw on. When omitted, a new figure holding a single
        equal-aspect subplot is created and used instead.
    to_pdf_scale : bool (optional)
        Forwarded unchanged to draw_pdf.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The axes that were drawn on (the one passed in, or the newly
        created one).

    """
    axes = ax
    if axes is None:
        # No axes supplied by the caller: build a fresh figure for them.
        axes = plt.figure().add_subplot(111, aspect="equal")

    # Layers common to all plot kinds.
    draw_pdf(table, axes, to_pdf_scale)
    draw_parse_constraints(table, axes)
    return axes
class PlotMethods(object): class PlotMethods(object):
def __call__(self, table, kind="text", filename=None): def __call__(self, table, kind="text", filename=None, ax=None):
"""Plot elements found on PDF page based on kind """Plot elements found on PDF page based on kind
specified, useful for debugging and playing with different specified, useful for debugging and playing with different
parameters to get the best output. parameters to get the best output.
@ -144,26 +170,24 @@ class PlotMethods(object):
) )
plot_method = getattr(self, kind) plot_method = getattr(self, kind)
return plot_method(table) return plot_method(table, ax)
@staticmethod @staticmethod
def text(table): def text(table, ax=None):
"""Generates a plot for all text elements present """Generates a plot for all text elements present
on the PDF page. on the PDF page.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns Returns
------- -------
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
fig = plt.figure() ax = prepare_plot(table, ax)
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
xs, ys = [], [] xs, ys = [], []
for t in table._text: for t in table._text:
xs.extend([t[0], t[2]]) xs.extend([t[0], t[2]])
@ -178,26 +202,24 @@ class PlotMethods(object):
) )
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
return fig return ax.get_figure()
@staticmethod @staticmethod
def grid(table): def grid(table, ax=None):
"""Generates a plot for the detected table grids """Generates a plot for the detected table grids
on the PDF page. on the PDF page.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns Returns
------- -------
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
fig = plt.figure() ax = prepare_plot(table, ax)
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
for row in table.cells: for row in table.cells:
for cell in row: for cell in row:
if cell.left: if cell.left:
@ -208,27 +230,25 @@ class PlotMethods(object):
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]]) ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
if cell.bottom: if cell.bottom:
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]]) ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
return fig return ax.get_figure()
@staticmethod @staticmethod
def contour(table): def contour(table, ax=None):
"""Generates a plot for all table boundaries present """Generates a plot for all table boundaries present
on the PDF page. on the PDF page.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns Returns
------- -------
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
_FOR_LATTICE = table.flavor == "lattice" _FOR_LATTICE = table.flavor == "lattice"
draw_pdf(table, ax, to_pdf_scale=not _FOR_LATTICE) ax = prepare_plot(table, ax, to_pdf_scale=not _FOR_LATTICE)
draw_parse_constraints(table, ax)
if _FOR_LATTICE: if _FOR_LATTICE:
table_bbox = table._bbox_unscaled table_bbox = table._bbox_unscaled
@ -260,25 +280,23 @@ class PlotMethods(object):
ys.extend([t[1], t[3]]) ys.extend([t[1], t[3]])
ax.set_xlim(min(xs) - 10, max(xs) + 10) ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
return fig return ax.get_figure()
@staticmethod @staticmethod
def textedge(table): def textedge(table, ax=None):
"""Generates a plot for relevant textedges. """Generates a plot for relevant textedges.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns Returns
------- -------
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
fig = plt.figure() ax = prepare_plot(table, ax)
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
xs, ys = [], [] xs, ys = [], []
for t in table._text: for t in table._text:
xs.extend([t[0], t[2]]) xs.extend([t[0], t[2]])
@ -352,26 +370,24 @@ class PlotMethods(object):
else: else:
for te in table._textedges: for te in table._textedges:
ax.plot([te.coord, te.coord], [te.y0, te.y1]) ax.plot([te.coord, te.coord], [te.y0, te.y1])
return fig return ax.get_figure()
@staticmethod @staticmethod
def joint(table): def joint(table, ax=None):
"""Generates a plot for all line intersections present """Generates a plot for all line intersections present
on the PDF page. on the PDF page.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns Returns
------- -------
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
fig = plt.figure() ax = prepare_plot(table, ax, to_pdf_scale=False)
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax, to_pdf_scale=False)
draw_parse_constraints(table, ax)
table_bbox = table._bbox_unscaled table_bbox = table._bbox_unscaled
x_coord = [] x_coord = []
y_coord = [] y_coord = []
@ -380,53 +396,48 @@ class PlotMethods(object):
x_coord.append(coord[0]) x_coord.append(coord[0])
y_coord.append(coord[1]) y_coord.append(coord[1])
ax.plot(x_coord, y_coord, "ro") ax.plot(x_coord, y_coord, "ro")
return fig return ax.get_figure()
@staticmethod @staticmethod
def line(table): def line(table, ax=None):
"""Generates a plot for all line segments present """Generates a plot for all line segments present
on the PDF page. on the PDF page.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns Returns
------- -------
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
fig = plt.figure() ax = prepare_plot(table, ax)
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
vertical, horizontal = table._segments vertical, horizontal = table._segments
for v in vertical: for v in vertical:
ax.plot([v[0], v[2]], [v[1], v[3]]) ax.plot([v[0], v[2]], [v[1], v[3]])
for h in horizontal: for h in horizontal:
ax.plot([h[0], h[2]], [h[1], h[3]]) ax.plot([h[0], h[2]], [h[1], h[3]])
return fig return ax.get_figure()
@staticmethod @staticmethod
def hybrid_table_search(table): def hybrid_table_search(table, ax=None):
"""Generates a plot illustrating the steps of the hybrid table search. """Generates a plot illustrating the steps of the hybrid table search.
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
Returns Returns
------- -------
fig : matplotlib.fig.Figure fig : matplotlib.fig.Figure
""" """
fig = plt.figure() ax = prepare_plot(table, ax)
ax = fig.add_subplot(111, aspect="equal")
draw_pdf(table, ax)
draw_parse_constraints(table, ax)
if table.parse_details is None: if table.parse_details is None:
return fig return ax.get_figure()
parse_details = table.parse_details parse_details = table.parse_details
for box_id, bbox_search in enumerate(parse_details["bbox_searches"]): for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
max_h_gap = bbox_search["max_h_gap"] max_h_gap = bbox_search["max_h_gap"]
@ -476,4 +487,4 @@ class PlotMethods(object):
label_pos="bottom,left" label_pos="bottom,left"
) )
return fig return ax.get_figure()

View File

@ -1609,6 +1609,22 @@ data_stream_two_tables_2 = [
["1 Except forcible rape and prostitution.", "", "", "", "", ""], ["1 Except forcible rape and prostitution.", "", "", "", "", ""],
] ]
# Expected rows of the first table (tables[0]) checked by
# test_hybrid_two_tables_b, which parses
# "camelot-issue-132-multiple-tables.pdf" with flavor="hybrid".
data_hybrid_two_tables_b_1 = [
["1", "Ghfhbdhj", "1", "Hgfdhgjsdhjdsf"],
["Vgvhgh", "Hj", "Hj", "Hj"],
["Hj", "Hj", "Hj", "Hj"],
["Hj", "Hj", "J", "Hj"],
["V", "C", "D", "Gfhj"],
["Hjb", "B", "Jhbh", "Hj"],
["Hjdhshj", "Hjhjhh", "Ddnj", "dsxv"],
]
# Expected rows of the second table (tables[1]) checked by
# test_hybrid_two_tables_b, which parses
# "camelot-issue-132-multiple-tables.pdf" with flavor="hybrid".
data_hybrid_two_tables_b_2 = [
["Trtrt", "H", "Gh"],
["Gh", "V", "Hv"],
["Hv", "Bhjb", "hg"],
]
# The streaming algorithm incorrectly includes a header and a footer. # The streaming algorithm incorrectly includes a header and a footer.
# Trimming the table for the test of hybrid, which doesn't include it. # Trimming the table for the test of hybrid, which doesn't include it.
data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1] data_hybrid_two_tables_2 = data_stream_two_tables_2[3:-1]

View File

@ -175,7 +175,7 @@ def test_hybrid_table_rotated():
assert_frame_equal(df, tables[0].df) assert_frame_equal(df, tables[0].df)
def test_hybrid_two_tables(): def test_hybrid_two_tables_a():
df1 = pd.DataFrame(data_hybrid_two_tables_1) df1 = pd.DataFrame(data_hybrid_two_tables_1)
df2 = pd.DataFrame(data_hybrid_two_tables_2) df2 = pd.DataFrame(data_hybrid_two_tables_2)
@ -187,6 +187,19 @@ def test_hybrid_two_tables():
assert df2.equals(tables[1].df) assert df2.equals(tables[1].df)
# Reported as https://github.com/camelot-dev/camelot/issues/132
def test_hybrid_two_tables_b():
    """Check that the hybrid flavor finds both expected tables in the issue #132 PDF."""
    expected_first = pd.DataFrame(data_hybrid_two_tables_b_1)
    expected_second = pd.DataFrame(data_hybrid_two_tables_b_2)

    pdf_path = os.path.join(testdir, "camelot-issue-132-multiple-tables.pdf")
    parsed = camelot.read_pdf(pdf_path, flavor="hybrid")

    # Exactly two tables are expected, in page order.
    assert len(parsed) == 2
    assert expected_first.equals(parsed[0].df)
    assert expected_second.equals(parsed[1].df)
def test_hybrid_table_regions(): def test_hybrid_table_regions():
df = pd.DataFrame(data_hybrid_table_regions) df = pd.DataFrame(data_hybrid_table_regions)