493 lines
14 KiB
Python
493 lines
14 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
try:
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.patches as patches
|
|
except ImportError:
|
|
_HAS_MPL = False
|
|
else:
|
|
_HAS_MPL = True
|
|
|
|
from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
|
|
|
|
from pdfminer.layout import (
|
|
LTTextLineVertical,
|
|
)
|
|
|
|
|
|
def extend_axe_lim(ax, bbox, margin=10):
|
|
"""Ensure the ax limits include the input bbox
|
|
"""
|
|
x0, x1 = ax.get_xlim()
|
|
y0, y1 = ax.get_ylim()
|
|
ax.set_xlim(min(x0, bbox[0] - margin), max(x1, bbox[2] + margin))
|
|
ax.set_ylim(min(y0, bbox[1] - margin), max(y1, bbox[3] + margin))
|
|
|
|
|
|
def draw_labeled_bbox(
|
|
ax, bbox, text,
|
|
color="black", linewidth=3,
|
|
linestyle="solid",
|
|
label_pos="top,left",
|
|
fontsize=12,
|
|
):
|
|
"""Utility drawing function to draw a box with an associated text label
|
|
"""
|
|
ax.add_patch(
|
|
patches.Rectangle(
|
|
(bbox[0], bbox[1]),
|
|
bbox[2] - bbox[0], bbox[3] - bbox[1],
|
|
color=color,
|
|
linewidth=linewidth, linestyle=linestyle,
|
|
fill=False
|
|
)
|
|
)
|
|
|
|
vlabel, hlabel = label_pos.split(",")
|
|
if vlabel == "top":
|
|
y = max(bbox[1], bbox[3])
|
|
elif vlabel == "bottom":
|
|
y = min(bbox[1], bbox[3])
|
|
else:
|
|
y = 0.5 * (bbox[1] + bbox[3])
|
|
|
|
# We want to draw the label outside the box (above or below)
|
|
label_align_swap = {
|
|
"top": "bottom",
|
|
"bottom": "top",
|
|
"center": "center"
|
|
}
|
|
vlabel_out_of_box = label_align_swap[vlabel]
|
|
if hlabel == "right":
|
|
x = max(bbox[0], bbox[2])
|
|
elif hlabel == "left":
|
|
x = min(bbox[0], bbox[2])
|
|
else:
|
|
x = 0.5 * (bbox[0] + bbox[2])
|
|
ax.text(
|
|
x, y,
|
|
text,
|
|
fontsize=fontsize, color="black",
|
|
verticalalignment=vlabel_out_of_box,
|
|
horizontalalignment=hlabel,
|
|
bbox=dict(facecolor=color, alpha=0.1)
|
|
)
|
|
|
|
|
|
def draw_pdf(table, ax):
|
|
"""Draw the content of the table's source pdf into the passed subplot
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
"""
|
|
img = table.get_pdf_image()
|
|
ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
|
|
|
|
|
|
def draw_parse_constraints(table, ax):
|
|
"""Draw any user provided constraints (area, region, columns, etc)
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
"""
|
|
if table.parse_details:
|
|
zone_constraints = {
|
|
"region": "table_regions",
|
|
"area": "table_areas",
|
|
}
|
|
for zone_name, zone_id in zone_constraints.items():
|
|
# Display a bbox per region / area
|
|
for zone_str in table.parse_details[zone_id] or []:
|
|
draw_labeled_bbox(
|
|
ax, bbox_from_str(zone_str),
|
|
"{zone_name}: ({zone_str})".format(
|
|
zone_name=zone_name,
|
|
zone_str=zone_str
|
|
),
|
|
color="purple",
|
|
linestyle="dotted",
|
|
linewidth=1,
|
|
label_pos="bottom,right"
|
|
)
|
|
|
|
|
|
def draw_text(table, ax):
|
|
"""Draw text, horizontal in blue, vertical in red
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
"""
|
|
bbox = bbox_from_textlines(table.textlines)
|
|
for t in table.textlines:
|
|
color = "red" if isinstance(t, LTTextLineVertical) else "blue"
|
|
ax.add_patch(
|
|
patches.Rectangle(
|
|
(t.x0, t.y0),
|
|
t.x1 - t.x0,
|
|
t.y1 - t.y0,
|
|
color=color,
|
|
alpha=0.2
|
|
)
|
|
)
|
|
extend_axe_lim(ax, bbox)
|
|
|
|
|
|
def prepare_plot(table, ax=None):
|
|
"""Initialize plot and draw common components
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
ax : matplotlib.axes.Axes
|
|
"""
|
|
if ax is None:
|
|
fig = plt.figure()
|
|
ax = fig.add_subplot(111, aspect="equal")
|
|
draw_pdf(table, ax)
|
|
draw_parse_constraints(table, ax)
|
|
return ax
|
|
|
|
|
|
class PlotMethods():
|
|
def __call__(self, table, kind="text", filename=None, ax=None):
|
|
"""Plot elements found on PDF page based on kind
|
|
specified, useful for debugging and playing with different
|
|
parameters to get the best output.
|
|
|
|
Parameters
|
|
----------
|
|
table: camelot.core.Table
|
|
A Camelot Table.
|
|
kind : str, optional (default: 'text')
|
|
{'text', 'grid', 'contour', 'joint', 'line',
|
|
'network_table_search'}
|
|
The element type for which a plot should be generated.
|
|
filepath: str, optional (default: None)
|
|
Absolute path for saving the generated plot.
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
if not _HAS_MPL:
|
|
raise ImportError("matplotlib is required for plotting.")
|
|
|
|
if table.flavor == "lattice" and kind in ["textedge"]:
|
|
raise NotImplementedError(
|
|
f"Lattice flavor does not support kind='{kind}'"
|
|
)
|
|
if table.flavor != "lattice" and kind in ["line"]:
|
|
raise NotImplementedError(
|
|
f"{table.flavor} flavor does not support kind='{kind}'"
|
|
)
|
|
|
|
plot_method = getattr(self, kind)
|
|
return plot_method(table, ax)
|
|
|
|
@staticmethod
|
|
def text(table, ax=None):
|
|
"""Generates a plot for all text elements present
|
|
on the PDF page.
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
ax = prepare_plot(table, ax)
|
|
draw_text(table, ax)
|
|
return ax.get_figure()
|
|
|
|
@staticmethod
|
|
def grid(table, ax=None):
|
|
"""Generates a plot for the detected table grids
|
|
on the PDF page.
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
ax = prepare_plot(table, ax)
|
|
for row in table.cells:
|
|
for cell in row:
|
|
if cell.left:
|
|
ax.plot([cell.lb[0], cell.lt[0]], [cell.lb[1], cell.lt[1]])
|
|
if cell.right:
|
|
ax.plot([cell.rb[0], cell.rt[0]], [cell.rb[1], cell.rt[1]])
|
|
if cell.top:
|
|
ax.plot([cell.lt[0], cell.rt[0]], [cell.lt[1], cell.rt[1]])
|
|
if cell.bottom:
|
|
ax.plot([cell.lb[0], cell.rb[0]], [cell.lb[1], cell.rb[1]])
|
|
return ax.get_figure()
|
|
|
|
@staticmethod
|
|
def contour(table, ax=None):
|
|
"""Generates a plot for all table boundaries present
|
|
on the PDF page.
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
_FOR_LATTICE = table.flavor == "lattice"
|
|
ax = prepare_plot(table, ax)
|
|
|
|
if not _FOR_LATTICE:
|
|
draw_text(table, ax)
|
|
|
|
ax.add_patch(
|
|
patches.Rectangle(
|
|
(table._bbox[0], table._bbox[1]),
|
|
table._bbox[2] - table._bbox[0],
|
|
table._bbox[3] - table._bbox[1],
|
|
fill=False, color="red"
|
|
)
|
|
)
|
|
if not _FOR_LATTICE:
|
|
extend_axe_lim(ax, table._bbox)
|
|
|
|
return ax.get_figure()
|
|
|
|
@staticmethod
|
|
def textedge(table, ax=None):
|
|
"""Generates a plot for relevant textedges.
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
ax = prepare_plot(table, ax)
|
|
draw_text(table, ax)
|
|
|
|
if table.flavor == "network":
|
|
for network in table.parse_details["network_searches"]:
|
|
most_connected_tl = network.most_connected_textline()
|
|
|
|
ax.add_patch(
|
|
patches.Rectangle(
|
|
(most_connected_tl.x0, most_connected_tl.y0),
|
|
most_connected_tl.x1 - most_connected_tl.x0,
|
|
most_connected_tl.y1 - most_connected_tl.y0,
|
|
color="red",
|
|
alpha=0.5
|
|
)
|
|
)
|
|
for tl in sorted(
|
|
network._textline_to_alignments.keys(),
|
|
key=lambda textline: (-textline.y0, textline.x0)
|
|
):
|
|
alignments = network._textline_to_alignments[tl]
|
|
coords = get_textline_coords(tl)
|
|
alignment_id_h, tls_h = alignments.max_v()
|
|
alignment_id_v, tls_v = alignments.max_h()
|
|
xs = list(map(lambda tl: tl.x0, tls_v))
|
|
ys = list(map(lambda tl: tl.y1, tls_h))
|
|
top_h = max(ys)
|
|
ax.text(
|
|
coords[alignment_id_h],
|
|
top_h + 5,
|
|
"{max_h_count}".format(max_h_count=len(tls_h)),
|
|
verticalalignment="bottom",
|
|
horizontalalignment="center",
|
|
fontsize=8,
|
|
color="green"
|
|
)
|
|
ax.plot(
|
|
[coords[alignment_id_h]] * len(ys), ys,
|
|
color="green",
|
|
linestyle="solid",
|
|
linewidth=1,
|
|
marker="o",
|
|
markersize=3
|
|
)
|
|
|
|
left_v = min(map(lambda tl: tl.x0, tls_v))
|
|
ax.text(
|
|
left_v - 5,
|
|
coords[alignment_id_v],
|
|
"{max_v_count}".format(max_v_count=len(tls_v)),
|
|
verticalalignment="center",
|
|
horizontalalignment="right",
|
|
fontsize=8,
|
|
color="blue"
|
|
)
|
|
ax.plot(
|
|
xs, [coords[alignment_id_v]] * len(xs),
|
|
color="blue",
|
|
linestyle="solid",
|
|
linewidth=1,
|
|
marker="o",
|
|
markersize=3
|
|
)
|
|
else:
|
|
for te in table._textedges:
|
|
ax.plot([te.coord, te.coord], [te.y0, te.y1])
|
|
return ax.get_figure()
|
|
|
|
@staticmethod
|
|
def joint(table, ax=None):
|
|
"""Generates a plot for all line intersections present
|
|
on the PDF page.
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
ax = prepare_plot(table, ax)
|
|
x_coord = []
|
|
y_coord = []
|
|
for coord in table.parse["joints"]:
|
|
x_coord.append(coord[0])
|
|
y_coord.append(coord[1])
|
|
ax.plot(x_coord, y_coord, "ro")
|
|
return ax.get_figure()
|
|
|
|
@staticmethod
|
|
def line(table, ax=None):
|
|
"""Generates a plot for all line segments present
|
|
on the PDF page.
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
ax = prepare_plot(table, ax)
|
|
vertical, horizontal = table._segments
|
|
for v in vertical:
|
|
ax.plot([v[0], v[2]], [v[1], v[3]])
|
|
for h in horizontal:
|
|
ax.plot([h[0], h[2]], [h[1], h[3]])
|
|
return ax.get_figure()
|
|
|
|
@staticmethod
|
|
def network_table_search(table, ax=None):
|
|
"""Generates a plot illustrating the steps of the network table search.
|
|
|
|
Parameters
|
|
----------
|
|
table : camelot.core.Table
|
|
ax : matplotlib.axes.Axes (optional)
|
|
|
|
Returns
|
|
-------
|
|
fig : matplotlib.fig.Figure
|
|
|
|
"""
|
|
ax = prepare_plot(table, ax)
|
|
if table.parse_details is None:
|
|
return ax.get_figure()
|
|
parse_details = table.parse_details
|
|
for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
|
|
max_h_gap = bbox_search["max_h_gap"]
|
|
max_v_gap = bbox_search["max_v_gap"]
|
|
iterations = bbox_search["iterations"]
|
|
for iteration, bbox in enumerate(iterations):
|
|
final = iteration == len(iterations) - 1
|
|
|
|
draw_labeled_bbox(
|
|
ax, bbox,
|
|
"t{box_id}/i{iteration}".format(
|
|
box_id=box_id,
|
|
iteration=iteration
|
|
),
|
|
color="red",
|
|
linewidth=5 if final else 2,
|
|
fontsize=12 if final else 8,
|
|
label_pos="bottom,left"
|
|
)
|
|
|
|
ax.add_patch(
|
|
patches.Rectangle(
|
|
(bbox[0]-max_h_gap, bbox[1]-max_v_gap),
|
|
bbox[2] - bbox[0] + 2 * max_h_gap,
|
|
bbox[3] - bbox[1] + 2 * max_v_gap,
|
|
color="orange",
|
|
fill=False
|
|
)
|
|
)
|
|
|
|
for box_id, col_search in enumerate(parse_details["col_searches"]):
|
|
draw_labeled_bbox(
|
|
ax, col_search["bbox_full"],
|
|
"box body + header #{box_id}".format(
|
|
box_id=box_id
|
|
),
|
|
color="red",
|
|
linewidth=4,
|
|
label_pos="top,left"
|
|
)
|
|
draw_labeled_bbox(
|
|
ax, col_search["bbox_body"],
|
|
"box body #{box_id}".format(
|
|
box_id=box_id
|
|
),
|
|
color="orange",
|
|
linewidth=2,
|
|
label_pos="bottom,left"
|
|
)
|
|
for col_anchor in col_search["cols_anchors"]:
|
|
# Display a green line at the col boundary line throughout the
|
|
# table bbox.
|
|
ax.plot(
|
|
[col_anchor, col_anchor],
|
|
[
|
|
col_search["bbox_body"][1] - 10,
|
|
col_search["bbox_body"][3] + 10,
|
|
],
|
|
color="green"
|
|
)
|
|
|
|
return ax.get_figure()
|