Further refactoring

pull/153/head
Frh 2020-04-24 21:11:31 -07:00
parent f42557ab8b
commit bb842f21b9
8 changed files with 430 additions and 699 deletions

View File

@ -15,8 +15,6 @@ from .utils import (
get_index_closest_point, get_index_closest_point,
get_textline_coords, get_textline_coords,
build_file_path_in_temp_dir, build_file_path_in_temp_dir,
compute_accuracy,
compute_whitespace,
export_pdf_as_png export_pdf_as_png
) )
@ -141,9 +139,9 @@ class TextAlignments(object):
def __init__(self, alignment_names): def __init__(self, alignment_names):
# For each possible alignment, list of tuples coordinate/textlines # For each possible alignment, list of tuples coordinate/textlines
self._textedges = {} self._text_alignments = {}
for alignment_name in alignment_names: for alignment_name in alignment_names:
self._textedges[alignment_name] = [] self._text_alignments[alignment_name] = []
@staticmethod @staticmethod
def _create_new_text_alignment(coord, textline, align): def _create_new_text_alignment(coord, textline, align):
@ -156,12 +154,12 @@ class TextAlignments(object):
"""Updates an existing text edge in the current dict. """Updates an existing text edge in the current dict.
""" """
coords = get_textline_coords(textline) coords = get_textline_coords(textline)
for alignment, edge_array in self._textedges.items(): for alignment_id, alignment_array in self._text_alignments.items():
coord = coords[alignment] coord = coords[alignment_id]
# Find the index of the closest existing element (or 0 if none) # Find the index of the closest existing element (or 0 if none)
idx_closest = get_index_closest_point( idx_closest = get_index_closest_point(
coord, edge_array, fn=lambda x: x.coord coord, alignment_array, fn=lambda x: x.coord
) )
# Check if the edges before/after are close enough # Check if the edges before/after are close enough
@ -169,17 +167,25 @@ class TextAlignments(object):
idx_insert = None idx_insert = None
if idx_closest is None: if idx_closest is None:
idx_insert = 0 idx_insert = 0
elif np.isclose(edge_array[idx_closest].coord, coord, atol=0.5): elif np.isclose(
self._update_edge(edge_array[idx_closest], coord, textline) alignment_array[idx_closest].coord,
elif edge_array[idx_closest].coord < coord: coord,
atol=0.5
):
self._update_edge(
alignment_array[idx_closest],
coord,
textline
)
elif alignment_array[idx_closest].coord < coord:
idx_insert = idx_closest + 1 idx_insert = idx_closest + 1
else: else:
idx_insert = idx_closest idx_insert = idx_closest
if idx_insert is not None: if idx_insert is not None:
new_edge = self._create_new_text_alignment( new_alignment = self._create_new_text_alignment(
coord, textline, alignment coord, textline, alignment_id
) )
edge_array.insert(idx_insert, new_edge) alignment_array.insert(idx_insert, new_alignment)
class TextEdges(TextAlignments): class TextEdges(TextAlignments):
@ -201,7 +207,7 @@ class TextEdges(TextAlignments):
"""Adds a new text edge to the current dict. """Adds a new text edge to the current dict.
""" """
te = self._create_new_text_alignment(coord, textline, align) te = self._create_new_text_alignment(coord, textline, align)
self._textedges[align].append(te) self._text_alignments[align].append(te)
def _update_edge(self, edge, coord, textline): def _update_edge(self, edge, coord, textline):
edge.update_coords(coord, textline, self.edge_tol) edge.update_coords(coord, textline, self.edge_tol)
@ -221,15 +227,15 @@ class TextEdges(TextAlignments):
""" """
intersections_sum = { intersections_sum = {
"left": sum( "left": sum(
len(te.textlines) for te in self._textedges["left"] len(te.textlines) for te in self._text_alignments["left"]
if te.is_valid if te.is_valid
), ),
"right": sum( "right": sum(
len(te.textlines) for te in self._textedges["right"] len(te.textlines) for te in self._text_alignments["right"]
if te.is_valid if te.is_valid
), ),
"middle": sum( "middle": sum(
len(te.textlines) for te in self._textedges["middle"] len(te.textlines) for te in self._text_alignments["middle"]
if te.is_valid if te.is_valid
), ),
} }
@ -240,7 +246,7 @@ class TextEdges(TextAlignments):
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0] relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
return list(filter( return list(filter(
lambda te: te.is_valid, lambda te: te.is_valid,
self._textedges[relevant_align]) self._text_alignments[relevant_align])
) )
def get_table_areas(self, textlines, relevant_textedges): def get_table_areas(self, textlines, relevant_textedges):
@ -443,9 +449,9 @@ class Table(object):
self.filename = None self.filename = None
self.order = None self.order = None
self.page = None self.page = None
self.flavor = None # Flavor of the parser that generated the table self.flavor = None # Flavor of the parser used
self.pdf_size = None # Dimensions of the original PDF page self.pdf_size = None # Dimensions of the original PDF page
self.debug_info = None # Field holding debug data self.parse_details = None # Field holding debug data
self._image = None self._image = None
self._image_path = None # Temporary file to hold an image of the pdf self._image_path = None # Temporary file to hold an image of the pdf
@ -485,31 +491,6 @@ class Table(object):
} }
return report return report
def record_parse_metadata(self, parser):
    """Store on this table the details of the parser run that built it.

    Parameters
    ----------
    parser : object
        Parser that generated the table. Its ``id``, ``filename``,
        ``debug_info``, ``copy_text``, ``pdf_width``, ``pdf_height``,
        ``horizontal_text`` and ``vertical_text`` attributes are read,
        and its ``compute_parse_errors`` method is called with this
        table.

    """
    self.flavor = parser.id
    self.filename = parser.filename
    self.debug_info = parser.debug_info
    self.accuracy = compute_accuracy(
        [[100, parser.compute_parse_errors(self)]]
    )

    # Spanning text is copied before the cell data is read below.
    if parser.copy_text is not None:
        self.copy_spanning_text(parser.copy_text)

    cell_data = self.data
    self.df = pd.DataFrame(cell_data)
    self.shape = self.df.shape
    self.whitespace = compute_whitespace(cell_data)
    self.pdf_size = (parser.pdf_width, parser.pdf_height)

    # Bounding boxes of every textline, horizontal ones first.
    self._text = [
        (t.x0, t.y0, t.x1, t.y1)
        for textlines in (parser.horizontal_text, parser.vertical_text)
        for t in textlines
    ]
def get_pdf_image(self): def get_pdf_image(self):
"""Compute pdf image and cache it """Compute pdf image and cache it
""" """

View File

@ -3,11 +3,18 @@
import os import os
import warnings import warnings
import numpy as np
import pandas as pd
from ..utils import ( from ..utils import (
bbox_from_str,
bbox_from_textlines,
compute_accuracy,
compute_whitespace,
get_text_objects, get_text_objects,
get_table_index, get_table_index,
text_in_bbox, text_in_bbox,
bbox_from_str, text_in_bbox_per_axis,
) )
from ..core import Table from ..core import Table
@ -42,7 +49,7 @@ class BaseParser(object):
self.t_bbox = None self.t_bbox = None
# For plotting details of parsing algorithms # For plotting details of parsing algorithms
self.debug_info = {} if debug else None self.parse_details = {} if debug else None
def prepare_page_parse(self, filename, layout, dimensions, def prepare_page_parse(self, filename, layout, dimensions,
page_idx, layout_kwargs): page_idx, layout_kwargs):
@ -63,9 +70,9 @@ class BaseParser(object):
self.pdf_width, self.pdf_height = self.dimensions self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename) self.rootname, __ = os.path.splitext(self.filename)
if self.debug_info is not None: if self.parse_details is not None:
self.debug_info["table_regions"] = self.table_regions self.parse_details["table_regions"] = self.table_regions
self.debug_info["table_areas"] = self.table_areas self.parse_details["table_areas"] = self.table_areas
def _apply_regions_filter(self, textlines): def _apply_regions_filter(self, textlines):
"""If regions have been specified, filter textlines to these regions. """If regions have been specified, filter textlines to these regions.
@ -194,6 +201,31 @@ class BaseParser(object):
return _tables return _tables
def record_parse_metadata(self, table):
    """Store on a table the details of the parser run that built it.

    Parameters
    ----------
    table : object
        Table produced by this parser. Its ``flavor``, ``filename``,
        ``parse_details``, ``accuracy``, ``df``, ``shape``,
        ``whitespace``, ``pdf_size`` and ``_text`` attributes are set.

    """
    table.flavor = self.id
    table.filename = self.filename
    table.parse_details = self.parse_details
    table.accuracy = compute_accuracy(
        [[100, self.compute_parse_errors(table)]]
    )

    # Spanning text is copied before the cell data is read below.
    if self.copy_text is not None:
        table.copy_spanning_text(self.copy_text)

    cell_data = table.data
    table.df = pd.DataFrame(cell_data)
    table.shape = table.df.shape
    table.whitespace = compute_whitespace(cell_data)
    table.pdf_size = (self.pdf_width, self.pdf_height)

    # Bounding boxes of every textline, horizontal ones first.
    table._text = [
        (t.x0, t.y0, t.x1, t.y1)
        for textlines in (self.horizontal_text, self.vertical_text)
        for t in textlines
    ]
class TextBaseParser(BaseParser): class TextBaseParser(BaseParser):
"""Base class for all text parsers. """Base class for all text parsers.
@ -211,15 +243,17 @@ class TextBaseParser(BaseParser):
edge_tol=50, edge_tol=50,
row_tol=2, row_tol=2,
column_tol=0, column_tol=0,
debug=False,
**kwargs **kwargs
): ):
super().__init__( super().__init__(
"stream", parser_id,
table_regions=table_regions, table_regions=table_regions,
table_areas=table_areas, table_areas=table_areas,
split_text=split_text, split_text=split_text,
strip_text=strip_text, strip_text=strip_text,
flag_size=flag_size, flag_size=flag_size,
debug=debug,
) )
self.columns = columns self.columns = columns
self._validate_columns() self._validate_columns()
@ -227,4 +261,271 @@ class TextBaseParser(BaseParser):
self.row_tol = row_tol self.row_tol = row_tol
self.column_tol = column_tol self.column_tol = column_tol
self.textedges = None @staticmethod
def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically
within a tolerance.
Parameters
----------
text : list
List of PDFMiner text objects.
row_tol : int, optional (default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = None
rows = []
temp = []
non_empty_text = [t for t in text if t.get_text().strip()]
for t in non_empty_text:
# is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright \
# for obj in t._objs
# if type(obj) is LTChar]):
if row_y is None:
row_y = t.y0
elif not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
# We update the row's bottom as we go, to be forgiving if there
# is a gradual change across multiple columns.
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
return rows
@staticmethod
def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap
or lie within a tolerance.
Parameters
----------
l : list
List of column x-coordinate tuples.
column_tol : int, optional (default: 0)
Returns
-------
merged : list
List of merged column x-coordinate tuples.
"""
merged = []
for higher in l:
if not merged:
merged.append(higher)
else:
lower = merged[-1]
if column_tol >= 0:
if higher[0] <= lower[1] or np.isclose(
higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
elif column_tol < 0:
if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1],
atol=abs(column_tol)):
merged.append(higher)
else:
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
return merged
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous. For the row to "touch"
we split the existing gap between them in half.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row y-coordinate tuples.
"""
row_boundaries = [
[
max(t.y1 for t in r),
min(t.y0 for t in r)
]
for r in rows_grouped
]
for i in range(0, len(row_boundaries)-1):
top_row = row_boundaries[i]
bottom_row = row_boundaries[i+1]
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
row_boundaries[0][0] = text_y_max
row_boundaries[-1][1] = text_y_min
return row_boundaries
@staticmethod
def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text : list
List of PDFMiner text objects.
ytol : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
if text:
text = TextBaseParser._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text]
new_cols = [
(t.x0, t.x1)
for r in text if len(r) == max(elements)
for t in r
]
cols.extend(TextBaseParser._merge_columns(sorted(new_cols)))
return cols
@staticmethod
def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text_x_min : int
text_y_max : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
return cols
def _validate_columns(self):
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns"
" should be equal")
def _generate_columns_and_rows(self, bbox, table_idx):
"""Compute the column and row boundaries for one table area.

The text found in ``bbox`` is stored in ``self.t_bbox``, grouped into
rows, and the columns are either taken from the user-supplied
``self.columns`` entry or guessed from the text layout.

Parameters
----------
bbox
Table area, handed as-is to ``text_in_bbox_per_axis``.
table_idx : int
Index of the table; selects the matching ``self.columns`` entry
and is reported (1-based) in the warning message.

Returns
-------
tuple
``(cols, rows, None, None)``: the column x-coordinate tuples and
the row y-coordinates, followed by two ``None`` placeholders.
"""
# select elements which lie within table_bbox
self.t_bbox = text_in_bbox_per_axis(
bbox,
self.horizontal_text,
self.vertical_text
)
# Extent of all the text selected, horizontal and vertical together.
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
)
# Rows are derived from the horizontal text only.
rows_grouped = self._group_rows(
self.t_bbox["horizontal"], row_tol=self.row_tol)
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
# Number of text elements found in each row.
elements = [len(r) for r in rows_grouped]
if self.columns is not None and self.columns[table_idx] != "":
# user has to input boundary columns too
# take (0, pdf_width) by default
# similar to else condition
# len can't be 1
# The comma-separated separators given by the user are bracketed
# by the text extent, then paired into (left, right) tuples.
cols = self.columns[table_idx].split(",")
cols = [float(c) for c in cols]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
else:
# calculate mode of the list of number of elements in
# each row to guess the number of columns
ncols = max(set(elements), key=elements.count)
if ncols == 1:
# if the mode is 1, the page usually contains no tables,
# but there can be cases where the list is skewed:
# try to remove all 1s from the list in this case and
# see if the list still contains elements; if yes, then use
# the mode after removing the 1s
elements = list(filter(lambda x: x != 1, elements))
if elements:
ncols = max(set(elements), key=elements.count)
else:
warnings.warn(
"No tables found in table area {}"
.format(table_idx + 1)
)
# Column candidates come from the rows holding exactly ncols elements.
cols = [
(t.x0, t.x1)
for r in rows_grouped
if len(r) == ncols
for t in r
]
cols = self._merge_columns(
sorted(cols),
column_tol=self.column_tol
)
# Collect the text lying strictly inside the gap between two
# consecutive columns...
inner_text = []
for i in range(1, len(cols)):
left = cols[i - 1][1]
right = cols[i][0]
inner_text.extend(
[
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > left and t.x1 < right
]
)
# ...and the text lying entirely outside the outermost columns.
outer_text = [
t
for direction in self.t_bbox
for t in self.t_bbox[direction]
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
]
inner_text.extend(outer_text)
# That leftover text may contribute extra columns, before the columns
# are made contiguous across the text extent.
cols = self._add_columns(cols, inner_text, self.row_tol)
cols = self._join_columns(cols, text_x_min, text_x_max)
return cols, rows, None, None
def record_parse_metadata(self, table):
"""Record data about the origin of the table.

Runs the parent implementation, then sets the attributes used for
plotting: ``table._bbox`` receives this parser's ``table_bbox`` and
``table._segments`` is set to ``None``.
"""
super().record_parse_metadata(table)
# for plotting
table._bbox = self.table_bbox
table._segments = None
def _generate_table(self, table_idx, cols, rows, **kwargs):
table = self._initialize_new_table(table_idx, cols, rows)
table = table.set_all_edges()
self.record_parse_metadata(table)
return table

View File

@ -5,7 +5,6 @@ from __future__ import division
import numpy as np import numpy as np
import copy import copy
import warnings
from .base import TextBaseParser from .base import TextBaseParser
from ..core import ( from ..core import (
@ -17,7 +16,6 @@ from ..core import (
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
text_in_bbox, text_in_bbox,
text_in_bbox_per_axis,
bbox_from_textlines, bbox_from_textlines,
distance_tl_to_bbox, distance_tl_to_bbox,
find_columns_coordinates find_columns_coordinates
@ -142,11 +140,11 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
class AlignmentCounter(object): class AlignmentCounter(object):
""" """
Represents all textlines aligned with a textline for each alignment. For a given textline, represent all other textlines aligned with it.
A textline can be vertically aligned with others by having matching left, A textline can be vertically aligned with others if their bbox match on
right, or middle edge, and horizontally aligned by having matching top, left, right, or middle coord, and horizontally aligned if they match top,
bottom, or center edge. bottom, or center coord.
""" """
@ -210,15 +208,15 @@ class AlignmentCounter(object):
class TextNetworks(TextAlignments): class TextNetworks(TextAlignments):
"""Text elements connected via both vertical (top, bottom, middle) and """Text elements connected by vertical AND horizontal alignments.
horizontal (left, right, and middle) alignments found on the PDF page.
The alignment dict has six keys based on the hor/vert alignments, The alignment dict has six keys based on the hor/vert alignments,
and each key's value is a list of camelot.core.TextAlignment objects. and each key's value is a list of camelot.core.TextAlignment objects.
""" """
def __init__(self): def __init__(self):
super().__init__(ALL_ALIGNMENTS) super().__init__(ALL_ALIGNMENTS)
# For each textline, dictionary "edge type" to # For each textline, dictionary "alignment type" to
# "number of textlines aligned" # "number of textlines aligned"
self._textlines_alignments = {} self._textlines_alignments = {}
@ -226,10 +224,10 @@ class TextNetworks(TextAlignments):
edge.register_aligned_textline(textline, coord) edge.register_aligned_textline(textline, coord)
def _register_all_text_lines(self, textlines): def _register_all_text_lines(self, textlines):
"""Add all textlines to our edge repository to """Add all textlines to our network repository to
identify alignments. identify alignments.
""" """
# Identify all the edge alignments # Identify all the alignments
for tl in textlines: for tl in textlines:
if len(tl.get_text().strip()) > 0: if len(tl.get_text().strip()) > 0:
self._register_textline(tl) self._register_textline(tl)
@ -237,7 +235,7 @@ class TextNetworks(TextAlignments):
def _compute_alignment_counts(self): def _compute_alignment_counts(self):
"""Build a dictionary textline -> alignment object. """Build a dictionary textline -> alignment object.
""" """
for align_id, textedges in self._textedges.items(): for align_id, textedges in self._text_alignments.items():
for textedge in textedges: for textedge in textedges:
for textline in textedge.textlines: for textline in textedge.textlines:
alignments = self._textlines_alignments.get( alignments = self._textlines_alignments.get(
@ -254,8 +252,8 @@ class TextNetworks(TextAlignments):
the core table. the core table.
""" """
h_gaps, v_gaps = [], [] h_gaps, v_gaps = [], []
for align_id in self._textedges: for align_id in self._text_alignments:
edge_array = self._textedges[align_id] edge_array = self._text_alignments[align_id]
gaps = [] gaps = []
vertical = align_id in HORIZONTAL_ALIGNMENTS vertical = align_id in HORIZONTAL_ALIGNMENTS
sort_function = (lambda tl: tl.y0) \ sort_function = (lambda tl: tl.y0) \
@ -299,7 +297,7 @@ class TextNetworks(TextAlignments):
removed_singletons = True removed_singletons = True
while removed_singletons: while removed_singletons:
removed_singletons = False removed_singletons = False
for alignment_id, textalignments in self._textedges.items(): for alignment_id, textalignments in self._text_alignments.items():
# For each alignment edge, remove items if they are singletons # For each alignment edge, remove items if they are singletons
# either horizontally or vertically # either horizontally or vertically
for ta in textalignments: for ta in textalignments:
@ -313,7 +311,7 @@ class TextNetworks(TextAlignments):
self._textlines_alignments = {} self._textlines_alignments = {}
self._compute_alignment_counts() self._compute_alignment_counts()
def _most_connected_textline(self): def most_connected_textline(self):
""" Retrieve the textline that is most connected across vertical and """ Retrieve the textline that is most connected across vertical and
horizontal axis. horizontal axis.
@ -340,7 +338,7 @@ class TextNetworks(TextAlignments):
# alignments across horizontal and vertical axis. # alignments across horizontal and vertical axis.
# It will serve as a reference axis along which to collect the average # It will serve as a reference axis along which to collect the average
# spacing between rows/cols. # spacing between rows/cols.
most_aligned_tl = self._most_connected_textline() most_aligned_tl = self.most_connected_textline()
if most_aligned_tl is None: if most_aligned_tl is None:
return None return None
@ -378,7 +376,7 @@ class TextNetworks(TextAlignments):
) )
return gaps_hv return gaps_hv
def _build_bbox_candidate(self, gaps_hv, debug_info=None): def _build_bbox_candidate(self, gaps_hv, parse_details=None):
""" Seed the process with the textline with the highest alignment """ Seed the process with the textline with the highest alignment
score, then expand the bbox with textlines within threshold. score, then expand the bbox with textlines within threshold.
@ -387,7 +385,7 @@ class TextNetworks(TextAlignments):
gaps_hv : tuple gaps_hv : tuple
The maximum distance allowed to consider surrounding lines/columns The maximum distance allowed to consider surrounding lines/columns
as part of the same table. as part of the same table.
debug_info : array (optional) parse_details : array (optional)
Optional parameter array, in which to store extra information Optional parameter array, in which to store extra information
to help later visualization of the table creation. to help later visualization of the table creation.
""" """
@ -396,23 +394,23 @@ class TextNetworks(TextAlignments):
# It will serve both as a starting point for the table boundary # It will serve both as a starting point for the table boundary
# search, and as a way to estimate the average spacing between # search, and as a way to estimate the average spacing between
# rows/cols. # rows/cols.
most_aligned_tl = self._most_connected_textline() most_aligned_tl = self.most_connected_textline()
# Calculate the 75th percentile of the horizontal/vertical # Calculate the 75th percentile of the horizontal/vertical
# gaps between textlines. Use this as a reference for a threshold # gaps between textlines. Use this as a reference for a threshold
# to not exceed while looking for table boundaries. # to not exceed while looking for table boundaries.
max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1] max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
if debug_info is not None: if parse_details is not None:
# Store debug info # Store debug info
debug_info_search = { parse_details_search = {
"max_h_gap": max_h_gap, "max_h_gap": max_h_gap,
"max_v_gap": max_v_gap, "max_v_gap": max_v_gap,
"iterations": [] "iterations": []
} }
debug_info.append(debug_info_search) parse_details.append(parse_details_search)
else: else:
debug_info_search = None parse_details_search = None
MINIMUM_TEXTLINES_IN_TABLE = 6 MINIMUM_TEXTLINES_IN_TABLE = 6
bbox = (most_aligned_tl.x0, most_aligned_tl.y0, bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
@ -426,9 +424,9 @@ class TextNetworks(TextAlignments):
tls_in_bbox = [most_aligned_tl] tls_in_bbox = [most_aligned_tl]
last_bbox = None last_bbox = None
while last_bbox != bbox: while last_bbox != bbox:
if debug_info_search is not None: if parse_details_search is not None:
# Store debug info # Store debug info
debug_info_search["iterations"].append(bbox) parse_details_search["iterations"].append(bbox)
last_bbox = bbox last_bbox = bbox
# Go through all remaining textlines, expand our bbox # Go through all remaining textlines, expand our bbox
@ -461,35 +459,6 @@ class TextNetworks(TextAlignments):
self._register_all_text_lines(textlines) self._register_all_text_lines(textlines)
self._compute_alignment_counts() self._compute_alignment_counts()
def plot_alignments(self, ax):
    """Displays a visualization of the alignments as currently computed.

    The textline with the highest alignment score is highlighted with
    a translucent red rectangle, and every textline is labelled with
    its ``<max_h_count>x<max_v_count>`` alignment counts. Nothing is
    drawn when no textline alignment has been computed.

    Parameters
    ----------
    ax : object
        Drawing target; its ``add_patch`` and ``text`` methods are
        called (presumably a matplotlib axes -- confirm with callers).

    """
    # FRHTODO: This is too busy and doesn't plot lines
    if not self._textlines_alignments:
        # No textline registered: there is nothing to highlight or label.
        return
    # max() returns the same textline as sorting by descending score
    # and taking the first item, without sorting the whole collection.
    most_aligned_tl = max(
        self._textlines_alignments,
        key=lambda textline:
            self._textlines_alignments[textline].alignment_score(),
    )
    # NOTE(review): `patches` is expected to be available at module
    # level (presumably matplotlib.patches) -- confirm the import.
    ax.add_patch(
        patches.Rectangle(
            (most_aligned_tl.x0, most_aligned_tl.y0),
            most_aligned_tl.x1 - most_aligned_tl.x0,
            most_aligned_tl.y1 - most_aligned_tl.y0,
            color="red",
            alpha=0.5
        )
    )
    for tl, alignments in self._textlines_alignments.items():
        ax.text(
            tl.x0 - 5,
            tl.y0 - 5,
            f"{alignments.max_h_count()}x{alignments.max_v_count()}",
            fontsize=5,
            color="black"
        )
class Hybrid(TextBaseParser): class Hybrid(TextBaseParser):
"""Hybrid method of parsing looks for spaces between text """Hybrid method of parsing looks for spaces between text
@ -555,190 +524,9 @@ class Hybrid(TextBaseParser):
edge_tol=edge_tol, edge_tol=edge_tol,
row_tol=row_tol, row_tol=row_tol,
column_tol=column_tol, column_tol=column_tol,
debug=debug,
) )
# FRHTODO: Check if needed, refactor with Stream
@staticmethod
def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically
within a tolerance.
Parameters
----------
text : list
List of PDFMiner text objects.
row_tol : int, optional (default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = None
rows = []
temp = []
non_empty_text = [t for t in text if t.get_text().strip()]
for t in non_empty_text:
# is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright \
# for obj in t._objs
# if type(obj) is LTChar]):
if row_y is None:
row_y = t.y0
elif not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
# We update the row's bottom as we go, to be forgiving if there
# is a gradual change across multiple columns.
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
return rows
# FRHTODO: Check if needed, refactor with Stream
@staticmethod
def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap
or lie within a tolerance.
Parameters
----------
l : list
List of column x-coordinate tuples.
column_tol : int, optional (default: 0)
Returns
-------
merged : list
List of merged column x-coordinate tuples.
"""
merged = []
for higher in l:
if not merged:
merged.append(higher)
else:
lower = merged[-1]
if column_tol >= 0:
if higher[0] <= lower[1] or np.isclose(
higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
elif column_tol < 0:
if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1],
atol=abs(column_tol)):
merged.append(higher)
else:
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
return merged
# FRHTODO: Check if needed, refactor with Stream
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous. For the row to "touch"
we split the existing gap between them in half.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row y-coordinate tuples.
"""
row_boundaries = [
[
max(t.y1 for t in r),
min(t.y0 for t in r)
]
for r in rows_grouped
]
for i in range(0, len(row_boundaries)-1):
top_row = row_boundaries[i]
bottom_row = row_boundaries[i+1]
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
row_boundaries[0][0] = text_y_max
row_boundaries[-1][1] = text_y_min
return row_boundaries
# FRHTODO: Check if needed, refactor with Stream
@staticmethod
def _add_columns(cols, text, row_tol):
"""Add columns to existing list by taking into account
the text that lies outside the current column x-coordinates.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text : list
List of PDFMiner text objects.
ytol : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
if text:
text = Hybrid._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text]
new_cols = [
(t.x0, t.x1)
for r in text if len(r) == max(elements)
for t in r
]
cols.extend(Hybrid._merge_columns(sorted(new_cols)))
return cols
# FRHTODO: Check if needed, refactor with Stream
@staticmethod
def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text_x_min : int
text_y_max : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
return cols
# FRHTODO: Check if needed, refactor with Stream
def _validate_columns(self):
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns"
" should be equal")
def _generate_table_bbox(self): def _generate_table_bbox(self):
if self.table_areas is not None: if self.table_areas is not None:
table_bbox = {} table_bbox = {}
@ -756,25 +544,21 @@ class Hybrid(TextBaseParser):
textlines_processed = {} textlines_processed = {}
self.table_bbox = {} self.table_bbox = {}
if self.debug_info is not None: if self.parse_details is not None:
debug_info_edges_searches = [] parse_details_network_searches = []
self.debug_info["edges_searches"] = debug_info_edges_searches self.parse_details["network_searches"] = \
debug_info_bboxes_searches = [] parse_details_network_searches
self.debug_info["bboxes_searches"] = debug_info_bboxes_searches parse_details_bbox_searches = []
self.parse_details["bbox_searches"] = parse_details_bbox_searches
else: else:
debug_info_edges_searches = None parse_details_network_searches = None
debug_info_bboxes_searches = None parse_details_bbox_searches = None
while True: while True:
self.textedges = TextNetworks() text_network = TextNetworks()
self.textedges.generate(textlines) text_network.generate(textlines)
self.textedges._remove_unconnected_edges() text_network._remove_unconnected_edges()
if debug_info_edges_searches is not None: gaps_hv = text_network._compute_plausible_gaps()
# Preserve the current edge calculation for display debugging
debug_info_edges_searches.append(
copy.deepcopy(self.textedges)
)
gaps_hv = self.textedges._compute_plausible_gaps()
if gaps_hv is None: if gaps_hv is None:
return None return None
# edge_tol instructions override the calculated vertical gap # edge_tol instructions override the calculated vertical gap
@ -782,13 +566,19 @@ class Hybrid(TextBaseParser):
gaps_hv[0], gaps_hv[0],
gaps_hv[1] if self.edge_tol is None else self.edge_tol gaps_hv[1] if self.edge_tol is None else self.edge_tol
) )
bbox = self.textedges._build_bbox_candidate( bbox = text_network._build_bbox_candidate(
edge_tol_hv, edge_tol_hv,
debug_info=debug_info_bboxes_searches parse_details=parse_details_bbox_searches
) )
if bbox is None: if bbox is None:
break break
if parse_details_network_searches is not None:
# Preserve the current edge calculation for display debugging
parse_details_network_searches.append(
copy.deepcopy(text_network)
)
# Get all the textlines that are at least 50% in the box # Get all the textlines that are at least 50% in the box
tls_in_bbox = text_in_bbox(bbox, textlines) tls_in_bbox = text_in_bbox(bbox, textlines)
@ -808,10 +598,10 @@ class Hybrid(TextBaseParser):
gaps_hv[1] gaps_hv[1]
) )
if self.debug_info is not None: if self.parse_details is not None:
if "col_searches" not in self.debug_info: if "col_searches" not in self.parse_details:
self.debug_info["col_searches"] = [] self.parse_details["col_searches"] = []
self.debug_info["col_searches"].append({ self.parse_details["col_searches"].append({
"core_bbox": bbox, "core_bbox": bbox,
"cols_anchors": cols_anchors, "cols_anchors": cols_anchors,
"expanded_bbox": expanded_bbox "expanded_bbox": expanded_bbox
@ -826,95 +616,3 @@ class Hybrid(TextBaseParser):
lambda tl: tl not in textlines_processed, lambda tl: tl not in textlines_processed,
textlines textlines
)) ))
# FRHTODO: Check is needed, refactor with Stream
def _generate_columns_and_rows(self, bbox, table_idx):
    """Compute the column and row boundaries of the table inside a bbox.

    The textlines found within ``bbox`` are stored in ``self.t_bbox``.
    Rows come from grouping the horizontal textlines with
    ``self.row_tol``. Columns come from ``self.columns[table_idx]`` when
    that entry is a non-empty string; otherwise they are guessed from the
    rows whose number of textlines is the most frequent one.

    Parameters
    ----------
    bbox : tuple
        Bounding box handed to ``text_in_bbox_per_axis``.
    table_idx : int
        Index used to look up ``self.columns`` and to label the warning.

    Returns
    -------
    cols : list
        List of column x-coordinate tuples.
    rows : list
        Row boundaries as returned by ``self._join_rows``.
    None
        Placeholder, always ``None``.
    None
        Placeholder, always ``None``.

    """
    # Keep only the elements which lie within the table bbox.
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )
    textlines_in_bbox = self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        textlines_in_bbox
    )

    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol
    )
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

    if self.columns is not None and self.columns[table_idx] != "":
        # The comma-separated values are used as separators; the x extent
        # of the text supplies the two outermost boundaries.
        separators = [float(c) for c in self.columns[table_idx].split(",")]
        boundaries = [text_x_min] + separators + [text_x_max]
        cols = list(zip(boundaries, boundaries[1:]))
        return cols, rows, None, None

    # The most frequent number of textlines per row is the guess for the
    # number of columns.
    row_sizes = [len(row) for row in rows_grouped]
    ncols = max(set(row_sizes), key=row_sizes.count)
    if ncols == 1:
        # A mode of 1 usually means there is no table, but the list can
        # be skewed: retry with the single-element rows left out.
        row_sizes = [size for size in row_sizes if size != 1]
        if row_sizes:
            ncols = max(set(row_sizes), key=row_sizes.count)
        else:
            warnings.warn(
                "No tables found in table area {}"
                .format(table_idx + 1)
            )

    anchors = sorted(
        (t.x0, t.x1)
        for row in rows_grouped
        if len(row) == ncols
        for t in row
    )
    cols = self._merge_columns(anchors, column_tol=self.column_tol)

    # Textlines lying strictly between two consecutive columns...
    stray_text = []
    for left_col, right_col in zip(cols, cols[1:]):
        gap_start = left_col[1]
        gap_end = right_col[0]
        for direction in self.t_bbox:
            for t in self.t_bbox[direction]:
                if t.x0 > gap_start and t.x1 < gap_end:
                    stray_text.append(t)
    # ... followed by the ones lying outside the first or last column.
    stray_text.extend(
        t
        for direction in self.t_bbox
        for t in self.t_bbox[direction]
        if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
    )

    cols = self._add_columns(cols, stray_text, self.row_tol)
    cols = self._join_columns(cols, text_x_min, text_x_max)
    return cols, rows, None, None
# FRHTODO: Check is needed, refactor with Stream
def _generate_table(self, table_idx, cols, rows, **kwargs):
    """Create the table for the given columns and rows.

    Parameters
    ----------
    table_idx : int
        Index forwarded to ``self._initialize_new_table``.
    cols : list
        Column boundaries forwarded to ``self._initialize_new_table``.
    rows : list
        Row boundaries forwarded to ``self._initialize_new_table``.
    **kwargs
        Accepted but not used here.

    Returns
    -------
    table
        The result of ``set_all_edges()`` on the new table, with the
        parse metadata and plotting attributes attached.

    """
    blank_table = self._initialize_new_table(table_idx, cols, rows)
    table = blank_table.set_all_edges()
    table.record_parse_metadata(self)

    # Extra attributes kept on the table for plotting.
    table._bbox = self.table_bbox
    table._segments = None
    table._textedges = self.textedges
    return table

View File

@ -168,6 +168,15 @@ class Lattice(BaseParser):
indices.append((r_idx, c_idx, text)) indices.append((r_idx, c_idx, text))
return indices return indices
def record_parse_metadata(self, table):
    """Record data about the origin of the table.

    Calls the parent implementation, then attaches to ``table`` the
    objects kept for plotting: the pdf image, the unscaled table bboxes
    and the (vertical, horizontal) segments.

    Parameters
    ----------
    table
        Table object receiving the metadata.

    """
    super().record_parse_metadata(table)

    # Attributes kept for plotting.
    table._image = self.pdf_image  # Reuse the image used for calc
    table._bbox_unscaled = self.table_bbox_unscaled
    vertical, horizontal = self.vertical_segments, self.horizontal_segments
    table._segments = (vertical, horizontal)
def _generate_table_bbox(self): def _generate_table_bbox(self):
def scale_areas(areas): def scale_areas(areas):
scaled_areas = [] scaled_areas = []
@ -293,12 +302,5 @@ class Lattice(BaseParser):
# set spanning cells to True # set spanning cells to True
table = table.set_span() table = table.set_span()
table.record_parse_metadata(self) self.record_parse_metadata(table)
# for plotting
table._image = self.pdf_image # Reuse the image used for calc
table._bbox_unscaled = self.table_bbox_unscaled
table._segments = (self.vertical_segments, self.horizontal_segments)
table._textedges = None
return table return table

View File

@ -1,17 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import division from __future__ import division
import warnings
import numpy as np
from .base import TextBaseParser from .base import TextBaseParser
from ..core import TextEdges from ..core import TextEdges
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
bbox_from_textlines, text_in_bbox
text_in_bbox,
text_in_bbox_per_axis
) )
@ -79,182 +74,7 @@ class Stream(TextBaseParser):
row_tol=row_tol, row_tol=row_tol,
column_tol=column_tol, column_tol=column_tol,
) )
self.textedges = []
@staticmethod
def _group_rows(text, row_tol=2):
"""Groups PDFMiner text objects into rows vertically
within a tolerance.
Parameters
----------
text : list
List of PDFMiner text objects.
row_tol : int, optional (default: 2)
Returns
-------
rows : list
Two-dimensional list of text objects grouped into rows.
"""
row_y = None
rows = []
temp = []
non_empty_text = [t for t in text if t.get_text().strip()]
for t in non_empty_text:
# is checking for upright necessary?
# if t.get_text().strip() and all([obj.upright \
# for obj in t._objs
# if type(obj) is LTChar]):
if row_y is None:
row_y = t.y0
elif not np.isclose(row_y, t.y0, atol=row_tol):
rows.append(sorted(temp, key=lambda t: t.x0))
temp = []
# We update the row's bottom as we go, to be forgiving if there
# is a gradual change across multiple columns.
row_y = t.y0
temp.append(t)
rows.append(sorted(temp, key=lambda t: t.x0))
return rows
@staticmethod
def _merge_columns(l, column_tol=0):
"""Merges column boundaries horizontally if they overlap
or lie within a tolerance.
Parameters
----------
l : list
List of column x-coordinate tuples.
column_tol : int, optional (default: 0)
Returns
-------
merged : list
List of merged column x-coordinate tuples.
"""
merged = []
for higher in l:
if not merged:
merged.append(higher)
else:
lower = merged[-1]
if column_tol >= 0:
if higher[0] <= lower[1] or np.isclose(
higher[0], lower[1], atol=column_tol
):
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
elif column_tol < 0:
if higher[0] <= lower[1]:
if np.isclose(higher[0], lower[1],
atol=abs(column_tol)):
merged.append(higher)
else:
upper_bound = max(lower[1], higher[1])
lower_bound = min(lower[0], higher[0])
merged[-1] = (lower_bound, upper_bound)
else:
merged.append(higher)
return merged
@staticmethod
def _join_rows(rows_grouped, text_y_max, text_y_min):
"""Makes row coordinates continuous. For the row to "touch"
we split the existing gap between them in half.
Parameters
----------
rows_grouped : list
Two-dimensional list of text objects grouped into rows.
text_y_max : int
text_y_min : int
Returns
-------
rows : list
List of continuous row y-coordinate tuples.
"""
row_boundaries = [
[
max(t.y1 for t in r),
min(t.y0 for t in r)
]
for r in rows_grouped
]
for i in range(0, len(row_boundaries)-1):
top_row = row_boundaries[i]
bottom_row = row_boundaries[i+1]
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
row_boundaries[0][0] = text_y_max
row_boundaries[-1][1] = text_y_min
return row_boundaries
@staticmethod
def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account
the text that lies outside the current column x-coordinates.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text : list
List of PDFMiner text objects.
ytol : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
if text:
text = Stream._group_rows(text, row_tol=row_tol)
elements = [len(r) for r in text]
new_cols = [
(t.x0, t.x1)
for r in text if len(r) == max(elements)
for t in r
]
cols.extend(Stream._merge_columns(sorted(new_cols)))
return cols
@staticmethod
def _join_columns(cols, text_x_min, text_x_max):
"""Makes column coordinates continuous.
Parameters
----------
cols : list
List of column x-coordinate tuples.
text_x_min : int
text_y_max : int
Returns
-------
cols : list
Updated list of column x-coordinate tuples.
"""
cols = sorted(cols)
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
cols.insert(0, text_x_min)
cols.append(text_x_max)
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
return cols
def _validate_columns(self):
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns"
" should be equal")
def _nurminen_table_detection(self, textlines): def _nurminen_table_detection(self, textlines):
"""A general implementation of the table detection algorithm """A general implementation of the table detection algorithm
@ -281,8 +101,13 @@ class Stream(TextBaseParser):
return table_bbox return table_bbox
def record_parse_metadata(self, table):
    """Record data about the origin of the table.

    Calls the parent implementation, then attaches this parser's
    ``textedges`` to the table as ``_textedges``.

    Parameters
    ----------
    table
        Table object receiving the metadata.

    """
    super().record_parse_metadata(table)
    text_edges = self.textedges
    table._textedges = text_edges
def _generate_table_bbox(self): def _generate_table_bbox(self):
self.textedges = []
if self.table_areas is None: if self.table_areas is None:
hor_text = self.horizontal_text hor_text = self.horizontal_text
if self.table_regions is not None: if self.table_regions is not None:
@ -300,93 +125,3 @@ class Stream(TextBaseParser):
for area_str in self.table_areas: for area_str in self.table_areas:
table_bbox[bbox_from_str(area_str)] = None table_bbox[bbox_from_str(area_str)] = None
self.table_bbox = table_bbox self.table_bbox = table_bbox
def _generate_columns_and_rows(self, bbox, table_idx):
    """Compute the column and row boundaries of the table inside a bbox.

    The textlines found within ``bbox`` are stored in ``self.t_bbox``.
    Rows come from grouping the horizontal textlines with
    ``self.row_tol``. Columns come from ``self.columns[table_idx]`` when
    that entry is a non-empty string; otherwise they are guessed from the
    rows whose number of textlines is the most frequent one.

    Parameters
    ----------
    bbox : tuple
        Bounding box handed to ``text_in_bbox_per_axis``.
    table_idx : int
        Index used to look up ``self.columns`` and to label the warning.

    Returns
    -------
    cols : list
        List of column x-coordinate tuples.
    rows : list
        Row boundaries as returned by ``self._join_rows``.
    None
        Placeholder, always ``None``.
    None
        Placeholder, always ``None``.

    """
    # Keep only the elements which lie within the table bbox.
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )
    textlines_in_bbox = self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        textlines_in_bbox
    )

    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol
    )
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)

    if self.columns is not None and self.columns[table_idx] != "":
        # The comma-separated values are used as separators; the x extent
        # of the text supplies the two outermost boundaries.
        separators = [float(c) for c in self.columns[table_idx].split(",")]
        boundaries = [text_x_min] + separators + [text_x_max]
        cols = list(zip(boundaries, boundaries[1:]))
        return cols, rows, None, None

    # The most frequent number of textlines per row is the guess for the
    # number of columns.
    row_sizes = [len(row) for row in rows_grouped]
    ncols = max(set(row_sizes), key=row_sizes.count)
    if ncols == 1:
        # A mode of 1 usually means there is no table, but the list can
        # be skewed: retry with the single-element rows left out.
        row_sizes = [size for size in row_sizes if size != 1]
        if row_sizes:
            ncols = max(set(row_sizes), key=row_sizes.count)
        else:
            warnings.warn(
                "No tables found in table area {}"
                .format(table_idx + 1)
            )

    anchors = sorted(
        (t.x0, t.x1)
        for row in rows_grouped
        if len(row) == ncols
        for t in row
    )
    cols = self._merge_columns(anchors, column_tol=self.column_tol)

    # Textlines lying strictly between two consecutive columns...
    stray_text = []
    for left_col, right_col in zip(cols, cols[1:]):
        gap_start = left_col[1]
        gap_end = right_col[0]
        for direction in self.t_bbox:
            for t in self.t_bbox[direction]:
                if t.x0 > gap_start and t.x1 < gap_end:
                    stray_text.append(t)
    # ... followed by the ones lying outside the first or last column.
    stray_text.extend(
        t
        for direction in self.t_bbox
        for t in self.t_bbox[direction]
        if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
    )

    cols = self._add_columns(cols, stray_text, self.row_tol)
    cols = self._join_columns(cols, text_x_min, text_x_max)
    return cols, rows, None, None
def _generate_table(self, table_idx, cols, rows, **kwargs):
    """Create the table for the given columns and rows.

    Parameters
    ----------
    table_idx : int
        Index forwarded to ``self._initialize_new_table``.
    cols : list
        Column boundaries forwarded to ``self._initialize_new_table``.
    rows : list
        Row boundaries forwarded to ``self._initialize_new_table``.
    **kwargs
        Accepted but not used here.

    Returns
    -------
    table
        The result of ``set_all_edges()`` on the new table, with the
        parse metadata and plotting attributes attached.

    """
    blank_table = self._initialize_new_table(table_idx, cols, rows)
    table = blank_table.set_all_edges()
    table.record_parse_metadata(self)

    # Extra attributes kept on the table for plotting.
    table._bbox = self.table_bbox
    table._segments = None
    table._textedges = self.textedges
    return table

View File

@ -87,9 +87,9 @@ def draw_parse_constraints(table, ax):
ax : matplotlib.axes.Axes ax : matplotlib.axes.Axes
""" """
if table.debug_info: if table.parse_details:
# Display a bbox per region # Display a bbox per region
for region_str in table.debug_info["table_regions"] or []: for region_str in table.parse_details["table_regions"] or []:
draw_labeled_bbox( draw_labeled_bbox(
ax, bbox_from_str(region_str), ax, bbox_from_str(region_str),
"region: ({region_str})".format(region_str=region_str), "region: ({region_str})".format(region_str=region_str),
@ -99,7 +99,7 @@ def draw_parse_constraints(table, ax):
label_pos="bottom,right" label_pos="bottom,right"
) )
# Display a bbox per area # Display a bbox per area
for area_str in table.debug_info["table_areas"] or []: for area_str in table.parse_details["table_areas"] or []:
draw_labeled_bbox( draw_labeled_bbox(
ax, bbox_from_str(area_str), ax, bbox_from_str(area_str),
"area: ({area_str})".format(area_str=area_str), "area: ({area_str})".format(area_str=area_str),
@ -294,8 +294,27 @@ class PlotMethods(object):
ax.set_ylim(min(ys) - 10, max(ys) + 10) ax.set_ylim(min(ys) - 10, max(ys) + 10)
if table.flavor == "hybrid": if table.flavor == "hybrid":
# FRHTODO: Clean this up for text_network in table.parse_details["network_searches"]:
table.debug_info["edges_searches"][0].plot_alignments(ax) # FRHTODO: This is too busy and doesn't plot lines
most_connected_tl = text_network.most_connected_textline()
ax.add_patch(
patches.Rectangle(
(most_connected_tl.x0, most_connected_tl.y0),
most_connected_tl.x1 - most_connected_tl.x0,
most_connected_tl.y1 - most_connected_tl.y0,
color="red",
alpha=0.5
)
)
for tl, alignments in text_network._textlines_alignments.items():
ax.text(
tl.x0 - 5,
tl.y0 - 5,
f"{alignments.max_h_count()}x{alignments.max_v_count()}",
fontsize=5,
color="black"
)
else: else:
for te in table._textedges: for te in table._textedges:
ax.plot([te.coord, te.coord], [te.y0, te.y1]) ax.plot([te.coord, te.coord], [te.y0, te.y1])
@ -372,10 +391,10 @@ class PlotMethods(object):
draw_pdf(table, ax) draw_pdf(table, ax)
draw_parse_constraints(table, ax) draw_parse_constraints(table, ax)
if table.debug_info is None: if table.parse_details is None:
return fig return fig
debug_info = table.debug_info parse_details = table.parse_details
for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]): for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
max_h_gap = bbox_search["max_h_gap"] max_h_gap = bbox_search["max_h_gap"]
max_v_gap = bbox_search["max_v_gap"] max_v_gap = bbox_search["max_v_gap"]
iterations = bbox_search["iterations"] iterations = bbox_search["iterations"]
@ -403,7 +422,7 @@ class PlotMethods(object):
) )
) )
for box_id, col_search in enumerate(debug_info["col_searches"]): for box_id, col_search in enumerate(parse_details["col_searches"]):
draw_labeled_bbox( draw_labeled_bbox(
ax, col_search["expanded_bbox"], ax, col_search["expanded_bbox"],
"box body + header #{box_id}".format( "box body + header #{box_id}".format(
@ -422,10 +441,5 @@ class PlotMethods(object):
linewidth=2, linewidth=2,
label_pos="bottom,left" label_pos="bottom,left"
) )
# self.debug_info["col_searches"].append({
# "core_bbox": bbox,
# "cols_anchors": cols_anchors,
# "expanded_bbox": expanded_bbox
# })
return fig return fig

Binary file not shown.

Before

Width:  |  Height:  |  Size: 105 KiB

After

Width:  |  Height:  |  Size: 105 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 197 KiB

After

Width:  |  Height:  |  Size: 192 KiB