Move generic code to utils

pull/153/head
Frh 2020-04-22 19:08:06 -07:00
parent 14cd328644
commit 7ad5b843ab
2 changed files with 65 additions and 66 deletions

View File

@ -9,6 +9,7 @@ import warnings
from .base import BaseParser from .base import BaseParser
from ..utils import ( from ..utils import (
get_index_closest_point,
get_textline_coords, get_textline_coords,
bbox_from_str, bbox_from_str,
text_in_bbox, text_in_bbox,
@ -264,56 +265,6 @@ class TextEdges2(object):
self.max_rows = None self.max_rows = None
self.max_cols = None self.max_cols = None
# FRHTODO: Move to utils and use generic name
@staticmethod
def _get_index_closest_point(coord, edge_array):
"""Returns the index of the closest point
"""
n = len(edge_array)
if n == 0:
return None
if n == 1:
return 0
left = 0
right = n - 1
mid = 0
if coord >= edge_array[n - 1].coord:
return n - 1
if coord <= edge_array[0].coord:
return 0
while left < right:
mid = (left + right) // 2 # find the mid
if coord < edge_array[mid].coord:
right = mid
elif coord > edge_array[mid].coord:
left = mid + 1
else:
return mid
if edge_array[mid].coord > coord:
if mid > 0 and (
coord - edge_array[mid-1].coord <
edge_array[mid].coord - coord):
return mid-1
elif edge_array[mid].coord < coord:
if mid < n - 1 and (
edge_array[mid+1].coord - coord <
coord - edge_array[mid].coord):
return mid+1
return mid
# def insert(self, index, textline, align):
# """Adds a new text edge to the current dict.
# """
# x = self.get_x_coord(textline, align)
# y0 = textline.y0
# y1 = textline.y1
# te = TextEdge(x, y0, y1, align=align)
# self._textedges[align].insert(index, te)
def _register_textline(self, textline): def _register_textline(self, textline):
"""Updates an existing text edge in the current dict. """Updates an existing text edge in the current dict.
""" """
@ -323,7 +274,9 @@ class TextEdges2(object):
coord = coords[alignment] coord = coords[alignment]
# Find the index of the closest existing element (or 0 if none) # Find the index of the closest existing element (or 0 if none)
idx_closest = self._get_index_closest_point(coord, edge_array) idx_closest = get_index_closest_point(
coord, edge_array, fn=lambda x: x.coord
)
# Check if the edges before/after are close enough # Check if the edges before/after are close enough
# that it can be considered aligned # that it can be considered aligned
@ -353,19 +306,15 @@ class TextEdges2(object):
def _compute_alignment_counts(self): def _compute_alignment_counts(self):
"""Build a dictionary textline -> alignment object. """Build a dictionary textline -> alignment object.
""" """
#
for edge_name, textedges in self._textedges.items(): for edge_name, textedges in self._textedges.items():
for textedge in textedges: for textedge in textedges:
for textline in textedge.textlines: for textline in textedge.textlines:
textline_alignments = self._textlines_alignments.get( alignments = self._textlines_alignments.get(
textline, None) textline, None)
if textline_alignments is None: if alignments is None:
alignments = Alignments() alignments = Alignments()
alignments[edge_name] = len(textedge.textlines)
self._textlines_alignments[textline] = alignments self._textlines_alignments[textline] = alignments
else: alignments[edge_name] = len(textedge.textlines)
textline_alignments[edge_name] = len(
textedge.textlines)
# Finally calculate the overall maximum number of rows/cols # Finally calculate the overall maximum number of rows/cols
self.max_rows = max( self.max_rows = max(
@ -493,9 +442,10 @@ class TextEdges2(object):
v_coord = most_aligned_coords[ref_v_edge_name] v_coord = most_aligned_coords[ref_v_edge_name]
h_textlines = sorted( h_textlines = sorted(
best_h_textedges[ best_h_textedges[
TextEdges2._get_index_closest_point( get_index_closest_point(
h_coord, h_coord,
best_h_textedges best_h_textedges,
fn=lambda x: x.coord
) )
].textlines, ].textlines,
key=lambda tl: tl.x0, key=lambda tl: tl.x0,
@ -503,9 +453,10 @@ class TextEdges2(object):
) )
v_textlines = sorted( v_textlines = sorted(
best_v_textedges[ best_v_textedges[
TextEdges2._get_index_closest_point( get_index_closest_point(
v_coord, v_coord,
best_v_textedges best_v_textedges,
fn=lambda x: x.coord
) )
].textlines, ].textlines,
key=lambda tl: tl.y0, key=lambda tl: tl.y0,
@ -550,10 +501,6 @@ class TextEdges2(object):
# Calculate the 75th percentile of the horizontal/vertical # Calculate the 75th percentile of the horizontal/vertical
# gaps between textlines. Use this as a reference for a threshold # gaps between textlines. Use this as a reference for a threshold
# to not exceed while looking for table boundaries. # to not exceed while looking for table boundaries.
# FRHTODO: Clean this up
# gaps_hv = self._calculate_gaps_thresholds(75)
# if (gaps_hv[0] is None or gaps_hv[1] is None):
# return None
max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1] max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
if debug_info is not None: if debug_info is not None:

View File

@ -560,6 +560,58 @@ def find_columns_coordinates(tls):
return cols_anchors return cols_anchors
def get_index_closest_point(point, sorted_list, fn=lambda x: x):
    """Return the index of the element of a sorted list nearest to a point.

    The lookup is a binary search over the values obtained by applying
    `fn` to each element, so `sorted_list` is expected to be ordered by
    increasing ``fn(element)``.

    Parameters
    ----------
    point : number
        Reference value to look up.
    sorted_list : list
        Elements to search through.
    fn : callable, optional
        Accessor mapping an element to the value compared against
        `point`. Defaults to the identity.

    Returns
    -------
    index : int or None
        Index of the nearest element, or None when `sorted_list` is
        empty.
    """
    count = len(sorted_list)
    if count == 0:
        return None
    if count == 1:
        return 0

    # Anything at or beyond either end snaps to that end.
    last = count - 1
    if point >= fn(sorted_list[last]):
        return last
    if point <= fn(sorted_list[0]):
        return 0

    lo, hi = 0, last
    probe = 0
    while lo < hi:
        probe = (lo + hi) // 2
        probe_val = fn(sorted_list[probe])
        if point < probe_val:
            hi = probe
        elif point > probe_val:
            lo = probe + 1
        else:
            # Exact hit, nothing can be closer.
            return probe

    # No exact match: the last probed element is one of the two
    # neighbours of `point`; switch to the other one only when it is
    # strictly nearer.
    if probe_val > point:
        below = probe - 1
        if probe > 0 and (
                point - fn(sorted_list[below]) < probe_val - point):
            return below
    elif probe_val < point:
        above = probe + 1
        if probe < last and (
                fn(sorted_list[above]) - point < point - probe_val):
            return above
    return probe
def distance_tl_to_bbox(tl, bbox): def distance_tl_to_bbox(tl, bbox):
"""Returns a tuple corresponding to the horizontal and vertical gaps """Returns a tuple corresponding to the horizontal and vertical gaps
between a textline and a bbox. between a textline and a bbox.