Refactor common code hybrid / stream

pull/153/head
Frh 2020-04-22 17:33:15 -07:00
parent bfc2719aff
commit 14cd328644
3 changed files with 50 additions and 50 deletions

View File

@ -12,6 +12,7 @@ import pandas as pd
from cv2 import cv2 from cv2 import cv2
from .utils import ( from .utils import (
get_textline_coords,
build_file_path_in_temp_dir, build_file_path_in_temp_dir,
compute_accuracy, compute_accuracy,
compute_whitespace, compute_whitespace,
@ -98,11 +99,8 @@ class TextEdges(object):
"""Returns the x coordinate of a text row based on the """Returns the x coordinate of a text row based on the
specified alignment. specified alignment.
""" """
x_left = textline.x0 coords = get_textline_coords(textline)
x_right = textline.x1 return coords[align]
x_middle = x_left + (x_right - x_left) / 2.0
x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
return x_coord[align]
def find(self, x_coord, align): def find(self, x_coord, align):
"""Returns the index of an existing text edge using """Returns the index of an existing text edge using

View File

@ -1,4 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""Implementation of hybrid table parser."""
from __future__ import division from __future__ import division
import numpy as np import numpy as np
@ -7,6 +9,7 @@ import warnings
from .base import BaseParser from .base import BaseParser
from ..utils import ( from ..utils import (
get_textline_coords,
bbox_from_str, bbox_from_str,
text_in_bbox, text_in_bbox,
text_in_bbox_per_axis, text_in_bbox_per_axis,
@ -17,30 +20,26 @@ from ..utils import (
from matplotlib import patches as patches from matplotlib import patches as patches
# FRHTODO: Move to utils
# maximum number of columns over which a header can spread # maximum number of columns over which a header can spread
MAX_COL_SPREAD_IN_HEADER = 3 MAX_COL_SPREAD_IN_HEADER = 3
def todo_move_me_expand_area_for_header(area, textlines, col_anchors, def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
max_v_gap): """Expand a bbox vertically up by looking for plausible headers.
"""The core algorithm is based on fairly strict alignment of text.
It works ok for the table body, but might fail on tables' headers The core algorithm is based on fairly strict alignment of text. It works
since they tend to be in a different font, alignment (e.g. vertical), for the table body, but might fail on tables' headers since they tend to be
etc. in a different font, alignment (e.g. vertical), etc.
The section below tries to identify whether what's above the bbox This method evaluates the area above the table body's bbox for
identified so far has the characteristics of a table header: characteristics of a table header: close to the top of the body, with cells
Close to the top of the body, with cells that fit within the bounds that fit within the horizontal bounds identified.
identified.
""" """
new_area = area new_bbox = body_bbox
(left, bottom, right, top) = area (left, bottom, right, top) = body_bbox
zones = [] zones = []
def column_spread(left, right, col_anchors): def column_spread(left, right, col_anchors):
"""Returns the number of columns (splits on the x-axis) """Get the number of columns crossed by a segment [left, right]."""
crossed by an element covering left to right.
"""
indexLeft = 0 indexLeft = 0
while indexLeft < len(col_anchors) \ while indexLeft < len(col_anchors) \
and col_anchors[indexLeft] < left: and col_anchors[indexLeft] < left:
@ -55,7 +54,7 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
keep_searching = True keep_searching = True
while keep_searching: while keep_searching:
keep_searching = False keep_searching = False
# a/ first look for the closest text element above the area. # a/ first look for the closest text element above the bbox.
# It will be the anchor for a possible new row. # It will be the anchor for a possible new row.
closest_above = None closest_above = None
all_above = [] all_above = []
@ -128,18 +127,18 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
# 1: <A1> <B1> <C1> <D1> <E1> # 1: <A1> <B1> <C1> <D1> <E1>
# 2: <A2> <B2> <C2> <D2> <E2> # 2: <A2> <B2> <C2> <D2> <E2>
# if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS: # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
new_area = (left, bottom, right, top) new_bbox = (left, bottom, right, top)
# At this stage we've identified a plausible row (or the # At this stage we've identified a plausible row (or the
# beginning of one). # beginning of one).
keep_searching = True keep_searching = True
return new_bbox
return new_area
class TextEdge2(object): class TextEdge2(object):
"""Defines a text edge coordinates relative to a left-bottom """Text edge coordinates relative to a left-bottom origin.
origin. (PDF coordinate space)
(PDF coordinate space)
Parameters Parameters
---------- ----------
@ -167,8 +166,7 @@ class TextEdge2(object):
f"textlines text='{text_inside}...'>" f"textlines text='{text_inside}...'>"
def register_aligned_textline(self, textline, coord): def register_aligned_textline(self, textline, coord):
"""Updates new textline to this alignment, adapting its average. """Update new textline to this alignment, adapting its average."""
"""
# Increase the intersections for this segment, expand it up, # Increase the intersections for this segment, expand it up,
# and adjust the x based on the new value # and adjust the x based on the new value
self.coord = (self.coord * len(self.textlines) + coord) / \ self.coord = (self.coord * len(self.textlines) + coord) / \
@ -177,8 +175,13 @@ class TextEdge2(object):
class Alignments(object): class Alignments(object):
"""Represents the number of other textlines aligned with this """
one across each edge. Represent the number of textlines aligned with this one across each edge.
A cell can be vertically aligned with others by having matching left,
right, or middle edge, and horizontally aligned by having matching top,
bottom, or center edge.
""" """
def __init__(self): def __init__(self):
@ -261,20 +264,6 @@ class TextEdges2(object):
self.max_rows = None self.max_rows = None
self.max_cols = None self.max_cols = None
@staticmethod
def get_textline_coords(textline):
"""Calculate the coordinates of each alignment
for a given textline.
"""
return {
"left": textline.x0,
"right": textline.x1,
"middle": (textline.x0 + textline.x1) / 2.0,
"bottom": textline.y0,
"top": textline.y1,
"center": (textline.y0 + textline.y1) / 2.0,
}
# FRHTODO: Move to utils and use generic name # FRHTODO: Move to utils and use generic name
@staticmethod @staticmethod
def _get_index_closest_point(coord, edge_array): def _get_index_closest_point(coord, edge_array):
@ -328,7 +317,7 @@ class TextEdges2(object):
def _register_textline(self, textline): def _register_textline(self, textline):
"""Updates an existing text edge in the current dict. """Updates an existing text edge in the current dict.
""" """
coords = TextEdges2.get_textline_coords(textline) coords = get_textline_coords(textline)
for alignment in self._textedges: for alignment in self._textedges:
edge_array = self._textedges[alignment] edge_array = self._textedges[alignment]
coord = coords[alignment] coord = coords[alignment]
@ -490,7 +479,7 @@ class TextEdges2(object):
# It will serve as a reference axis along which to collect the average # It will serve as a reference axis along which to collect the average
# spacing between rows/cols. # spacing between rows/cols.
most_aligned_tl = self._most_connected_textline() most_aligned_tl = self._most_connected_textline()
most_aligned_coords = TextEdges2.get_textline_coords( most_aligned_coords = get_textline_coords(
most_aligned_tl) most_aligned_tl)
# Retrieve the list of textlines it's aligned with, across both # Retrieve the list of textlines it's aligned with, across both
@ -871,7 +860,7 @@ class Hybrid(BaseParser):
# FRHTODO: Check if needed, refactor with Stream # FRHTODO: Check if needed, refactor with Stream
@staticmethod @staticmethod
def _add_columns(cols, text, row_tol): def _add_columns(cols, text, row_tol):
"""Adds columns to existing list by taking into account """Add columns to existing list by taking into account
the text that lies outside the current column x-coordinates. the text that lies outside the current column x-coordinates.
Parameters Parameters
@ -993,7 +982,7 @@ class Hybrid(BaseParser):
# Apply a heuristic to salvage headers which formatting might be # Apply a heuristic to salvage headers which formatting might be
# off compared to the rest of the table. # off compared to the rest of the table.
expanded_bbox = todo_move_me_expand_area_for_header( expanded_bbox = search_header_from_body_bbox(
bbox, bbox,
textlines, textlines,
cols_anchors, cols_anchors,

View File

@ -389,8 +389,21 @@ def segments_in_bbox(bbox, v_segments, h_segments):
return v_s, h_s return v_s, h_s
def get_textline_coords(textline):
    """Calculate the coordinates of each alignment for a given textline.

    Parameters
    ----------
    textline : object
        Any object exposing numeric ``x0``, ``x1``, ``y0`` and ``y1``
        attributes (presumably a PDFMiner text line -- confirm against
        callers).

    Returns
    -------
    coords : dict
        Maps each alignment name to a coordinate: "left", "right" and
        "middle" are derived from ``x0``/``x1``; "bottom", "top" and
        "center" are derived from ``y0``/``y1``.

    """
    x0, x1 = textline.x0, textline.x1
    y0, y1 = textline.y0, textline.y1
    return {
        "left": x0,
        "right": x1,
        # Midpoints divide by a float so the result is a float even for
        # integer inputs.
        "middle": (x0 + x1) / 2.0,
        "bottom": y0,
        "top": y1,
        "center": (y0 + y1) / 2.0,
    }
def bbox_from_str(bbox_str): def bbox_from_str(bbox_str):
"""Deserialize bbox from string form "x1,y1,x2,y2" to tuple (x1, y1, x2, y2) """Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2).
Parameters Parameters
---------- ----------