Refactor common code hybrid / stream
parent
489e996bd8
commit
36d5a09ad6
|
|
@ -12,6 +12,7 @@ import pandas as pd
|
|||
from cv2 import cv2
|
||||
|
||||
from .utils import (
|
||||
get_textline_coords,
|
||||
build_file_path_in_temp_dir,
|
||||
compute_accuracy,
|
||||
compute_whitespace,
|
||||
|
|
@ -98,11 +99,8 @@ class TextEdges(object):
|
|||
"""Returns the x coordinate of a text row based on the
|
||||
specified alignment.
|
||||
"""
|
||||
x_left = textline.x0
|
||||
x_right = textline.x1
|
||||
x_middle = x_left + (x_right - x_left) / 2.0
|
||||
x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
|
||||
return x_coord[align]
|
||||
coords = get_textline_coords(textline)
|
||||
return coords[align]
|
||||
|
||||
def find(self, x_coord, align):
|
||||
"""Returns the index of an existing text edge using
|
||||
|
|
|
|||
|
|
@ -1,4 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""Implementation of hybrid table parser."""
|
||||
|
||||
from __future__ import division
|
||||
|
||||
import numpy as np
|
||||
|
|
@ -7,6 +9,7 @@ import warnings
|
|||
|
||||
from .base import BaseParser
|
||||
from ..utils import (
|
||||
get_textline_coords,
|
||||
bbox_from_str,
|
||||
text_in_bbox,
|
||||
text_in_bbox_per_axis,
|
||||
|
|
@ -17,30 +20,26 @@ from ..utils import (
|
|||
|
||||
from matplotlib import patches as patches
|
||||
|
||||
# FRHTODO: Move to utils
|
||||
# maximum number of columns over which a header can spread
|
||||
MAX_COL_SPREAD_IN_HEADER = 3
|
||||
|
||||
|
||||
def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
||||
max_v_gap):
|
||||
"""The core algorithm is based on fairly strict alignment of text.
|
||||
It works ok for the table body, but might fail on tables' headers
|
||||
since they tend to be in a different font, alignment (e.g. vertical),
|
||||
etc.
|
||||
The section below tries to identify whether what's above the bbox
|
||||
identified so far has the characteristics of a table header:
|
||||
Close to the top of the body, with cells that fit within the bounds
|
||||
identified.
|
||||
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||
"""Expand a bbox vertically up by looking for plausible headers.
|
||||
|
||||
The core algorithm is based on fairly strict alignment of text. It works
|
||||
for the table body, but might fail on tables' headers since they tend to be
|
||||
in a different font, alignment (e.g. vertical), etc.
|
||||
This method evalutes the area above the table body's bbox for
|
||||
characteristics of a table header: close to the top of the body, with cells
|
||||
that fit within the horizontal bounds identified.
|
||||
"""
|
||||
new_area = area
|
||||
(left, bottom, right, top) = area
|
||||
new_bbox = body_bbox
|
||||
(left, bottom, right, top) = body_bbox
|
||||
zones = []
|
||||
|
||||
def column_spread(left, right, col_anchors):
|
||||
"""Returns the number of columns (splits on the x-axis)
|
||||
crossed by an element covering left to right.
|
||||
"""
|
||||
"""Get the number of columns crossed by a segment [left, right]."""
|
||||
indexLeft = 0
|
||||
while indexLeft < len(col_anchors) \
|
||||
and col_anchors[indexLeft] < left:
|
||||
|
|
@ -55,7 +54,7 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
|||
keep_searching = True
|
||||
while keep_searching:
|
||||
keep_searching = False
|
||||
# a/ first look for the closest text element above the area.
|
||||
# a/ first look for the closest text element above the bbox.
|
||||
# It will be the anchor for a possible new row.
|
||||
closest_above = None
|
||||
all_above = []
|
||||
|
|
@ -128,18 +127,18 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
|||
# 1: <A1> <B1> <C1> <D1> <E1>
|
||||
# 2: <A2> <B2> <C2> <D2> <E2>
|
||||
# if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
|
||||
new_area = (left, bottom, right, top)
|
||||
new_bbox = (left, bottom, right, top)
|
||||
|
||||
# At this stage we've identified a plausible row (or the
|
||||
# beginning of one).
|
||||
keep_searching = True
|
||||
|
||||
return new_area
|
||||
return new_bbox
|
||||
|
||||
|
||||
class TextEdge2(object):
|
||||
"""Defines a text edge coordinates relative to a left-bottom
|
||||
origin. (PDF coordinate space)
|
||||
"""Text edge coordinates relative to a left-bottom origin.
|
||||
|
||||
(PDF coordinate space)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
@ -167,8 +166,7 @@ class TextEdge2(object):
|
|||
f"textlines text='{text_inside}...'>"
|
||||
|
||||
def register_aligned_textline(self, textline, coord):
|
||||
"""Updates new textline to this alignment, adapting its average.
|
||||
"""
|
||||
"""Update new textline to this alignment, adapting its average."""
|
||||
# Increase the intersections for this segment, expand it up,
|
||||
# and adjust the x based on the new value
|
||||
self.coord = (self.coord * len(self.textlines) + coord) / \
|
||||
|
|
@ -177,8 +175,13 @@ class TextEdge2(object):
|
|||
|
||||
|
||||
class Alignments(object):
|
||||
"""Represents the number of other textlines aligned with this
|
||||
one across each edge.
|
||||
"""
|
||||
Represent the number of textlines aligned with this one across each edge.
|
||||
|
||||
A cell can be vertically aligned with others by having matching left,
|
||||
right, or middle edge, and horizontally aligned by having matching top,
|
||||
bottom, or center edge.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
|
@ -261,20 +264,6 @@ class TextEdges2(object):
|
|||
self.max_rows = None
|
||||
self.max_cols = None
|
||||
|
||||
@staticmethod
|
||||
def get_textline_coords(textline):
|
||||
"""Calculate the coordinates of each alignment
|
||||
for a given textline.
|
||||
"""
|
||||
return {
|
||||
"left": textline.x0,
|
||||
"right": textline.x1,
|
||||
"middle": (textline.x0 + textline.x1) / 2.0,
|
||||
"bottom": textline.y0,
|
||||
"top": textline.y1,
|
||||
"center": (textline.y0 + textline.y1) / 2.0,
|
||||
}
|
||||
|
||||
# FRHTODO: Move to utils and use generic name
|
||||
@staticmethod
|
||||
def _get_index_closest_point(coord, edge_array):
|
||||
|
|
@ -328,7 +317,7 @@ class TextEdges2(object):
|
|||
def _register_textline(self, textline):
|
||||
"""Updates an existing text edge in the current dict.
|
||||
"""
|
||||
coords = TextEdges2.get_textline_coords(textline)
|
||||
coords = get_textline_coords(textline)
|
||||
for alignment in self._textedges:
|
||||
edge_array = self._textedges[alignment]
|
||||
coord = coords[alignment]
|
||||
|
|
@ -490,7 +479,7 @@ class TextEdges2(object):
|
|||
# It will serve as a reference axis along which to collect the average
|
||||
# spacing between rows/cols.
|
||||
most_aligned_tl = self._most_connected_textline()
|
||||
most_aligned_coords = TextEdges2.get_textline_coords(
|
||||
most_aligned_coords = get_textline_coords(
|
||||
most_aligned_tl)
|
||||
|
||||
# Retrieve the list of textlines it's aligned with, across both
|
||||
|
|
@ -871,7 +860,7 @@ class Hybrid(BaseParser):
|
|||
# FRHTODO: Check if needed, refactor with Stream
|
||||
@staticmethod
|
||||
def _add_columns(cols, text, row_tol):
|
||||
"""Adds columns to existing list by taking into account
|
||||
"""Add columns to existing list by taking into account
|
||||
the text that lies outside the current column x-coordinates.
|
||||
|
||||
Parameters
|
||||
|
|
@ -993,7 +982,7 @@ class Hybrid(BaseParser):
|
|||
|
||||
# Apply a heuristic to salvage headers which formatting might be
|
||||
# off compared to the rest of the table.
|
||||
expanded_bbox = todo_move_me_expand_area_for_header(
|
||||
expanded_bbox = search_header_from_body_bbox(
|
||||
bbox,
|
||||
textlines,
|
||||
cols_anchors,
|
||||
|
|
|
|||
|
|
@ -389,8 +389,21 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
|||
return v_s, h_s
|
||||
|
||||
|
||||
def get_textline_coords(textline):
|
||||
"""Calculate the coordinates of each alignment for a given textline.
|
||||
"""
|
||||
return {
|
||||
"left": textline.x0,
|
||||
"right": textline.x1,
|
||||
"middle": (textline.x0 + textline.x1) / 2.0,
|
||||
"bottom": textline.y0,
|
||||
"top": textline.y1,
|
||||
"center": (textline.y0 + textline.y1) / 2.0,
|
||||
}
|
||||
|
||||
|
||||
def bbox_from_str(bbox_str):
|
||||
"""Deserialize bbox from string form "x1,y1,x2,y2" to tuple (x1, y1, x2, y2)
|
||||
"""Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
|
|
|||
Loading…
Reference in New Issue