Refactor common code hybrid / stream
parent
489e996bd8
commit
36d5a09ad6
|
|
@ -12,6 +12,7 @@ import pandas as pd
|
||||||
from cv2 import cv2
|
from cv2 import cv2
|
||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
|
get_textline_coords,
|
||||||
build_file_path_in_temp_dir,
|
build_file_path_in_temp_dir,
|
||||||
compute_accuracy,
|
compute_accuracy,
|
||||||
compute_whitespace,
|
compute_whitespace,
|
||||||
|
|
@ -98,11 +99,8 @@ class TextEdges(object):
|
||||||
"""Returns the x coordinate of a text row based on the
|
"""Returns the x coordinate of a text row based on the
|
||||||
specified alignment.
|
specified alignment.
|
||||||
"""
|
"""
|
||||||
x_left = textline.x0
|
coords = get_textline_coords(textline)
|
||||||
x_right = textline.x1
|
return coords[align]
|
||||||
x_middle = x_left + (x_right - x_left) / 2.0
|
|
||||||
x_coord = {"left": x_left, "middle": x_middle, "right": x_right}
|
|
||||||
return x_coord[align]
|
|
||||||
|
|
||||||
def find(self, x_coord, align):
|
def find(self, x_coord, align):
|
||||||
"""Returns the index of an existing text edge using
|
"""Returns the index of an existing text edge using
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,6 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Implementation of hybrid table parser."""
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
@ -7,6 +9,7 @@ import warnings
|
||||||
|
|
||||||
from .base import BaseParser
|
from .base import BaseParser
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
|
get_textline_coords,
|
||||||
bbox_from_str,
|
bbox_from_str,
|
||||||
text_in_bbox,
|
text_in_bbox,
|
||||||
text_in_bbox_per_axis,
|
text_in_bbox_per_axis,
|
||||||
|
|
@ -17,30 +20,26 @@ from ..utils import (
|
||||||
|
|
||||||
from matplotlib import patches as patches
|
from matplotlib import patches as patches
|
||||||
|
|
||||||
# FRHTODO: Move to utils
|
|
||||||
# maximum number of columns over which a header can spread
|
# maximum number of columns over which a header can spread
|
||||||
MAX_COL_SPREAD_IN_HEADER = 3
|
MAX_COL_SPREAD_IN_HEADER = 3
|
||||||
|
|
||||||
|
|
||||||
def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
max_v_gap):
|
"""Expand a bbox vertically up by looking for plausible headers.
|
||||||
"""The core algorithm is based on fairly strict alignment of text.
|
|
||||||
It works ok for the table body, but might fail on tables' headers
|
The core algorithm is based on fairly strict alignment of text. It works
|
||||||
since they tend to be in a different font, alignment (e.g. vertical),
|
for the table body, but might fail on tables' headers since they tend to be
|
||||||
etc.
|
in a different font, alignment (e.g. vertical), etc.
|
||||||
The section below tries to identify whether what's above the bbox
|
This method evaluates the area above the table body's bbox for
|
||||||
identified so far has the characteristics of a table header:
|
characteristics of a table header: close to the top of the body, with cells
|
||||||
Close to the top of the body, with cells that fit within the bounds
|
that fit within the horizontal bounds identified.
|
||||||
identified.
|
|
||||||
"""
|
"""
|
||||||
new_area = area
|
new_bbox = body_bbox
|
||||||
(left, bottom, right, top) = area
|
(left, bottom, right, top) = body_bbox
|
||||||
zones = []
|
zones = []
|
||||||
|
|
||||||
def column_spread(left, right, col_anchors):
|
def column_spread(left, right, col_anchors):
|
||||||
"""Returns the number of columns (splits on the x-axis)
|
"""Get the number of columns crossed by a segment [left, right]."""
|
||||||
crossed by an element covering left to right.
|
|
||||||
"""
|
|
||||||
indexLeft = 0
|
indexLeft = 0
|
||||||
while indexLeft < len(col_anchors) \
|
while indexLeft < len(col_anchors) \
|
||||||
and col_anchors[indexLeft] < left:
|
and col_anchors[indexLeft] < left:
|
||||||
|
|
@ -55,7 +54,7 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
||||||
keep_searching = True
|
keep_searching = True
|
||||||
while keep_searching:
|
while keep_searching:
|
||||||
keep_searching = False
|
keep_searching = False
|
||||||
# a/ first look for the closest text element above the area.
|
# a/ first look for the closest text element above the bbox.
|
||||||
# It will be the anchor for a possible new row.
|
# It will be the anchor for a possible new row.
|
||||||
closest_above = None
|
closest_above = None
|
||||||
all_above = []
|
all_above = []
|
||||||
|
|
@ -128,18 +127,18 @@ def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
|
||||||
# 1: <A1> <B1> <C1> <D1> <E1>
|
# 1: <A1> <B1> <C1> <D1> <E1>
|
||||||
# 2: <A2> <B2> <C2> <D2> <E2>
|
# 2: <A2> <B2> <C2> <D2> <E2>
|
||||||
# if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
|
# if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
|
||||||
new_area = (left, bottom, right, top)
|
new_bbox = (left, bottom, right, top)
|
||||||
|
|
||||||
# At this stage we've identified a plausible row (or the
|
# At this stage we've identified a plausible row (or the
|
||||||
# beginning of one).
|
# beginning of one).
|
||||||
keep_searching = True
|
keep_searching = True
|
||||||
|
return new_bbox
|
||||||
return new_area
|
|
||||||
|
|
||||||
|
|
||||||
class TextEdge2(object):
|
class TextEdge2(object):
|
||||||
"""Defines a text edge coordinates relative to a left-bottom
|
"""Text edge coordinates relative to a left-bottom origin.
|
||||||
origin. (PDF coordinate space)
|
|
||||||
|
(PDF coordinate space)
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
@ -167,8 +166,7 @@ class TextEdge2(object):
|
||||||
f"textlines text='{text_inside}...'>"
|
f"textlines text='{text_inside}...'>"
|
||||||
|
|
||||||
def register_aligned_textline(self, textline, coord):
|
def register_aligned_textline(self, textline, coord):
|
||||||
"""Updates new textline to this alignment, adapting its average.
|
"""Update new textline to this alignment, adapting its average."""
|
||||||
"""
|
|
||||||
# Increase the intersections for this segment, expand it up,
|
# Increase the intersections for this segment, expand it up,
|
||||||
# and adjust the x based on the new value
|
# and adjust the x based on the new value
|
||||||
self.coord = (self.coord * len(self.textlines) + coord) / \
|
self.coord = (self.coord * len(self.textlines) + coord) / \
|
||||||
|
|
@ -177,8 +175,13 @@ class TextEdge2(object):
|
||||||
|
|
||||||
|
|
||||||
class Alignments(object):
|
class Alignments(object):
|
||||||
"""Represents the number of other textlines aligned with this
|
"""
|
||||||
one across each edge.
|
Represent the number of textlines aligned with this one across each edge.
|
||||||
|
|
||||||
|
A cell can be vertically aligned with others by having matching left,
|
||||||
|
right, or middle edge, and horizontally aligned by having matching top,
|
||||||
|
bottom, or center edge.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
@ -261,20 +264,6 @@ class TextEdges2(object):
|
||||||
self.max_rows = None
|
self.max_rows = None
|
||||||
self.max_cols = None
|
self.max_cols = None
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_textline_coords(textline):
|
|
||||||
"""Calculate the coordinates of each alignment
|
|
||||||
for a given textline.
|
|
||||||
"""
|
|
||||||
return {
|
|
||||||
"left": textline.x0,
|
|
||||||
"right": textline.x1,
|
|
||||||
"middle": (textline.x0 + textline.x1) / 2.0,
|
|
||||||
"bottom": textline.y0,
|
|
||||||
"top": textline.y1,
|
|
||||||
"center": (textline.y0 + textline.y1) / 2.0,
|
|
||||||
}
|
|
||||||
|
|
||||||
# FRHTODO: Move to utils and use generic name
|
# FRHTODO: Move to utils and use generic name
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_index_closest_point(coord, edge_array):
|
def _get_index_closest_point(coord, edge_array):
|
||||||
|
|
@ -328,7 +317,7 @@ class TextEdges2(object):
|
||||||
def _register_textline(self, textline):
|
def _register_textline(self, textline):
|
||||||
"""Updates an existing text edge in the current dict.
|
"""Updates an existing text edge in the current dict.
|
||||||
"""
|
"""
|
||||||
coords = TextEdges2.get_textline_coords(textline)
|
coords = get_textline_coords(textline)
|
||||||
for alignment in self._textedges:
|
for alignment in self._textedges:
|
||||||
edge_array = self._textedges[alignment]
|
edge_array = self._textedges[alignment]
|
||||||
coord = coords[alignment]
|
coord = coords[alignment]
|
||||||
|
|
@ -490,7 +479,7 @@ class TextEdges2(object):
|
||||||
# It will serve as a reference axis along which to collect the average
|
# It will serve as a reference axis along which to collect the average
|
||||||
# spacing between rows/cols.
|
# spacing between rows/cols.
|
||||||
most_aligned_tl = self._most_connected_textline()
|
most_aligned_tl = self._most_connected_textline()
|
||||||
most_aligned_coords = TextEdges2.get_textline_coords(
|
most_aligned_coords = get_textline_coords(
|
||||||
most_aligned_tl)
|
most_aligned_tl)
|
||||||
|
|
||||||
# Retrieve the list of textlines it's aligned with, across both
|
# Retrieve the list of textlines it's aligned with, across both
|
||||||
|
|
@ -871,7 +860,7 @@ class Hybrid(BaseParser):
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
# FRHTODO: Check if needed, refactor with Stream
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _add_columns(cols, text, row_tol):
|
def _add_columns(cols, text, row_tol):
|
||||||
"""Adds columns to existing list by taking into account
|
"""Add columns to existing list by taking into account
|
||||||
the text that lies outside the current column x-coordinates.
|
the text that lies outside the current column x-coordinates.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
|
@ -993,7 +982,7 @@ class Hybrid(BaseParser):
|
||||||
|
|
||||||
# Apply a heuristic to salvage headers whose formatting might be
|
# Apply a heuristic to salvage headers whose formatting might be
|
||||||
# off compared to the rest of the table.
|
# off compared to the rest of the table.
|
||||||
expanded_bbox = todo_move_me_expand_area_for_header(
|
expanded_bbox = search_header_from_body_bbox(
|
||||||
bbox,
|
bbox,
|
||||||
textlines,
|
textlines,
|
||||||
cols_anchors,
|
cols_anchors,
|
||||||
|
|
|
||||||
|
|
@ -389,8 +389,21 @@ def segments_in_bbox(bbox, v_segments, h_segments):
|
||||||
return v_s, h_s
|
return v_s, h_s
|
||||||
|
|
||||||
|
|
||||||
|
def get_textline_coords(textline):
|
||||||
|
"""Calculate the coordinates of each alignment for a given textline.
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"left": textline.x0,
|
||||||
|
"right": textline.x1,
|
||||||
|
"middle": (textline.x0 + textline.x1) / 2.0,
|
||||||
|
"bottom": textline.y0,
|
||||||
|
"top": textline.y1,
|
||||||
|
"center": (textline.y0 + textline.y1) / 2.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def bbox_from_str(bbox_str):
|
def bbox_from_str(bbox_str):
|
||||||
"""Deserialize bbox from string form "x1,y1,x2,y2" to tuple (x1, y1, x2, y2)
|
"""Deserialize bbox from string ("x1,y1,x2,y2") to tuple (x1, y1, x2, y2).
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue