Further refactoring
parent
18581640be
commit
c9a73a1ad7
|
|
@ -15,8 +15,6 @@ from .utils import (
|
||||||
get_index_closest_point,
|
get_index_closest_point,
|
||||||
get_textline_coords,
|
get_textline_coords,
|
||||||
build_file_path_in_temp_dir,
|
build_file_path_in_temp_dir,
|
||||||
compute_accuracy,
|
|
||||||
compute_whitespace,
|
|
||||||
export_pdf_as_png
|
export_pdf_as_png
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -141,9 +139,9 @@ class TextAlignments(object):
|
||||||
|
|
||||||
def __init__(self, alignment_names):
|
def __init__(self, alignment_names):
|
||||||
# For each possible alignment, list of tuples coordinate/textlines
|
# For each possible alignment, list of tuples coordinate/textlines
|
||||||
self._textedges = {}
|
self._text_alignments = {}
|
||||||
for alignment_name in alignment_names:
|
for alignment_name in alignment_names:
|
||||||
self._textedges[alignment_name] = []
|
self._text_alignments[alignment_name] = []
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_new_text_alignment(coord, textline, align):
|
def _create_new_text_alignment(coord, textline, align):
|
||||||
|
|
@ -156,12 +154,12 @@ class TextAlignments(object):
|
||||||
"""Updates an existing text edge in the current dict.
|
"""Updates an existing text edge in the current dict.
|
||||||
"""
|
"""
|
||||||
coords = get_textline_coords(textline)
|
coords = get_textline_coords(textline)
|
||||||
for alignment, edge_array in self._textedges.items():
|
for alignment_id, alignment_array in self._text_alignments.items():
|
||||||
coord = coords[alignment]
|
coord = coords[alignment_id]
|
||||||
|
|
||||||
# Find the index of the closest existing element (or 0 if none)
|
# Find the index of the closest existing element (or 0 if none)
|
||||||
idx_closest = get_index_closest_point(
|
idx_closest = get_index_closest_point(
|
||||||
coord, edge_array, fn=lambda x: x.coord
|
coord, alignment_array, fn=lambda x: x.coord
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check if the edges before/after are close enough
|
# Check if the edges before/after are close enough
|
||||||
|
|
@ -169,17 +167,25 @@ class TextAlignments(object):
|
||||||
idx_insert = None
|
idx_insert = None
|
||||||
if idx_closest is None:
|
if idx_closest is None:
|
||||||
idx_insert = 0
|
idx_insert = 0
|
||||||
elif np.isclose(edge_array[idx_closest].coord, coord, atol=0.5):
|
elif np.isclose(
|
||||||
self._update_edge(edge_array[idx_closest], coord, textline)
|
alignment_array[idx_closest].coord,
|
||||||
elif edge_array[idx_closest].coord < coord:
|
coord,
|
||||||
|
atol=0.5
|
||||||
|
):
|
||||||
|
self._update_edge(
|
||||||
|
alignment_array[idx_closest],
|
||||||
|
coord,
|
||||||
|
textline
|
||||||
|
)
|
||||||
|
elif alignment_array[idx_closest].coord < coord:
|
||||||
idx_insert = idx_closest + 1
|
idx_insert = idx_closest + 1
|
||||||
else:
|
else:
|
||||||
idx_insert = idx_closest
|
idx_insert = idx_closest
|
||||||
if idx_insert is not None:
|
if idx_insert is not None:
|
||||||
new_edge = self._create_new_text_alignment(
|
new_alignment = self._create_new_text_alignment(
|
||||||
coord, textline, alignment
|
coord, textline, alignment_id
|
||||||
)
|
)
|
||||||
edge_array.insert(idx_insert, new_edge)
|
alignment_array.insert(idx_insert, new_alignment)
|
||||||
|
|
||||||
|
|
||||||
class TextEdges(TextAlignments):
|
class TextEdges(TextAlignments):
|
||||||
|
|
@ -201,7 +207,7 @@ class TextEdges(TextAlignments):
|
||||||
"""Adds a new text edge to the current dict.
|
"""Adds a new text edge to the current dict.
|
||||||
"""
|
"""
|
||||||
te = self._create_new_text_alignment(coord, textline, align)
|
te = self._create_new_text_alignment(coord, textline, align)
|
||||||
self._textedges[align].append(te)
|
self._text_alignments[align].append(te)
|
||||||
|
|
||||||
def _update_edge(self, edge, coord, textline):
|
def _update_edge(self, edge, coord, textline):
|
||||||
edge.update_coords(coord, textline, self.edge_tol)
|
edge.update_coords(coord, textline, self.edge_tol)
|
||||||
|
|
@ -221,15 +227,15 @@ class TextEdges(TextAlignments):
|
||||||
"""
|
"""
|
||||||
intersections_sum = {
|
intersections_sum = {
|
||||||
"left": sum(
|
"left": sum(
|
||||||
len(te.textlines) for te in self._textedges["left"]
|
len(te.textlines) for te in self._text_alignments["left"]
|
||||||
if te.is_valid
|
if te.is_valid
|
||||||
),
|
),
|
||||||
"right": sum(
|
"right": sum(
|
||||||
len(te.textlines) for te in self._textedges["right"]
|
len(te.textlines) for te in self._text_alignments["right"]
|
||||||
if te.is_valid
|
if te.is_valid
|
||||||
),
|
),
|
||||||
"middle": sum(
|
"middle": sum(
|
||||||
len(te.textlines) for te in self._textedges["middle"]
|
len(te.textlines) for te in self._text_alignments["middle"]
|
||||||
if te.is_valid
|
if te.is_valid
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
@ -240,7 +246,7 @@ class TextEdges(TextAlignments):
|
||||||
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
|
relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
|
||||||
return list(filter(
|
return list(filter(
|
||||||
lambda te: te.is_valid,
|
lambda te: te.is_valid,
|
||||||
self._textedges[relevant_align])
|
self._text_alignments[relevant_align])
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_table_areas(self, textlines, relevant_textedges):
|
def get_table_areas(self, textlines, relevant_textedges):
|
||||||
|
|
@ -443,9 +449,9 @@ class Table(object):
|
||||||
self.filename = None
|
self.filename = None
|
||||||
self.order = None
|
self.order = None
|
||||||
self.page = None
|
self.page = None
|
||||||
self.flavor = None # Flavor of the parser that generated the table
|
self.flavor = None # Flavor of the parser used
|
||||||
self.pdf_size = None # Dimensions of the original PDF page
|
self.pdf_size = None # Dimensions of the original PDF page
|
||||||
self.debug_info = None # Field holding debug data
|
self.parse_details = None # Field holding debug data
|
||||||
|
|
||||||
self._image = None
|
self._image = None
|
||||||
self._image_path = None # Temporary file to hold an image of the pdf
|
self._image_path = None # Temporary file to hold an image of the pdf
|
||||||
|
|
@ -485,31 +491,6 @@ class Table(object):
|
||||||
}
|
}
|
||||||
return report
|
return report
|
||||||
|
|
||||||
def record_parse_metadata(self, parser):
|
|
||||||
"""Record data about the origin of the table
|
|
||||||
"""
|
|
||||||
self.flavor = parser.id
|
|
||||||
self.filename = parser.filename
|
|
||||||
self.debug_info = parser.debug_info
|
|
||||||
pos_errors = parser.compute_parse_errors(self)
|
|
||||||
self.accuracy = compute_accuracy([[100, pos_errors]])
|
|
||||||
|
|
||||||
if parser.copy_text is not None:
|
|
||||||
self.copy_spanning_text(parser.copy_text)
|
|
||||||
|
|
||||||
data = self.data
|
|
||||||
self.df = pd.DataFrame(data)
|
|
||||||
self.shape = self.df.shape
|
|
||||||
|
|
||||||
self.whitespace = compute_whitespace(data)
|
|
||||||
self.pdf_size = (parser.pdf_width, parser.pdf_height)
|
|
||||||
|
|
||||||
_text = []
|
|
||||||
_text.extend(
|
|
||||||
[(t.x0, t.y0, t.x1, t.y1) for t in parser.horizontal_text])
|
|
||||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in parser.vertical_text])
|
|
||||||
self._text = _text
|
|
||||||
|
|
||||||
def get_pdf_image(self):
|
def get_pdf_image(self):
|
||||||
"""Compute pdf image and cache it
|
"""Compute pdf image and cache it
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,18 @@
|
||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
|
bbox_from_str,
|
||||||
|
bbox_from_textlines,
|
||||||
|
compute_accuracy,
|
||||||
|
compute_whitespace,
|
||||||
get_text_objects,
|
get_text_objects,
|
||||||
get_table_index,
|
get_table_index,
|
||||||
text_in_bbox,
|
text_in_bbox,
|
||||||
bbox_from_str,
|
text_in_bbox_per_axis,
|
||||||
)
|
)
|
||||||
from ..core import Table
|
from ..core import Table
|
||||||
|
|
||||||
|
|
@ -42,7 +49,7 @@ class BaseParser(object):
|
||||||
self.t_bbox = None
|
self.t_bbox = None
|
||||||
|
|
||||||
# For plotting details of parsing algorithms
|
# For plotting details of parsing algorithms
|
||||||
self.debug_info = {} if debug else None
|
self.parse_details = {} if debug else None
|
||||||
|
|
||||||
def prepare_page_parse(self, filename, layout, dimensions,
|
def prepare_page_parse(self, filename, layout, dimensions,
|
||||||
page_idx, layout_kwargs):
|
page_idx, layout_kwargs):
|
||||||
|
|
@ -63,9 +70,9 @@ class BaseParser(object):
|
||||||
self.pdf_width, self.pdf_height = self.dimensions
|
self.pdf_width, self.pdf_height = self.dimensions
|
||||||
self.rootname, __ = os.path.splitext(self.filename)
|
self.rootname, __ = os.path.splitext(self.filename)
|
||||||
|
|
||||||
if self.debug_info is not None:
|
if self.parse_details is not None:
|
||||||
self.debug_info["table_regions"] = self.table_regions
|
self.parse_details["table_regions"] = self.table_regions
|
||||||
self.debug_info["table_areas"] = self.table_areas
|
self.parse_details["table_areas"] = self.table_areas
|
||||||
|
|
||||||
def _apply_regions_filter(self, textlines):
|
def _apply_regions_filter(self, textlines):
|
||||||
"""If regions have been specified, filter textlines to these regions.
|
"""If regions have been specified, filter textlines to these regions.
|
||||||
|
|
@ -194,6 +201,31 @@ class BaseParser(object):
|
||||||
|
|
||||||
return _tables
|
return _tables
|
||||||
|
|
||||||
|
def record_parse_metadata(self, table):
|
||||||
|
"""Record data about the origin of the table
|
||||||
|
"""
|
||||||
|
table.flavor = self.id
|
||||||
|
table.filename = self.filename
|
||||||
|
table.parse_details = self.parse_details
|
||||||
|
pos_errors = self.compute_parse_errors(table)
|
||||||
|
table.accuracy = compute_accuracy([[100, pos_errors]])
|
||||||
|
|
||||||
|
if self.copy_text is not None:
|
||||||
|
table.copy_spanning_text(self.copy_text)
|
||||||
|
|
||||||
|
data = table.data
|
||||||
|
table.df = pd.DataFrame(data)
|
||||||
|
table.shape = table.df.shape
|
||||||
|
|
||||||
|
table.whitespace = compute_whitespace(data)
|
||||||
|
table.pdf_size = (self.pdf_width, self.pdf_height)
|
||||||
|
|
||||||
|
_text = []
|
||||||
|
_text.extend(
|
||||||
|
[(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||||
|
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||||
|
table._text = _text
|
||||||
|
|
||||||
|
|
||||||
class TextBaseParser(BaseParser):
|
class TextBaseParser(BaseParser):
|
||||||
"""Base class for all text parsers.
|
"""Base class for all text parsers.
|
||||||
|
|
@ -211,15 +243,17 @@ class TextBaseParser(BaseParser):
|
||||||
edge_tol=50,
|
edge_tol=50,
|
||||||
row_tol=2,
|
row_tol=2,
|
||||||
column_tol=0,
|
column_tol=0,
|
||||||
|
debug=False,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
"stream",
|
parser_id,
|
||||||
table_regions=table_regions,
|
table_regions=table_regions,
|
||||||
table_areas=table_areas,
|
table_areas=table_areas,
|
||||||
split_text=split_text,
|
split_text=split_text,
|
||||||
strip_text=strip_text,
|
strip_text=strip_text,
|
||||||
flag_size=flag_size,
|
flag_size=flag_size,
|
||||||
|
debug=debug,
|
||||||
)
|
)
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self._validate_columns()
|
self._validate_columns()
|
||||||
|
|
@ -227,4 +261,271 @@ class TextBaseParser(BaseParser):
|
||||||
self.row_tol = row_tol
|
self.row_tol = row_tol
|
||||||
self.column_tol = column_tol
|
self.column_tol = column_tol
|
||||||
|
|
||||||
self.textedges = None
|
@staticmethod
|
||||||
|
def _group_rows(text, row_tol=2):
|
||||||
|
"""Groups PDFMiner text objects into rows vertically
|
||||||
|
within a tolerance.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
row_tol : int, optional (default: 2)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
|
||||||
|
"""
|
||||||
|
row_y = None
|
||||||
|
rows = []
|
||||||
|
temp = []
|
||||||
|
non_empty_text = [t for t in text if t.get_text().strip()]
|
||||||
|
for t in non_empty_text:
|
||||||
|
# is checking for upright necessary?
|
||||||
|
# if t.get_text().strip() and all([obj.upright \
|
||||||
|
# for obj in t._objs
|
||||||
|
# if type(obj) is LTChar]):
|
||||||
|
if row_y is None:
|
||||||
|
row_y = t.y0
|
||||||
|
elif not np.isclose(row_y, t.y0, atol=row_tol):
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
temp = []
|
||||||
|
# We update the row's bottom as we go, to be forgiving if there
|
||||||
|
# is a gradual change across multiple columns.
|
||||||
|
row_y = t.y0
|
||||||
|
temp.append(t)
|
||||||
|
rows.append(sorted(temp, key=lambda t: t.x0))
|
||||||
|
return rows
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _merge_columns(l, column_tol=0):
|
||||||
|
"""Merges column boundaries horizontally if they overlap
|
||||||
|
or lie within a tolerance.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
l : list
|
||||||
|
List of column x-coordinate tuples.
|
||||||
|
column_tol : int, optional (default: 0)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
merged : list
|
||||||
|
List of merged column x-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
|
merged = []
|
||||||
|
for higher in l:
|
||||||
|
if not merged:
|
||||||
|
merged.append(higher)
|
||||||
|
else:
|
||||||
|
lower = merged[-1]
|
||||||
|
if column_tol >= 0:
|
||||||
|
if higher[0] <= lower[1] or np.isclose(
|
||||||
|
higher[0], lower[1], atol=column_tol
|
||||||
|
):
|
||||||
|
upper_bound = max(lower[1], higher[1])
|
||||||
|
lower_bound = min(lower[0], higher[0])
|
||||||
|
merged[-1] = (lower_bound, upper_bound)
|
||||||
|
else:
|
||||||
|
merged.append(higher)
|
||||||
|
elif column_tol < 0:
|
||||||
|
if higher[0] <= lower[1]:
|
||||||
|
if np.isclose(higher[0], lower[1],
|
||||||
|
atol=abs(column_tol)):
|
||||||
|
merged.append(higher)
|
||||||
|
else:
|
||||||
|
upper_bound = max(lower[1], higher[1])
|
||||||
|
lower_bound = min(lower[0], higher[0])
|
||||||
|
merged[-1] = (lower_bound, upper_bound)
|
||||||
|
else:
|
||||||
|
merged.append(higher)
|
||||||
|
return merged
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
||||||
|
"""Makes row coordinates continuous. For the row to "touch"
|
||||||
|
we split the existing gap between them in half.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
rows_grouped : list
|
||||||
|
Two-dimensional list of text objects grouped into rows.
|
||||||
|
text_y_max : int
|
||||||
|
text_y_min : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
rows : list
|
||||||
|
List of continuous row y-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
|
row_boundaries = [
|
||||||
|
[
|
||||||
|
max(t.y1 for t in r),
|
||||||
|
min(t.y0 for t in r)
|
||||||
|
]
|
||||||
|
for r in rows_grouped
|
||||||
|
]
|
||||||
|
for i in range(0, len(row_boundaries)-1):
|
||||||
|
top_row = row_boundaries[i]
|
||||||
|
bottom_row = row_boundaries[i+1]
|
||||||
|
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
|
||||||
|
row_boundaries[0][0] = text_y_max
|
||||||
|
row_boundaries[-1][1] = text_y_min
|
||||||
|
return row_boundaries
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _add_columns(cols, text, row_tol):
|
||||||
|
"""Adds columns to existing list by taking into account
|
||||||
|
the text that lies outside the current column x-coordinates.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cols : list
|
||||||
|
List of column x-coordinate tuples.
|
||||||
|
text : list
|
||||||
|
List of PDFMiner text objects.
|
||||||
|
row_tol : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
cols : list
|
||||||
|
Updated list of column x-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if text:
|
||||||
|
text = TextBaseParser._group_rows(text, row_tol=row_tol)
|
||||||
|
elements = [len(r) for r in text]
|
||||||
|
new_cols = [
|
||||||
|
(t.x0, t.x1)
|
||||||
|
for r in text if len(r) == max(elements)
|
||||||
|
for t in r
|
||||||
|
]
|
||||||
|
cols.extend(TextBaseParser._merge_columns(sorted(new_cols)))
|
||||||
|
return cols
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _join_columns(cols, text_x_min, text_x_max):
|
||||||
|
"""Makes column coordinates continuous.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
cols : list
|
||||||
|
List of column x-coordinate tuples.
|
||||||
|
text_x_min : int
|
||||||
|
text_x_max : int
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
cols : list
|
||||||
|
Updated list of column x-coordinate tuples.
|
||||||
|
|
||||||
|
"""
|
||||||
|
cols = sorted(cols)
|
||||||
|
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
||||||
|
cols.insert(0, text_x_min)
|
||||||
|
cols.append(text_x_max)
|
||||||
|
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||||
|
return cols
|
||||||
|
|
||||||
|
def _validate_columns(self):
|
||||||
|
if self.table_areas is not None and self.columns is not None:
|
||||||
|
if len(self.table_areas) != len(self.columns):
|
||||||
|
raise ValueError("Length of table_areas and columns"
|
||||||
|
" should be equal")
|
||||||
|
|
||||||
|
def _generate_columns_and_rows(self, bbox, table_idx):
|
||||||
|
# select elements which lie within table_bbox
|
||||||
|
self.t_bbox = text_in_bbox_per_axis(
|
||||||
|
bbox,
|
||||||
|
self.horizontal_text,
|
||||||
|
self.vertical_text
|
||||||
|
)
|
||||||
|
|
||||||
|
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
||||||
|
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
||||||
|
)
|
||||||
|
rows_grouped = self._group_rows(
|
||||||
|
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
||||||
|
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
||||||
|
elements = [len(r) for r in rows_grouped]
|
||||||
|
|
||||||
|
if self.columns is not None and self.columns[table_idx] != "":
|
||||||
|
# user has to input boundary columns too
|
||||||
|
# take (0, pdf_width) by default
|
||||||
|
# similar to else condition
|
||||||
|
# len can't be 1
|
||||||
|
cols = self.columns[table_idx].split(",")
|
||||||
|
cols = [float(c) for c in cols]
|
||||||
|
cols.insert(0, text_x_min)
|
||||||
|
cols.append(text_x_max)
|
||||||
|
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
||||||
|
else:
|
||||||
|
# calculate mode of the list of number of elements in
|
||||||
|
# each row to guess the number of columns
|
||||||
|
ncols = max(set(elements), key=elements.count)
|
||||||
|
if ncols == 1:
|
||||||
|
# if mode is 1, the page usually contains no tables
|
||||||
|
# but there can be cases where the list can be skewed,
|
||||||
|
# try to remove all 1s from list in this case and
|
||||||
|
# see if the list contains elements, if yes, then use
|
||||||
|
# the mode after removing 1s
|
||||||
|
elements = list(filter(lambda x: x != 1, elements))
|
||||||
|
if elements:
|
||||||
|
ncols = max(set(elements), key=elements.count)
|
||||||
|
else:
|
||||||
|
warnings.warn(
|
||||||
|
"No tables found in table area {}"
|
||||||
|
.format(table_idx + 1)
|
||||||
|
)
|
||||||
|
cols = [
|
||||||
|
(t.x0, t.x1)
|
||||||
|
for r in rows_grouped
|
||||||
|
if len(r) == ncols
|
||||||
|
for t in r
|
||||||
|
]
|
||||||
|
cols = self._merge_columns(
|
||||||
|
sorted(cols),
|
||||||
|
column_tol=self.column_tol
|
||||||
|
)
|
||||||
|
inner_text = []
|
||||||
|
for i in range(1, len(cols)):
|
||||||
|
left = cols[i - 1][1]
|
||||||
|
right = cols[i][0]
|
||||||
|
inner_text.extend(
|
||||||
|
[
|
||||||
|
t
|
||||||
|
for direction in self.t_bbox
|
||||||
|
for t in self.t_bbox[direction]
|
||||||
|
if t.x0 > left and t.x1 < right
|
||||||
|
]
|
||||||
|
)
|
||||||
|
outer_text = [
|
||||||
|
t
|
||||||
|
for direction in self.t_bbox
|
||||||
|
for t in self.t_bbox[direction]
|
||||||
|
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
|
||||||
|
]
|
||||||
|
inner_text.extend(outer_text)
|
||||||
|
cols = self._add_columns(cols, inner_text, self.row_tol)
|
||||||
|
cols = self._join_columns(cols, text_x_min, text_x_max)
|
||||||
|
|
||||||
|
return cols, rows, None, None
|
||||||
|
|
||||||
|
def record_parse_metadata(self, table):
|
||||||
|
"""Record data about the origin of the table
|
||||||
|
"""
|
||||||
|
super().record_parse_metadata(table)
|
||||||
|
# for plotting
|
||||||
|
table._bbox = self.table_bbox
|
||||||
|
table._segments = None
|
||||||
|
|
||||||
|
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
||||||
|
table = self._initialize_new_table(table_idx, cols, rows)
|
||||||
|
table = table.set_all_edges()
|
||||||
|
self.record_parse_metadata(table)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,6 @@ from __future__ import division
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import copy
|
import copy
|
||||||
import warnings
|
|
||||||
|
|
||||||
from .base import TextBaseParser
|
from .base import TextBaseParser
|
||||||
from ..core import (
|
from ..core import (
|
||||||
|
|
@ -17,7 +16,6 @@ from ..core import (
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
bbox_from_str,
|
bbox_from_str,
|
||||||
text_in_bbox,
|
text_in_bbox,
|
||||||
text_in_bbox_per_axis,
|
|
||||||
bbox_from_textlines,
|
bbox_from_textlines,
|
||||||
distance_tl_to_bbox,
|
distance_tl_to_bbox,
|
||||||
find_columns_coordinates
|
find_columns_coordinates
|
||||||
|
|
@ -142,11 +140,11 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
|
||||||
|
|
||||||
class AlignmentCounter(object):
|
class AlignmentCounter(object):
|
||||||
"""
|
"""
|
||||||
Represents all textlines aligned with a textline for each alignment.
|
For a given textline, represent all other textlines aligned with it.
|
||||||
|
|
||||||
A textline can be vertically aligned with others by having matching left,
|
A textline can be vertically aligned with others if their bbox match on
|
||||||
right, or middle edge, and horizontally aligned by having matching top,
|
left, right, or middle coord, and horizontally aligned if they match top,
|
||||||
bottom, or center edge.
|
bottom, or center coord.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -210,15 +208,15 @@ class AlignmentCounter(object):
|
||||||
|
|
||||||
|
|
||||||
class TextNetworks(TextAlignments):
|
class TextNetworks(TextAlignments):
|
||||||
"""Text elements connected via both vertical (top, bottom, middle) and
|
"""Text elements connected by vertical AND horizontal alignments.
|
||||||
horizontal (left, right, and middle) alignments found on the PDF page.
|
|
||||||
The alignment dict has six keys based on the hor/vert alignments,
|
The alignment dict has six keys based on the hor/vert alignments,
|
||||||
and each key's value is a list of camelot.core.TextAlignment objects.
|
and each key's value is a list of camelot.core.TextAlignment objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__(ALL_ALIGNMENTS)
|
super().__init__(ALL_ALIGNMENTS)
|
||||||
# For each textline, dictionary "edge type" to
|
# For each textline, dictionary "alignment type" to
|
||||||
# "number of textlines aligned"
|
# "number of textlines aligned"
|
||||||
self._textlines_alignments = {}
|
self._textlines_alignments = {}
|
||||||
|
|
||||||
|
|
@ -226,10 +224,10 @@ class TextNetworks(TextAlignments):
|
||||||
edge.register_aligned_textline(textline, coord)
|
edge.register_aligned_textline(textline, coord)
|
||||||
|
|
||||||
def _register_all_text_lines(self, textlines):
|
def _register_all_text_lines(self, textlines):
|
||||||
"""Add all textlines to our edge repository to
|
"""Add all textlines to our network repository to
|
||||||
identify alignments.
|
identify alignments.
|
||||||
"""
|
"""
|
||||||
# Identify all the edge alignments
|
# Identify all the alignments
|
||||||
for tl in textlines:
|
for tl in textlines:
|
||||||
if len(tl.get_text().strip()) > 0:
|
if len(tl.get_text().strip()) > 0:
|
||||||
self._register_textline(tl)
|
self._register_textline(tl)
|
||||||
|
|
@ -237,7 +235,7 @@ class TextNetworks(TextAlignments):
|
||||||
def _compute_alignment_counts(self):
|
def _compute_alignment_counts(self):
|
||||||
"""Build a dictionary textline -> alignment object.
|
"""Build a dictionary textline -> alignment object.
|
||||||
"""
|
"""
|
||||||
for align_id, textedges in self._textedges.items():
|
for align_id, textedges in self._text_alignments.items():
|
||||||
for textedge in textedges:
|
for textedge in textedges:
|
||||||
for textline in textedge.textlines:
|
for textline in textedge.textlines:
|
||||||
alignments = self._textlines_alignments.get(
|
alignments = self._textlines_alignments.get(
|
||||||
|
|
@ -254,8 +252,8 @@ class TextNetworks(TextAlignments):
|
||||||
the core table.
|
the core table.
|
||||||
"""
|
"""
|
||||||
h_gaps, v_gaps = [], []
|
h_gaps, v_gaps = [], []
|
||||||
for align_id in self._textedges:
|
for align_id in self._text_alignments:
|
||||||
edge_array = self._textedges[align_id]
|
edge_array = self._text_alignments[align_id]
|
||||||
gaps = []
|
gaps = []
|
||||||
vertical = align_id in HORIZONTAL_ALIGNMENTS
|
vertical = align_id in HORIZONTAL_ALIGNMENTS
|
||||||
sort_function = (lambda tl: tl.y0) \
|
sort_function = (lambda tl: tl.y0) \
|
||||||
|
|
@ -299,7 +297,7 @@ class TextNetworks(TextAlignments):
|
||||||
removed_singletons = True
|
removed_singletons = True
|
||||||
while removed_singletons:
|
while removed_singletons:
|
||||||
removed_singletons = False
|
removed_singletons = False
|
||||||
for alignment_id, textalignments in self._textedges.items():
|
for alignment_id, textalignments in self._text_alignments.items():
|
||||||
# For each alignment edge, remove items if they are singletons
|
# For each alignment edge, remove items if they are singletons
|
||||||
# either horizontally or vertically
|
# either horizontally or vertically
|
||||||
for ta in textalignments:
|
for ta in textalignments:
|
||||||
|
|
@ -313,7 +311,7 @@ class TextNetworks(TextAlignments):
|
||||||
self._textlines_alignments = {}
|
self._textlines_alignments = {}
|
||||||
self._compute_alignment_counts()
|
self._compute_alignment_counts()
|
||||||
|
|
||||||
def _most_connected_textline(self):
|
def most_connected_textline(self):
|
||||||
""" Retrieve the textline that is most connected across vertical and
|
""" Retrieve the textline that is most connected across vertical and
|
||||||
horizontal axis.
|
horizontal axis.
|
||||||
|
|
||||||
|
|
@ -340,7 +338,7 @@ class TextNetworks(TextAlignments):
|
||||||
# alignments across horizontal and vertical axis.
|
# alignments across horizontal and vertical axis.
|
||||||
# It will serve as a reference axis along which to collect the average
|
# It will serve as a reference axis along which to collect the average
|
||||||
# spacing between rows/cols.
|
# spacing between rows/cols.
|
||||||
most_aligned_tl = self._most_connected_textline()
|
most_aligned_tl = self.most_connected_textline()
|
||||||
if most_aligned_tl is None:
|
if most_aligned_tl is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -378,7 +376,7 @@ class TextNetworks(TextAlignments):
|
||||||
)
|
)
|
||||||
return gaps_hv
|
return gaps_hv
|
||||||
|
|
||||||
def _build_bbox_candidate(self, gaps_hv, debug_info=None):
|
def _build_bbox_candidate(self, gaps_hv, parse_details=None):
|
||||||
""" Seed the process with the textline with the highest alignment
|
""" Seed the process with the textline with the highest alignment
|
||||||
score, then expand the bbox with textlines within threshold.
|
score, then expand the bbox with textlines within threshold.
|
||||||
|
|
||||||
|
|
@ -387,7 +385,7 @@ class TextNetworks(TextAlignments):
|
||||||
gaps_hv : tuple
|
gaps_hv : tuple
|
||||||
The maximum distance allowed to consider surrounding lines/columns
|
The maximum distance allowed to consider surrounding lines/columns
|
||||||
as part of the same table.
|
as part of the same table.
|
||||||
debug_info : array (optional)
|
parse_details : array (optional)
|
||||||
Optional parameter array, in which to store extra information
|
Optional parameter array, in which to store extra information
|
||||||
to help later visualization of the table creation.
|
to help later visualization of the table creation.
|
||||||
"""
|
"""
|
||||||
|
|
@ -396,23 +394,23 @@ class TextNetworks(TextAlignments):
|
||||||
# It will serve both as a starting point for the table boundary
|
# It will serve both as a starting point for the table boundary
|
||||||
# search, and as a way to estimate the average spacing between
|
# search, and as a way to estimate the average spacing between
|
||||||
# rows/cols.
|
# rows/cols.
|
||||||
most_aligned_tl = self._most_connected_textline()
|
most_aligned_tl = self.most_connected_textline()
|
||||||
|
|
||||||
# Calculate the 75th percentile of the horizontal/vertical
|
# Calculate the 75th percentile of the horizontal/vertical
|
||||||
# gaps between textlines. Use this as a reference for a threshold
|
# gaps between textlines. Use this as a reference for a threshold
|
||||||
# to not exceed while looking for table boundaries.
|
# to not exceed while looking for table boundaries.
|
||||||
max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
|
max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
|
||||||
|
|
||||||
if debug_info is not None:
|
if parse_details is not None:
|
||||||
# Store debug info
|
# Store debug info
|
||||||
debug_info_search = {
|
parse_details_search = {
|
||||||
"max_h_gap": max_h_gap,
|
"max_h_gap": max_h_gap,
|
||||||
"max_v_gap": max_v_gap,
|
"max_v_gap": max_v_gap,
|
||||||
"iterations": []
|
"iterations": []
|
||||||
}
|
}
|
||||||
debug_info.append(debug_info_search)
|
parse_details.append(parse_details_search)
|
||||||
else:
|
else:
|
||||||
debug_info_search = None
|
parse_details_search = None
|
||||||
|
|
||||||
MINIMUM_TEXTLINES_IN_TABLE = 6
|
MINIMUM_TEXTLINES_IN_TABLE = 6
|
||||||
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
|
||||||
|
|
@ -426,9 +424,9 @@ class TextNetworks(TextAlignments):
|
||||||
tls_in_bbox = [most_aligned_tl]
|
tls_in_bbox = [most_aligned_tl]
|
||||||
last_bbox = None
|
last_bbox = None
|
||||||
while last_bbox != bbox:
|
while last_bbox != bbox:
|
||||||
if debug_info_search is not None:
|
if parse_details_search is not None:
|
||||||
# Store debug info
|
# Store debug info
|
||||||
debug_info_search["iterations"].append(bbox)
|
parse_details_search["iterations"].append(bbox)
|
||||||
|
|
||||||
last_bbox = bbox
|
last_bbox = bbox
|
||||||
# Go through all remaining textlines, expand our bbox
|
# Go through all remaining textlines, expand our bbox
|
||||||
|
|
@ -461,35 +459,6 @@ class TextNetworks(TextAlignments):
|
||||||
self._register_all_text_lines(textlines)
|
self._register_all_text_lines(textlines)
|
||||||
self._compute_alignment_counts()
|
self._compute_alignment_counts()
|
||||||
|
|
||||||
def plot_alignments(self, ax):
|
|
||||||
"""Displays a visualization of the alignments as currently computed.
|
|
||||||
"""
|
|
||||||
# FRHTODO: This is too busy and doesn't plot lines
|
|
||||||
most_aligned_tl = sorted(
|
|
||||||
self._textlines_alignments.keys(),
|
|
||||||
key=lambda textline:
|
|
||||||
self._textlines_alignments[textline].alignment_score(),
|
|
||||||
reverse=True
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
ax.add_patch(
|
|
||||||
patches.Rectangle(
|
|
||||||
(most_aligned_tl.x0, most_aligned_tl.y0),
|
|
||||||
most_aligned_tl.x1 - most_aligned_tl.x0,
|
|
||||||
most_aligned_tl.y1 - most_aligned_tl.y0,
|
|
||||||
color="red",
|
|
||||||
alpha=0.5
|
|
||||||
)
|
|
||||||
)
|
|
||||||
for tl, alignments in self._textlines_alignments.items():
|
|
||||||
ax.text(
|
|
||||||
tl.x0 - 5,
|
|
||||||
tl.y0 - 5,
|
|
||||||
f"{alignments.max_h_count()}x{alignments.max_v_count()}",
|
|
||||||
fontsize=5,
|
|
||||||
color="black"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Hybrid(TextBaseParser):
|
class Hybrid(TextBaseParser):
|
||||||
"""Hybrid method of parsing looks for spaces between text
|
"""Hybrid method of parsing looks for spaces between text
|
||||||
|
|
@ -555,190 +524,9 @@ class Hybrid(TextBaseParser):
|
||||||
edge_tol=edge_tol,
|
edge_tol=edge_tol,
|
||||||
row_tol=row_tol,
|
row_tol=row_tol,
|
||||||
column_tol=column_tol,
|
column_tol=column_tol,
|
||||||
|
debug=debug,
|
||||||
)
|
)
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
|
||||||
@staticmethod
|
|
||||||
def _group_rows(text, row_tol=2):
|
|
||||||
"""Groups PDFMiner text objects into rows vertically
|
|
||||||
within a tolerance.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
text : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
row_tol : int, optional (default: 2)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rows : list
|
|
||||||
Two-dimensional list of text objects grouped into rows.
|
|
||||||
|
|
||||||
"""
|
|
||||||
row_y = None
|
|
||||||
rows = []
|
|
||||||
temp = []
|
|
||||||
non_empty_text = [t for t in text if t.get_text().strip()]
|
|
||||||
for t in non_empty_text:
|
|
||||||
# is checking for upright necessary?
|
|
||||||
# if t.get_text().strip() and all([obj.upright \
|
|
||||||
# for obj in t._objs
|
|
||||||
# if type(obj) is LTChar]):
|
|
||||||
if row_y is None:
|
|
||||||
row_y = t.y0
|
|
||||||
elif not np.isclose(row_y, t.y0, atol=row_tol):
|
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
|
||||||
temp = []
|
|
||||||
# We update the row's bottom as we go, to be forgiving if there
|
|
||||||
# is a gradual change across multiple columns.
|
|
||||||
row_y = t.y0
|
|
||||||
temp.append(t)
|
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
|
||||||
return rows
|
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
|
||||||
@staticmethod
|
|
||||||
def _merge_columns(l, column_tol=0):
|
|
||||||
"""Merges column boundaries horizontally if they overlap
|
|
||||||
or lie within a tolerance.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
l : list
|
|
||||||
List of column x-coordinate tuples.
|
|
||||||
column_tol : int, optional (default: 0)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
merged : list
|
|
||||||
List of merged column x-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
merged = []
|
|
||||||
for higher in l:
|
|
||||||
if not merged:
|
|
||||||
merged.append(higher)
|
|
||||||
else:
|
|
||||||
lower = merged[-1]
|
|
||||||
if column_tol >= 0:
|
|
||||||
if higher[0] <= lower[1] or np.isclose(
|
|
||||||
higher[0], lower[1], atol=column_tol
|
|
||||||
):
|
|
||||||
upper_bound = max(lower[1], higher[1])
|
|
||||||
lower_bound = min(lower[0], higher[0])
|
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
|
||||||
else:
|
|
||||||
merged.append(higher)
|
|
||||||
elif column_tol < 0:
|
|
||||||
if higher[0] <= lower[1]:
|
|
||||||
if np.isclose(higher[0], lower[1],
|
|
||||||
atol=abs(column_tol)):
|
|
||||||
merged.append(higher)
|
|
||||||
else:
|
|
||||||
upper_bound = max(lower[1], higher[1])
|
|
||||||
lower_bound = min(lower[0], higher[0])
|
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
|
||||||
else:
|
|
||||||
merged.append(higher)
|
|
||||||
return merged
|
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
|
||||||
@staticmethod
|
|
||||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
|
||||||
"""Makes row coordinates continuous. For the row to "touch"
|
|
||||||
we split the existing gap between them in half.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
rows_grouped : list
|
|
||||||
Two-dimensional list of text objects grouped into rows.
|
|
||||||
text_y_max : int
|
|
||||||
text_y_min : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rows : list
|
|
||||||
List of continuous row y-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
row_boundaries = [
|
|
||||||
[
|
|
||||||
max(t.y1 for t in r),
|
|
||||||
min(t.y0 for t in r)
|
|
||||||
]
|
|
||||||
for r in rows_grouped
|
|
||||||
]
|
|
||||||
for i in range(0, len(row_boundaries)-1):
|
|
||||||
top_row = row_boundaries[i]
|
|
||||||
bottom_row = row_boundaries[i+1]
|
|
||||||
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
|
|
||||||
row_boundaries[0][0] = text_y_max
|
|
||||||
row_boundaries[-1][1] = text_y_min
|
|
||||||
return row_boundaries
|
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
|
||||||
@staticmethod
|
|
||||||
def _add_columns(cols, text, row_tol):
|
|
||||||
"""Add columns to existing list by taking into account
|
|
||||||
the text that lies outside the current column x-coordinates.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
cols : list
|
|
||||||
List of column x-coordinate tuples.
|
|
||||||
text : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
ytol : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
cols : list
|
|
||||||
Updated list of column x-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
if text:
|
|
||||||
text = Hybrid._group_rows(text, row_tol=row_tol)
|
|
||||||
elements = [len(r) for r in text]
|
|
||||||
new_cols = [
|
|
||||||
(t.x0, t.x1)
|
|
||||||
for r in text if len(r) == max(elements)
|
|
||||||
for t in r
|
|
||||||
]
|
|
||||||
cols.extend(Hybrid._merge_columns(sorted(new_cols)))
|
|
||||||
return cols
|
|
||||||
|
|
||||||
# FRHTODO: Check if needed, refactor with Stream
|
|
||||||
@staticmethod
|
|
||||||
def _join_columns(cols, text_x_min, text_x_max):
|
|
||||||
"""Makes column coordinates continuous.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
cols : list
|
|
||||||
List of column x-coordinate tuples.
|
|
||||||
text_x_min : int
|
|
||||||
text_y_max : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
cols : list
|
|
||||||
Updated list of column x-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
cols = sorted(cols)
|
|
||||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
|
||||||
cols.insert(0, text_x_min)
|
|
||||||
cols.append(text_x_max)
|
|
||||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
|
||||||
return cols
|
|
||||||
|
|
||||||
# FRHTODO: Check is needed, refactor with Stream
|
|
||||||
def _validate_columns(self):
|
|
||||||
if self.table_areas is not None and self.columns is not None:
|
|
||||||
if len(self.table_areas) != len(self.columns):
|
|
||||||
raise ValueError("Length of table_areas and columns"
|
|
||||||
" should be equal")
|
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
if self.table_areas is not None:
|
if self.table_areas is not None:
|
||||||
table_bbox = {}
|
table_bbox = {}
|
||||||
|
|
@ -756,25 +544,21 @@ class Hybrid(TextBaseParser):
|
||||||
|
|
||||||
textlines_processed = {}
|
textlines_processed = {}
|
||||||
self.table_bbox = {}
|
self.table_bbox = {}
|
||||||
if self.debug_info is not None:
|
if self.parse_details is not None:
|
||||||
debug_info_edges_searches = []
|
parse_details_network_searches = []
|
||||||
self.debug_info["edges_searches"] = debug_info_edges_searches
|
self.parse_details["network_searches"] = \
|
||||||
debug_info_bboxes_searches = []
|
parse_details_network_searches
|
||||||
self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
|
parse_details_bbox_searches = []
|
||||||
|
self.parse_details["bbox_searches"] = parse_details_bbox_searches
|
||||||
else:
|
else:
|
||||||
debug_info_edges_searches = None
|
parse_details_network_searches = None
|
||||||
debug_info_bboxes_searches = None
|
parse_details_bbox_searches = None
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
self.textedges = TextNetworks()
|
text_network = TextNetworks()
|
||||||
self.textedges.generate(textlines)
|
text_network.generate(textlines)
|
||||||
self.textedges._remove_unconnected_edges()
|
text_network._remove_unconnected_edges()
|
||||||
if debug_info_edges_searches is not None:
|
gaps_hv = text_network._compute_plausible_gaps()
|
||||||
# Preserve the current edge calculation for display debugging
|
|
||||||
debug_info_edges_searches.append(
|
|
||||||
copy.deepcopy(self.textedges)
|
|
||||||
)
|
|
||||||
gaps_hv = self.textedges._compute_plausible_gaps()
|
|
||||||
if gaps_hv is None:
|
if gaps_hv is None:
|
||||||
return None
|
return None
|
||||||
# edge_tol instructions override the calculated vertical gap
|
# edge_tol instructions override the calculated vertical gap
|
||||||
|
|
@ -782,13 +566,19 @@ class Hybrid(TextBaseParser):
|
||||||
gaps_hv[0],
|
gaps_hv[0],
|
||||||
gaps_hv[1] if self.edge_tol is None else self.edge_tol
|
gaps_hv[1] if self.edge_tol is None else self.edge_tol
|
||||||
)
|
)
|
||||||
bbox = self.textedges._build_bbox_candidate(
|
bbox = text_network._build_bbox_candidate(
|
||||||
edge_tol_hv,
|
edge_tol_hv,
|
||||||
debug_info=debug_info_bboxes_searches
|
parse_details=parse_details_bbox_searches
|
||||||
)
|
)
|
||||||
if bbox is None:
|
if bbox is None:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if parse_details_network_searches is not None:
|
||||||
|
# Preserve the current edge calculation for display debugging
|
||||||
|
parse_details_network_searches.append(
|
||||||
|
copy.deepcopy(text_network)
|
||||||
|
)
|
||||||
|
|
||||||
# Get all the textlines that are at least 50% in the box
|
# Get all the textlines that are at least 50% in the box
|
||||||
tls_in_bbox = text_in_bbox(bbox, textlines)
|
tls_in_bbox = text_in_bbox(bbox, textlines)
|
||||||
|
|
||||||
|
|
@ -808,10 +598,10 @@ class Hybrid(TextBaseParser):
|
||||||
gaps_hv[1]
|
gaps_hv[1]
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.debug_info is not None:
|
if self.parse_details is not None:
|
||||||
if "col_searches" not in self.debug_info:
|
if "col_searches" not in self.parse_details:
|
||||||
self.debug_info["col_searches"] = []
|
self.parse_details["col_searches"] = []
|
||||||
self.debug_info["col_searches"].append({
|
self.parse_details["col_searches"].append({
|
||||||
"core_bbox": bbox,
|
"core_bbox": bbox,
|
||||||
"cols_anchors": cols_anchors,
|
"cols_anchors": cols_anchors,
|
||||||
"expanded_bbox": expanded_bbox
|
"expanded_bbox": expanded_bbox
|
||||||
|
|
@ -826,95 +616,3 @@ class Hybrid(TextBaseParser):
|
||||||
lambda tl: tl not in textlines_processed,
|
lambda tl: tl not in textlines_processed,
|
||||||
textlines
|
textlines
|
||||||
))
|
))
|
||||||
|
|
||||||
# FRHTODO: Check is needed, refactor with Stream
|
|
||||||
def _generate_columns_and_rows(self, bbox, table_idx):
|
|
||||||
# select elements which lie within table_bbox
|
|
||||||
self.t_bbox = text_in_bbox_per_axis(
|
|
||||||
bbox,
|
|
||||||
self.horizontal_text,
|
|
||||||
self.vertical_text
|
|
||||||
)
|
|
||||||
|
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
|
||||||
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
|
||||||
)
|
|
||||||
rows_grouped = self._group_rows(
|
|
||||||
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
|
||||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
|
||||||
elements = [len(r) for r in rows_grouped]
|
|
||||||
|
|
||||||
if self.columns is not None and self.columns[table_idx] != "":
|
|
||||||
# user has to input boundary columns too
|
|
||||||
# take (0, pdf_width) by default
|
|
||||||
# similar to else condition
|
|
||||||
# len can't be 1
|
|
||||||
cols = self.columns[table_idx].split(",")
|
|
||||||
cols = [float(c) for c in cols]
|
|
||||||
cols.insert(0, text_x_min)
|
|
||||||
cols.append(text_x_max)
|
|
||||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
|
||||||
else:
|
|
||||||
# calculate mode of the list of number of elements in
|
|
||||||
# each row to guess the number of columns
|
|
||||||
ncols = max(set(elements), key=elements.count)
|
|
||||||
if ncols == 1:
|
|
||||||
# if mode is 1, the page usually contains not tables
|
|
||||||
# but there can be cases where the list can be skewed,
|
|
||||||
# try to remove all 1s from list in this case and
|
|
||||||
# see if the list contains elements, if yes, then use
|
|
||||||
# the mode after removing 1s
|
|
||||||
elements = list(filter(lambda x: x != 1, elements))
|
|
||||||
if elements:
|
|
||||||
ncols = max(set(elements), key=elements.count)
|
|
||||||
else:
|
|
||||||
warnings.warn(
|
|
||||||
"No tables found in table area {}"
|
|
||||||
.format(table_idx + 1)
|
|
||||||
)
|
|
||||||
cols = [
|
|
||||||
(t.x0, t.x1)
|
|
||||||
for r in rows_grouped
|
|
||||||
if len(r) == ncols
|
|
||||||
for t in r
|
|
||||||
]
|
|
||||||
cols = self._merge_columns(
|
|
||||||
sorted(cols),
|
|
||||||
column_tol=self.column_tol
|
|
||||||
)
|
|
||||||
inner_text = []
|
|
||||||
for i in range(1, len(cols)):
|
|
||||||
left = cols[i - 1][1]
|
|
||||||
right = cols[i][0]
|
|
||||||
inner_text.extend(
|
|
||||||
[
|
|
||||||
t
|
|
||||||
for direction in self.t_bbox
|
|
||||||
for t in self.t_bbox[direction]
|
|
||||||
if t.x0 > left and t.x1 < right
|
|
||||||
]
|
|
||||||
)
|
|
||||||
outer_text = [
|
|
||||||
t
|
|
||||||
for direction in self.t_bbox
|
|
||||||
for t in self.t_bbox[direction]
|
|
||||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
|
|
||||||
]
|
|
||||||
inner_text.extend(outer_text)
|
|
||||||
cols = self._add_columns(cols, inner_text, self.row_tol)
|
|
||||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
|
||||||
|
|
||||||
return cols, rows, None, None
|
|
||||||
|
|
||||||
# FRHTODO: Check is needed, refactor with Stream
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
|
||||||
table = self._initialize_new_table(table_idx, cols, rows)
|
|
||||||
table = table.set_all_edges()
|
|
||||||
table.record_parse_metadata(self)
|
|
||||||
|
|
||||||
# for plotting
|
|
||||||
table._bbox = self.table_bbox
|
|
||||||
table._segments = None
|
|
||||||
table._textedges = self.textedges
|
|
||||||
|
|
||||||
return table
|
|
||||||
|
|
|
||||||
|
|
@ -168,6 +168,15 @@ class Lattice(BaseParser):
|
||||||
indices.append((r_idx, c_idx, text))
|
indices.append((r_idx, c_idx, text))
|
||||||
return indices
|
return indices
|
||||||
|
|
||||||
|
def record_parse_metadata(self, table):
|
||||||
|
"""Record data about the origin of the table
|
||||||
|
"""
|
||||||
|
super().record_parse_metadata(table)
|
||||||
|
# for plotting
|
||||||
|
table._image = self.pdf_image # Reuse the image used for calc
|
||||||
|
table._bbox_unscaled = self.table_bbox_unscaled
|
||||||
|
table._segments = (self.vertical_segments, self.horizontal_segments)
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
def scale_areas(areas):
|
def scale_areas(areas):
|
||||||
scaled_areas = []
|
scaled_areas = []
|
||||||
|
|
@ -293,12 +302,5 @@ class Lattice(BaseParser):
|
||||||
# set spanning cells to True
|
# set spanning cells to True
|
||||||
table = table.set_span()
|
table = table.set_span()
|
||||||
|
|
||||||
table.record_parse_metadata(self)
|
self.record_parse_metadata(table)
|
||||||
|
|
||||||
# for plotting
|
|
||||||
table._image = self.pdf_image # Reuse the image used for calc
|
|
||||||
table._bbox_unscaled = self.table_bbox_unscaled
|
|
||||||
table._segments = (self.vertical_segments, self.horizontal_segments)
|
|
||||||
table._textedges = None
|
|
||||||
|
|
||||||
return table
|
return table
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,12 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import warnings
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from .base import TextBaseParser
|
from .base import TextBaseParser
|
||||||
from ..core import TextEdges
|
from ..core import TextEdges
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
bbox_from_str,
|
bbox_from_str,
|
||||||
bbox_from_textlines,
|
text_in_bbox
|
||||||
text_in_bbox,
|
|
||||||
text_in_bbox_per_axis
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -79,182 +74,7 @@ class Stream(TextBaseParser):
|
||||||
row_tol=row_tol,
|
row_tol=row_tol,
|
||||||
column_tol=column_tol,
|
column_tol=column_tol,
|
||||||
)
|
)
|
||||||
|
self.textedges = []
|
||||||
@staticmethod
|
|
||||||
def _group_rows(text, row_tol=2):
|
|
||||||
"""Groups PDFMiner text objects into rows vertically
|
|
||||||
within a tolerance.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
text : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
row_tol : int, optional (default: 2)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rows : list
|
|
||||||
Two-dimensional list of text objects grouped into rows.
|
|
||||||
|
|
||||||
"""
|
|
||||||
row_y = None
|
|
||||||
rows = []
|
|
||||||
temp = []
|
|
||||||
non_empty_text = [t for t in text if t.get_text().strip()]
|
|
||||||
for t in non_empty_text:
|
|
||||||
# is checking for upright necessary?
|
|
||||||
# if t.get_text().strip() and all([obj.upright \
|
|
||||||
# for obj in t._objs
|
|
||||||
# if type(obj) is LTChar]):
|
|
||||||
if row_y is None:
|
|
||||||
row_y = t.y0
|
|
||||||
elif not np.isclose(row_y, t.y0, atol=row_tol):
|
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
|
||||||
temp = []
|
|
||||||
# We update the row's bottom as we go, to be forgiving if there
|
|
||||||
# is a gradual change across multiple columns.
|
|
||||||
row_y = t.y0
|
|
||||||
temp.append(t)
|
|
||||||
rows.append(sorted(temp, key=lambda t: t.x0))
|
|
||||||
return rows
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _merge_columns(l, column_tol=0):
|
|
||||||
"""Merges column boundaries horizontally if they overlap
|
|
||||||
or lie within a tolerance.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
l : list
|
|
||||||
List of column x-coordinate tuples.
|
|
||||||
column_tol : int, optional (default: 0)
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
merged : list
|
|
||||||
List of merged column x-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
merged = []
|
|
||||||
for higher in l:
|
|
||||||
if not merged:
|
|
||||||
merged.append(higher)
|
|
||||||
else:
|
|
||||||
lower = merged[-1]
|
|
||||||
if column_tol >= 0:
|
|
||||||
if higher[0] <= lower[1] or np.isclose(
|
|
||||||
higher[0], lower[1], atol=column_tol
|
|
||||||
):
|
|
||||||
upper_bound = max(lower[1], higher[1])
|
|
||||||
lower_bound = min(lower[0], higher[0])
|
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
|
||||||
else:
|
|
||||||
merged.append(higher)
|
|
||||||
elif column_tol < 0:
|
|
||||||
if higher[0] <= lower[1]:
|
|
||||||
if np.isclose(higher[0], lower[1],
|
|
||||||
atol=abs(column_tol)):
|
|
||||||
merged.append(higher)
|
|
||||||
else:
|
|
||||||
upper_bound = max(lower[1], higher[1])
|
|
||||||
lower_bound = min(lower[0], higher[0])
|
|
||||||
merged[-1] = (lower_bound, upper_bound)
|
|
||||||
else:
|
|
||||||
merged.append(higher)
|
|
||||||
return merged
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _join_rows(rows_grouped, text_y_max, text_y_min):
|
|
||||||
"""Makes row coordinates continuous. For the row to "touch"
|
|
||||||
we split the existing gap between them in half.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
rows_grouped : list
|
|
||||||
Two-dimensional list of text objects grouped into rows.
|
|
||||||
text_y_max : int
|
|
||||||
text_y_min : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rows : list
|
|
||||||
List of continuous row y-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
row_boundaries = [
|
|
||||||
[
|
|
||||||
max(t.y1 for t in r),
|
|
||||||
min(t.y0 for t in r)
|
|
||||||
]
|
|
||||||
for r in rows_grouped
|
|
||||||
]
|
|
||||||
for i in range(0, len(row_boundaries)-1):
|
|
||||||
top_row = row_boundaries[i]
|
|
||||||
bottom_row = row_boundaries[i+1]
|
|
||||||
top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
|
|
||||||
row_boundaries[0][0] = text_y_max
|
|
||||||
row_boundaries[-1][1] = text_y_min
|
|
||||||
return row_boundaries
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _add_columns(cols, text, row_tol):
|
|
||||||
"""Adds columns to existing list by taking into account
|
|
||||||
the text that lies outside the current column x-coordinates.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
cols : list
|
|
||||||
List of column x-coordinate tuples.
|
|
||||||
text : list
|
|
||||||
List of PDFMiner text objects.
|
|
||||||
ytol : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
cols : list
|
|
||||||
Updated list of column x-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
if text:
|
|
||||||
text = Stream._group_rows(text, row_tol=row_tol)
|
|
||||||
elements = [len(r) for r in text]
|
|
||||||
new_cols = [
|
|
||||||
(t.x0, t.x1)
|
|
||||||
for r in text if len(r) == max(elements)
|
|
||||||
for t in r
|
|
||||||
]
|
|
||||||
cols.extend(Stream._merge_columns(sorted(new_cols)))
|
|
||||||
return cols
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _join_columns(cols, text_x_min, text_x_max):
|
|
||||||
"""Makes column coordinates continuous.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
cols : list
|
|
||||||
List of column x-coordinate tuples.
|
|
||||||
text_x_min : int
|
|
||||||
text_y_max : int
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
cols : list
|
|
||||||
Updated list of column x-coordinate tuples.
|
|
||||||
|
|
||||||
"""
|
|
||||||
cols = sorted(cols)
|
|
||||||
cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
|
|
||||||
cols.insert(0, text_x_min)
|
|
||||||
cols.append(text_x_max)
|
|
||||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
|
||||||
return cols
|
|
||||||
|
|
||||||
def _validate_columns(self):
|
|
||||||
if self.table_areas is not None and self.columns is not None:
|
|
||||||
if len(self.table_areas) != len(self.columns):
|
|
||||||
raise ValueError("Length of table_areas and columns"
|
|
||||||
" should be equal")
|
|
||||||
|
|
||||||
def _nurminen_table_detection(self, textlines):
|
def _nurminen_table_detection(self, textlines):
|
||||||
"""A general implementation of the table detection algorithm
|
"""A general implementation of the table detection algorithm
|
||||||
|
|
@ -281,8 +101,13 @@ class Stream(TextBaseParser):
|
||||||
|
|
||||||
return table_bbox
|
return table_bbox
|
||||||
|
|
||||||
|
def record_parse_metadata(self, table):
|
||||||
|
"""Record data about the origin of the table
|
||||||
|
"""
|
||||||
|
super().record_parse_metadata(table)
|
||||||
|
table._textedges = self.textedges
|
||||||
|
|
||||||
def _generate_table_bbox(self):
|
def _generate_table_bbox(self):
|
||||||
self.textedges = []
|
|
||||||
if self.table_areas is None:
|
if self.table_areas is None:
|
||||||
hor_text = self.horizontal_text
|
hor_text = self.horizontal_text
|
||||||
if self.table_regions is not None:
|
if self.table_regions is not None:
|
||||||
|
|
@ -300,93 +125,3 @@ class Stream(TextBaseParser):
|
||||||
for area_str in self.table_areas:
|
for area_str in self.table_areas:
|
||||||
table_bbox[bbox_from_str(area_str)] = None
|
table_bbox[bbox_from_str(area_str)] = None
|
||||||
self.table_bbox = table_bbox
|
self.table_bbox = table_bbox
|
||||||
|
|
||||||
def _generate_columns_and_rows(self, bbox, table_idx):
|
|
||||||
# select elements which lie within table_bbox
|
|
||||||
self.t_bbox = text_in_bbox_per_axis(
|
|
||||||
bbox,
|
|
||||||
self.horizontal_text,
|
|
||||||
self.vertical_text
|
|
||||||
)
|
|
||||||
|
|
||||||
text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
|
|
||||||
self.t_bbox["horizontal"] + self.t_bbox["vertical"]
|
|
||||||
)
|
|
||||||
rows_grouped = self._group_rows(
|
|
||||||
self.t_bbox["horizontal"], row_tol=self.row_tol)
|
|
||||||
rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
|
|
||||||
elements = [len(r) for r in rows_grouped]
|
|
||||||
|
|
||||||
if self.columns is not None and self.columns[table_idx] != "":
|
|
||||||
# user has to input boundary columns too
|
|
||||||
# take (0, pdf_width) by default
|
|
||||||
# similar to else condition
|
|
||||||
# len can't be 1
|
|
||||||
cols = self.columns[table_idx].split(",")
|
|
||||||
cols = [float(c) for c in cols]
|
|
||||||
cols.insert(0, text_x_min)
|
|
||||||
cols.append(text_x_max)
|
|
||||||
cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
|
|
||||||
else:
|
|
||||||
# calculate mode of the list of number of elements in
|
|
||||||
# each row to guess the number of columns
|
|
||||||
ncols = max(set(elements), key=elements.count)
|
|
||||||
if ncols == 1:
|
|
||||||
# if mode is 1, the page usually contains not tables
|
|
||||||
# but there can be cases where the list can be skewed,
|
|
||||||
# try to remove all 1s from list in this case and
|
|
||||||
# see if the list contains elements, if yes, then use
|
|
||||||
# the mode after removing 1s
|
|
||||||
elements = list(filter(lambda x: x != 1, elements))
|
|
||||||
if elements:
|
|
||||||
ncols = max(set(elements), key=elements.count)
|
|
||||||
else:
|
|
||||||
warnings.warn(
|
|
||||||
"No tables found in table area {}"
|
|
||||||
.format(table_idx + 1)
|
|
||||||
)
|
|
||||||
cols = [
|
|
||||||
(t.x0, t.x1)
|
|
||||||
for r in rows_grouped
|
|
||||||
if len(r) == ncols
|
|
||||||
for t in r
|
|
||||||
]
|
|
||||||
cols = self._merge_columns(
|
|
||||||
sorted(cols),
|
|
||||||
column_tol=self.column_tol
|
|
||||||
)
|
|
||||||
inner_text = []
|
|
||||||
for i in range(1, len(cols)):
|
|
||||||
left = cols[i - 1][1]
|
|
||||||
right = cols[i][0]
|
|
||||||
inner_text.extend(
|
|
||||||
[
|
|
||||||
t
|
|
||||||
for direction in self.t_bbox
|
|
||||||
for t in self.t_bbox[direction]
|
|
||||||
if t.x0 > left and t.x1 < right
|
|
||||||
]
|
|
||||||
)
|
|
||||||
outer_text = [
|
|
||||||
t
|
|
||||||
for direction in self.t_bbox
|
|
||||||
for t in self.t_bbox[direction]
|
|
||||||
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
|
|
||||||
]
|
|
||||||
inner_text.extend(outer_text)
|
|
||||||
cols = self._add_columns(cols, inner_text, self.row_tol)
|
|
||||||
cols = self._join_columns(cols, text_x_min, text_x_max)
|
|
||||||
|
|
||||||
return cols, rows, None, None
|
|
||||||
|
|
||||||
def _generate_table(self, table_idx, cols, rows, **kwargs):
|
|
||||||
table = self._initialize_new_table(table_idx, cols, rows)
|
|
||||||
table = table.set_all_edges()
|
|
||||||
table.record_parse_metadata(self)
|
|
||||||
|
|
||||||
# for plotting
|
|
||||||
table._bbox = self.table_bbox
|
|
||||||
table._segments = None
|
|
||||||
table._textedges = self.textedges
|
|
||||||
|
|
||||||
return table
|
|
||||||
|
|
|
||||||
|
|
@ -87,9 +87,9 @@ def draw_parse_constraints(table, ax):
|
||||||
ax : matplotlib.axes.Axes
|
ax : matplotlib.axes.Axes
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if table.debug_info:
|
if table.parse_details:
|
||||||
# Display a bbox per region
|
# Display a bbox per region
|
||||||
for region_str in table.debug_info["table_regions"] or []:
|
for region_str in table.parse_details["table_regions"] or []:
|
||||||
draw_labeled_bbox(
|
draw_labeled_bbox(
|
||||||
ax, bbox_from_str(region_str),
|
ax, bbox_from_str(region_str),
|
||||||
"region: ({region_str})".format(region_str=region_str),
|
"region: ({region_str})".format(region_str=region_str),
|
||||||
|
|
@ -99,7 +99,7 @@ def draw_parse_constraints(table, ax):
|
||||||
label_pos="bottom,right"
|
label_pos="bottom,right"
|
||||||
)
|
)
|
||||||
# Display a bbox per area
|
# Display a bbox per area
|
||||||
for area_str in table.debug_info["table_areas"] or []:
|
for area_str in table.parse_details["table_areas"] or []:
|
||||||
draw_labeled_bbox(
|
draw_labeled_bbox(
|
||||||
ax, bbox_from_str(area_str),
|
ax, bbox_from_str(area_str),
|
||||||
"area: ({area_str})".format(area_str=area_str),
|
"area: ({area_str})".format(area_str=area_str),
|
||||||
|
|
@ -294,8 +294,27 @@ class PlotMethods(object):
|
||||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||||
|
|
||||||
if table.flavor == "hybrid":
|
if table.flavor == "hybrid":
|
||||||
# FRHTODO: Clean this up
|
for text_network in table.parse_details["network_searches"]:
|
||||||
table.debug_info["edges_searches"][0].plot_alignments(ax)
|
# FRHTODO: This is too busy and doesn't plot lines
|
||||||
|
most_connected_tl = text_network.most_connected_textline()
|
||||||
|
|
||||||
|
ax.add_patch(
|
||||||
|
patches.Rectangle(
|
||||||
|
(most_connected_tl.x0, most_connected_tl.y0),
|
||||||
|
most_connected_tl.x1 - most_connected_tl.x0,
|
||||||
|
most_connected_tl.y1 - most_connected_tl.y0,
|
||||||
|
color="red",
|
||||||
|
alpha=0.5
|
||||||
|
)
|
||||||
|
)
|
||||||
|
for tl, alignments in text_network._textlines_alignments.items():
|
||||||
|
ax.text(
|
||||||
|
tl.x0 - 5,
|
||||||
|
tl.y0 - 5,
|
||||||
|
f"{alignments.max_h_count()}x{alignments.max_v_count()}",
|
||||||
|
fontsize=5,
|
||||||
|
color="black"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
for te in table._textedges:
|
for te in table._textedges:
|
||||||
ax.plot([te.coord, te.coord], [te.y0, te.y1])
|
ax.plot([te.coord, te.coord], [te.y0, te.y1])
|
||||||
|
|
@ -372,10 +391,10 @@ class PlotMethods(object):
|
||||||
draw_pdf(table, ax)
|
draw_pdf(table, ax)
|
||||||
draw_parse_constraints(table, ax)
|
draw_parse_constraints(table, ax)
|
||||||
|
|
||||||
if table.debug_info is None:
|
if table.parse_details is None:
|
||||||
return fig
|
return fig
|
||||||
debug_info = table.debug_info
|
parse_details = table.parse_details
|
||||||
for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
|
for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
|
||||||
max_h_gap = bbox_search["max_h_gap"]
|
max_h_gap = bbox_search["max_h_gap"]
|
||||||
max_v_gap = bbox_search["max_v_gap"]
|
max_v_gap = bbox_search["max_v_gap"]
|
||||||
iterations = bbox_search["iterations"]
|
iterations = bbox_search["iterations"]
|
||||||
|
|
@ -403,7 +422,7 @@ class PlotMethods(object):
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
for box_id, col_search in enumerate(debug_info["col_searches"]):
|
for box_id, col_search in enumerate(parse_details["col_searches"]):
|
||||||
draw_labeled_bbox(
|
draw_labeled_bbox(
|
||||||
ax, col_search["expanded_bbox"],
|
ax, col_search["expanded_bbox"],
|
||||||
"box body + header #{box_id}".format(
|
"box body + header #{box_id}".format(
|
||||||
|
|
@ -422,10 +441,5 @@ class PlotMethods(object):
|
||||||
linewidth=2,
|
linewidth=2,
|
||||||
label_pos="bottom,left"
|
label_pos="bottom,left"
|
||||||
)
|
)
|
||||||
# self.debug_info["col_searches"].append({
|
|
||||||
# "core_bbox": bbox,
|
|
||||||
# "cols_anchors": cols_anchors,
|
|
||||||
# "expanded_bbox": expanded_bbox
|
|
||||||
# })
|
|
||||||
|
|
||||||
return fig
|
return fig
|
||||||
|
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 105 KiB After Width: | Height: | Size: 105 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 197 KiB After Width: | Height: | Size: 192 KiB |
Loading…
Reference in New Issue