Prep for vertical text improvements

plot.text shows vertical text in red
_generate_columns_and_rows split between hybrid and stream
pull/153/head
Frh 2020-04-28 11:46:12 -07:00
parent c51c24a416
commit 6add19ae27
12 changed files with 250 additions and 151 deletions

View File

@ -171,11 +171,10 @@ class TextAlignments():
idx_insert = None idx_insert = None
if idx_closest is None: if idx_closest is None:
idx_insert = 0 idx_insert = 0
elif np.isclose( # Note: np.isclose is slow!
alignment_array[idx_closest].coord, elif coord - 0.5 < \
coord, alignment_array[idx_closest].coord < \
atol=0.5 coord + 0.5:
):
self._update_alignment( self._update_alignment(
alignment_array[idx_closest], alignment_array[idx_closest],
coord, coord,
@ -461,6 +460,7 @@ class Table():
self._image_path = None # Temporary file to hold an image of the pdf self._image_path = None # Temporary file to hold an image of the pdf
self._text = [] # List of text box coordinates self._text = [] # List of text box coordinates
self.textlines = [] # List of actual textlines on the page
def __repr__(self): def __repr__(self):
return "<{} shape={}>".format(self.__class__.__name__, self.shape) return "<{} shape={}>".format(self.__class__.__name__, self.shape)

View File

@ -8,13 +8,11 @@ import pandas as pd
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
bbox_from_textlines,
compute_accuracy, compute_accuracy,
compute_whitespace, compute_whitespace,
get_text_objects, get_text_objects,
get_table_index, get_table_index,
text_in_bbox, text_in_bbox,
text_in_bbox_per_axis,
) )
from ..core import Table from ..core import Table
@ -243,6 +241,7 @@ class BaseParser():
[(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text]) [(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text]) _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text table._text = _text
table.textlines = self.horizontal_text + self.vertical_text
class TextBaseParser(BaseParser): class TextBaseParser(BaseParser):
@ -454,84 +453,6 @@ class TextBaseParser(BaseParser):
raise ValueError("Length of table_areas and columns" raise ValueError("Length of table_areas and columns"
" should be equal") " should be equal")
def _generate_columns_and_rows(self, bbox, table_idx):
    """Compute the column and row boundaries of the table inside bbox.

    Rows are obtained by grouping the horizontal textlines found in
    ``bbox``. Columns are either built from the user-supplied separators
    in ``self.columns[table_idx]``, or guessed from the rows whose number
    of textlines equals the most frequent count.

    As a side effect, ``self.t_bbox`` is set to the textlines found in
    ``bbox``, as returned by ``text_in_bbox_per_axis``.

    Parameters
    ----------
    bbox : tuple
        Table bounding box, passed as-is to ``text_in_bbox_per_axis``.
    table_idx : int
        Index of the table area: selects the entry of ``self.columns``
        and is reported (1-based) in the warning message.

    Returns
    -------
    cols : list
        Column boundaries. (left, right) tuples when built from user
        separators or from the empty-rows fallback, otherwise the value
        returned by ``self._join_columns``.
    rows : list
        Row boundaries, as returned by ``self._join_rows``.
    None, None
        Two placeholders, always ``None`` here.

    """
    # Select the textlines which lie within the table bbox.
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )

    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    )

    # Only horizontal textlines take part in the row grouping.
    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(r) for r in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # User-supplied separators: the text extent provides the two
        # outer boundaries, and consecutive values are paired into
        # (left, right) columns.
        separators = [float(c) for c in self.columns[table_idx].split(",")]
        edges = [text_x_min] + separators + [text_x_max]
        cols = list(zip(edges, edges[1:]))
    elif not elements:
        # No row was found (e.g. the area only holds vertical text).
        # max() over an empty set would raise ValueError, so fall back
        # to a single column spanning the whole text extent.
        cols = [(text_x_min, text_x_max)]
    else:
        # Guess the number of columns as the most frequent number of
        # textlines per row (the mode of `elements`).
        ncols = max(set(elements), key=elements.count)
        if ncols == 1:
            # A mode of 1 usually means the page contains no table, but
            # the counts can be skewed: drop the 1s and, if anything is
            # left, use the mode of the remaining counts.
            multi_elements = [count for count in elements if count != 1]
            if multi_elements:
                ncols = max(
                    set(multi_elements), key=multi_elements.count
                )
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        cols = [
            (t.x0, t.x1)
            for r in rows_grouped
            if len(r) == ncols
            for t in r
        ]
        cols = self._merge_columns(
            sorted(cols),
            column_tol=self.column_tol
        )

        # Textlines of both directions, flattened once for the scans below.
        all_textlines = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # Textlines lying strictly inside a gap between two consecutive
        # columns...
        inner_text = []
        for i in range(1, len(cols)):
            left = cols[i - 1][1]
            right = cols[i][0]
            inner_text.extend(
                t for t in all_textlines
                if t.x0 > left and t.x1 < right
            )
        # ...and textlines lying entirely outside the outermost columns.
        outer_text = [
            t for t in all_textlines
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        ]
        inner_text.extend(outer_text)
        cols = self._add_columns(cols, inner_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)

    return cols, rows, None, None
def record_parse_metadata(self, table): def record_parse_metadata(self, table):
"""Record data about the origin of the table """Record data about the origin of the table
""" """

View File

@ -6,6 +6,7 @@ from __future__ import division
import copy import copy
import math import math
import numpy as np import numpy as np
import warnings
from .base import TextBaseParser from .base import TextBaseParser
from ..core import ( from ..core import (
@ -20,7 +21,8 @@ from ..utils import (
text_in_bbox, text_in_bbox,
bbox_from_textlines, bbox_from_textlines,
distance_tl_to_bbox, distance_tl_to_bbox,
find_columns_coordinates find_columns_coordinates,
text_in_bbox_per_axis,
) )
# maximum number of columns over which a header can spread # maximum number of columns over which a header can spread
@ -574,3 +576,91 @@ class Hybrid(TextBaseParser):
lambda tl: tl not in textlines_processed, lambda tl: tl not in textlines_processed,
textlines textlines
)) ))
def _generate_columns_and_rows(self, bbox, table_idx):
    """Compute the column and row boundaries of the table inside bbox.

    Rows are obtained by grouping the non-blank horizontal textlines
    found in ``bbox``. Columns are either built from the user-supplied
    separators in ``self.columns[table_idx]``, or guessed from the rows
    whose number of textlines equals the most frequent count.

    As a side effect, ``self.t_bbox`` is set to the textlines found in
    ``bbox``, as returned by ``text_in_bbox_per_axis``.

    Parameters
    ----------
    bbox : tuple
        Table bounding box, passed as-is to ``text_in_bbox_per_axis``.
    table_idx : int
        Index of the table area: selects the entry of ``self.columns``
        and is reported (1-based) in the warning message.

    Returns
    -------
    cols : list
        Column boundaries. (left, right) tuples when built from user
        separators or from the empty-rows fallback, otherwise the value
        returned by ``self._join_columns``.
    rows : list
        Row boundaries, as returned by ``self._join_rows``.
    None, None
        Two placeholders, always ``None`` here.

    """
    # Select the textlines which lie within the table bbox.
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )

    # Keep the horizontal textlines holding more than whitespace.
    # Vertical textlines are not included here (yet).
    all_tls = [
        tl for tl in self.t_bbox["horizontal"]
        if tl.get_text().strip()
    ]
    # NOTE(review): the behaviour of bbox_from_textlines on an empty list
    # is not visible from here -- confirm it can be unpacked.
    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        all_tls
    )
    # FRHTODO:
    # This algorithm takes the horizontal textlines in the bbox, and groups
    # them into rows based on their bottom y0.
    # That's wrong: it misses the vertical items, and misses out on all
    # the alignment identification work we've done earlier.
    rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(r) for r in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # User-supplied separators: the text extent provides the two
        # outer boundaries, and consecutive values are paired into
        # (left, right) columns.
        separators = [float(c) for c in self.columns[table_idx].split(",")]
        edges = [text_x_min] + separators + [text_x_max]
        cols = list(zip(edges, edges[1:]))
    elif not elements:
        # No row was found: max() over an empty set would raise
        # ValueError, so fall back to a single column spanning the
        # whole text extent.
        cols = [(text_x_min, text_x_max)]
    else:
        # Guess the number of columns as the most frequent number of
        # textlines per row (the mode of `elements`).
        ncols = max(set(elements), key=elements.count)
        if ncols == 1:
            # A mode of 1 usually means the page contains no table, but
            # the counts can be skewed: drop the 1s and, if anything is
            # left, use the mode of the remaining counts.
            multi_elements = [count for count in elements if count != 1]
            if multi_elements:
                ncols = max(
                    set(multi_elements), key=multi_elements.count
                )
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        cols = [
            (t.x0, t.x1)
            for r in rows_grouped
            if len(r) == ncols
            for t in r
        ]
        cols = self._merge_columns(
            sorted(cols),
            column_tol=self.column_tol
        )

        # Textlines of both directions, flattened once for the scans below.
        all_textlines = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # Textlines lying strictly inside a gap between two consecutive
        # columns...
        inner_text = []
        for i in range(1, len(cols)):
            left = cols[i - 1][1]
            right = cols[i][0]
            inner_text.extend(
                t for t in all_textlines
                if t.x0 > left and t.x1 < right
            )
        # ...and textlines lying entirely outside the outermost columns.
        outer_text = [
            t for t in all_textlines
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        ]
        inner_text.extend(outer_text)
        cols = self._add_columns(cols, inner_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)

    return cols, rows, None, None

View File

@ -2,11 +2,15 @@
from __future__ import division from __future__ import division
import warnings
from .base import TextBaseParser from .base import TextBaseParser
from ..core import TextEdges from ..core import TextEdges
from ..utils import ( from ..utils import (
bbox_from_str, bbox_from_str,
text_in_bbox bbox_from_textlines,
text_in_bbox,
text_in_bbox_per_axis,
) )
@ -124,3 +128,86 @@ class Stream(TextBaseParser):
for area_str in self.table_areas: for area_str in self.table_areas:
table_bbox[bbox_from_str(area_str)] = None table_bbox[bbox_from_str(area_str)] = None
self.table_bbox = table_bbox self.table_bbox = table_bbox
def _generate_columns_and_rows(self, bbox, table_idx):
    """Compute the column and row boundaries of the table inside bbox.

    Rows are obtained by grouping the horizontal textlines found in
    ``bbox``. Columns are either built from the user-supplied separators
    in ``self.columns[table_idx]``, or guessed from the rows whose number
    of textlines equals the most frequent count.

    As a side effect, ``self.t_bbox`` is set to the textlines found in
    ``bbox``, as returned by ``text_in_bbox_per_axis``.

    Parameters
    ----------
    bbox : tuple
        Table bounding box, passed as-is to ``text_in_bbox_per_axis``.
    table_idx : int
        Index of the table area: selects the entry of ``self.columns``
        and is reported (1-based) in the warning message.

    Returns
    -------
    cols : list
        Column boundaries. (left, right) tuples when built from user
        separators or from the empty-rows fallback, otherwise the value
        returned by ``self._join_columns``.
    rows : list
        Row boundaries, as returned by ``self._join_rows``.
    None, None
        Two placeholders, always ``None`` here.

    """
    # Select the textlines which lie within the table bbox.
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )

    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    )
    # FRHTODO:
    # This algorithm takes the horizontal textlines in the bbox, and groups
    # them into rows based on their bottom y0.
    # That's wrong: it misses the vertical items, and misses out on all
    # the alignment identification work we've done earlier.
    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(r) for r in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # User-supplied separators: the text extent provides the two
        # outer boundaries, and consecutive values are paired into
        # (left, right) columns.
        separators = [float(c) for c in self.columns[table_idx].split(",")]
        edges = [text_x_min] + separators + [text_x_max]
        cols = list(zip(edges, edges[1:]))
    elif not elements:
        # No row was found (e.g. the area only holds vertical text).
        # max() over an empty set would raise ValueError, so fall back
        # to a single column spanning the whole text extent.
        cols = [(text_x_min, text_x_max)]
    else:
        # Guess the number of columns as the most frequent number of
        # textlines per row (the mode of `elements`).
        ncols = max(set(elements), key=elements.count)
        if ncols == 1:
            # A mode of 1 usually means the page contains no table, but
            # the counts can be skewed: drop the 1s and, if anything is
            # left, use the mode of the remaining counts.
            multi_elements = [count for count in elements if count != 1]
            if multi_elements:
                ncols = max(
                    set(multi_elements), key=multi_elements.count
                )
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        cols = [
            (t.x0, t.x1)
            for r in rows_grouped
            if len(r) == ncols
            for t in r
        ]
        cols = self._merge_columns(
            sorted(cols),
            column_tol=self.column_tol
        )

        # Textlines of both directions, flattened once for the scans below.
        all_textlines = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # Textlines lying strictly inside a gap between two consecutive
        # columns...
        inner_text = []
        for i in range(1, len(cols)):
            left = cols[i - 1][1]
            right = cols[i][0]
            inner_text.extend(
                t for t in all_textlines
                if t.x0 > left and t.x1 < right
            )
        # ...and textlines lying entirely outside the outermost columns.
        outer_text = [
            t for t in all_textlines
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        ]
        inner_text.extend(outer_text)
        cols = self._add_columns(cols, inner_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)

    return cols, rows, None, None

View File

@ -8,7 +8,20 @@ except ImportError:
else: else:
_HAS_MPL = True _HAS_MPL = True
from .utils import (bbox_from_str, get_textline_coords) from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
from pdfminer.layout import (
LTTextLineVertical,
)
def extend_axe_lim(ax, bbox, margin=10):
    """Grow the limits of an axes so that a bbox (plus a margin) fits in.

    The current limits are only ever widened: the lower bounds can go
    down and the upper bounds can go up, never the other way round.

    Parameters
    ----------
    ax : object
        Axes-like object exposing get_xlim / get_ylim / set_xlim /
        set_ylim.
    bbox : tuple
        Box to include, indexed as (x0, y0, x1, y1).
    margin : number (optional, default: 10)
        Extra space kept around the bbox on every side.

    """
    cur_left, cur_right = ax.get_xlim()
    cur_bottom, cur_top = ax.get_ylim()

    new_left = min(cur_left, bbox[0] - margin)
    new_right = max(cur_right, bbox[2] + margin)
    ax.set_xlim(new_left, new_right)

    new_bottom = min(cur_bottom, bbox[1] - margin)
    new_top = max(cur_top, bbox[3] + margin)
    ax.set_ylim(new_bottom, new_top)
def draw_labeled_bbox( def draw_labeled_bbox(
@ -17,6 +30,8 @@ def draw_labeled_bbox(
linestyle="solid", linestyle="solid",
label_pos="top,left" label_pos="top,left"
): ):
"""Utility drawing function to draw a box with an associated text label
"""
ax.add_patch( ax.add_patch(
patches.Rectangle( patches.Rectangle(
(bbox[0], bbox[1]), (bbox[0], bbox[1]),
@ -83,32 +98,55 @@ def draw_parse_constraints(table, ax):
Parameters Parameters
---------- ----------
table : camelot.core.Table table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
ax : matplotlib.axes.Axes ax : matplotlib.axes.Axes (optional)
""" """
if table.parse_details: if table.parse_details:
# Display a bbox per region zone_constraints = {
for region_str in table.parse_details["table_regions"] or []: "region": "table_regions",
"area": "table_areas",
}
for zone_name, zone_id in zone_constraints.items():
# Display a bbox per region / area
for zone_str in table.parse_details[zone_id] or []:
draw_labeled_bbox( draw_labeled_bbox(
ax, bbox_from_str(region_str), ax, bbox_from_str(zone_str),
"region: ({region_str})".format(region_str=region_str), "{zone_name}: ({zone_str})".format(
zone_name=zone_name,
zone_str=zone_str
),
color="purple", color="purple",
linestyle="dotted", linestyle="dotted",
linewidth=1, linewidth=1,
label_pos="bottom,right" label_pos="bottom,right"
) )
# Display a bbox per area
for area_str in table.parse_details["table_areas"] or []:
draw_labeled_bbox( def draw_text(table, ax):
ax, bbox_from_str(area_str), """Draw text, horizontal in blue, vertical in red
"area: ({area_str})".format(area_str=area_str),
color="pink", Parameters
linestyle="dotted", ----------
linewidth=1, table : camelot.core.Table
label_pos="bottom,right" ax : matplotlib.axes.Axes (optional)
ax : matplotlib.axes.Axes
"""
bbox = bbox_from_textlines(table.textlines)
for t in table.textlines:
color = "red" if isinstance(t, LTTextLineVertical) else "blue"
ax.add_patch(
patches.Rectangle(
(t.x0, t.y0),
t.x1 - t.x0,
t.y1 - t.y0,
color=color,
alpha=0.2
) )
)
extend_axe_lim(ax, bbox)
def prepare_plot(table, ax=None, to_pdf_scale=True): def prepare_plot(table, ax=None, to_pdf_scale=True):
@ -188,20 +226,7 @@ class PlotMethods():
""" """
ax = prepare_plot(table, ax) ax = prepare_plot(table, ax)
xs, ys = [], [] draw_text(table, ax)
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
alpha=0.5
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
return ax.get_figure() return ax.get_figure()
@staticmethod @staticmethod
@ -255,18 +280,8 @@ class PlotMethods():
else: else:
table_bbox = {table._bbox: None} table_bbox = {table._bbox: None}
xs, ys = [], []
if not _FOR_LATTICE: if not _FOR_LATTICE:
for t in table._text: draw_text(table, ax)
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue",
alpha=0.5
)
)
for t in table_bbox.keys(): for t in table_bbox.keys():
ax.add_patch( ax.add_patch(
@ -276,10 +291,8 @@ class PlotMethods():
) )
) )
if not _FOR_LATTICE: if not _FOR_LATTICE:
xs.extend([t[0], t[2]]) extend_axe_lim(ax, t)
ys.extend([t[1], t[3]])
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
return ax.get_figure() return ax.get_figure()
@staticmethod @staticmethod
@ -297,19 +310,7 @@ class PlotMethods():
""" """
ax = prepare_plot(table, ax) ax = prepare_plot(table, ax)
xs, ys = [], [] draw_text(table, ax)
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue",
alpha=0.2
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
if table.flavor == "hybrid": if table.flavor == "hybrid":
for network in table.parse_details["network_searches"]: for network in table.parse_details["network_searches"]:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 105 KiB

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

After

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

After

Width:  |  Height:  |  Size: 90 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

After

Width:  |  Height:  |  Size: 111 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 71 KiB

After

Width:  |  Height:  |  Size: 59 KiB