Prep for vertical text improvements

plot.text shows vertical text in red
_generate_columns_and_rows split between hybrid and stream
pull/153/head
Frh 2020-04-28 11:46:12 -07:00
parent c51c24a416
commit 6add19ae27
12 changed files with 250 additions and 151 deletions

View File

@ -171,11 +171,10 @@ class TextAlignments():
idx_insert = None
if idx_closest is None:
idx_insert = 0
elif np.isclose(
alignment_array[idx_closest].coord,
coord,
atol=0.5
):
# Note: np.isclose is slow!
elif coord - 0.5 < \
alignment_array[idx_closest].coord < \
coord + 0.5:
self._update_alignment(
alignment_array[idx_closest],
coord,
@ -461,6 +460,7 @@ class Table():
self._image_path = None # Temporary file to hold an image of the pdf
self._text = [] # List of text box coordinates
self.textlines = [] # List of actual textlines on the page
def __repr__(self):
return "<{} shape={}>".format(self.__class__.__name__, self.shape)

View File

@ -8,13 +8,11 @@ import pandas as pd
from ..utils import (
bbox_from_str,
bbox_from_textlines,
compute_accuracy,
compute_whitespace,
get_text_objects,
get_table_index,
text_in_bbox,
text_in_bbox_per_axis,
)
from ..core import Table
@ -243,6 +241,7 @@ class BaseParser():
[(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
table._text = _text
table.textlines = self.horizontal_text + self.vertical_text
class TextBaseParser(BaseParser):
@ -454,84 +453,6 @@ class TextBaseParser(BaseParser):
raise ValueError("Length of table_areas and columns"
" should be equal")
def _generate_columns_and_rows(self, bbox, table_idx):
    """Work out the column and row boundaries for one table area.

    The textlines found inside ``bbox`` are stored on ``self.t_bbox``. The
    horizontal ones are grouped into rows. Columns are either built from
    the user-supplied separators in ``self.columns`` or guessed from the
    most frequent number of textlines per row.

    Parameters
    ----------
    bbox
        Table area, forwarded unchanged to ``text_in_bbox_per_axis``.
    table_idx : int
        Position of the table area; selects the matching entry of
        ``self.columns`` and is reported (1-based) in warnings.

    Returns
    -------
    tuple
        ``(cols, rows, None, None)``.

    """
    # select elements which lie within table_bbox
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )

    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    )

    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(r) for r in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # user has to input boundary columns too
        # take (0, pdf_width) by default
        # similar to else condition
        # len can't be 1
        cols = [float(c) for c in self.columns[table_idx].split(",")]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
    elif not elements:
        # No row could be built (e.g. the area only holds vertical text).
        # max() on an empty set would raise ValueError below, so fall back
        # to a single column spanning the horizontal extent of the text.
        cols = [(text_x_min, text_x_max)]
    else:
        # calculate mode of the list of number of elements in
        # each row to guess the number of columns
        ncols = max(set(elements), key=elements.count)
        if ncols == 1:
            # if mode is 1, the page usually contains no tables,
            # but there can be cases where the list is skewed:
            # remove all 1s from the list and, if anything is left,
            # use the mode of the remaining counts
            elements = [count for count in elements if count != 1]
            if elements:
                ncols = max(set(elements), key=elements.count)
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        cols = [
            (t.x0, t.x1)
            for r in rows_grouped
            if len(r) == ncols
            for t in r
        ]
        cols = self._merge_columns(
            sorted(cols),
            column_tol=self.column_tol
        )

        # Every textline of the area, whatever its direction; built once
        # instead of once per column gap.
        candidates = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # textlines lying strictly inside the gap between two columns
        inner_text = []
        for i in range(1, len(cols)):
            left = cols[i - 1][1]
            right = cols[i][0]
            inner_text.extend(
                t for t in candidates if t.x0 > left and t.x1 < right
            )
        # textlines lying outside of the first / last column
        inner_text.extend(
            t
            for t in candidates
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        )
        cols = self._add_columns(cols, inner_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)

    return cols, rows, None, None
def record_parse_metadata(self, table):
"""Record data about the origin of the table
"""

View File

@ -6,6 +6,7 @@ from __future__ import division
import copy
import math
import numpy as np
import warnings
from .base import TextBaseParser
from ..core import (
@ -20,7 +21,8 @@ from ..utils import (
text_in_bbox,
bbox_from_textlines,
distance_tl_to_bbox,
find_columns_coordinates
find_columns_coordinates,
text_in_bbox_per_axis,
)
# maximum number of columns over which a header can spread
@ -574,3 +576,91 @@ class Hybrid(TextBaseParser):
lambda tl: tl not in textlines_processed,
textlines
))
def _generate_columns_and_rows(self, bbox, table_idx):
    """Derive the column and row boundaries of the table lying in bbox.

    Only the non-blank horizontal textlines of the area are used to
    compute the text extent and the rows. Columns come from the
    user-supplied separators in ``self.columns`` when present, otherwise
    from the most common number of textlines per row. The textlines of the
    area are kept on ``self.t_bbox``.

    Parameters
    ----------
    bbox
        Table area, handed over as-is to ``text_in_bbox_per_axis``.
    table_idx : int
        Index of the table area, used to pick the entry of
        ``self.columns`` and (1-based) in the warning message.

    Returns
    -------
    tuple
        ``(cols, rows, None, None)``.

    """
    # select elements which lie within table_bbox
    self.t_bbox = text_in_bbox_per_axis(
        bbox, self.horizontal_text, self.vertical_text
    )

    # Blank textlines are ignored; vertical textlines are not considered
    # (yet) when measuring the text extent and building rows.
    all_tls = [
        tl
        for tl in self.t_bbox["horizontal"]  # + self.t_bbox["vertical"]
        if tl.get_text().strip()
    ]
    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        all_tls
    )

    # FRHTODO:
    # This algorithm takes the horizontal textlines in the bbox, and groups
    # them into rows based on their bottom y0.
    # That's wrong: it misses the vertical items, and misses out on all
    # the alignment identification work we've done earlier.
    rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(row) for row in rows_grouped]

    user_columns = None
    if self.columns is not None and self.columns[table_idx] != "":
        user_columns = self.columns[table_idx]

    if user_columns is not None:
        # The user gives the inner separators only; the text extent
        # supplies the two outer boundaries.
        separators = [float(c) for c in user_columns.split(",")]
        edges = [text_x_min] + separators + [text_x_max]
        cols = list(zip(edges[:-1], edges[1:]))
        return cols, rows, None, None

    # The mode of the per-row textline counts is the guessed column count.
    ncols = max(set(elements), key=elements.count)
    if ncols == 1:
        # A mode of 1 usually means there is no table, but the counts may
        # be skewed: retry with the 1s removed when anything is left.
        elements = [count for count in elements if count != 1]
        if not elements:
            warnings.warn(
                "No tables found in table area {}"
                .format(table_idx + 1)
            )
        else:
            ncols = max(set(elements), key=elements.count)

    spans = []
    for row in rows_grouped:
        if len(row) == ncols:
            spans.extend((t.x0, t.x1) for t in row)
    cols = self._merge_columns(sorted(spans), column_tol=self.column_tol)

    # Textlines of every direction found in the area.
    candidates = [
        t
        for direction in self.t_bbox
        for t in self.t_bbox[direction]
    ]

    # Textlines strictly inside a gap between two neighbouring columns...
    inner_text = []
    for previous_col, next_col in zip(cols[:-1], cols[1:]):
        left, right = previous_col[1], next_col[0]
        for t in candidates:
            if t.x0 > left and t.x1 < right:
                inner_text.append(t)
    # ...followed by the textlines outside of the first / last column.
    for t in candidates:
        if t.x0 > cols[-1][1] or t.x1 < cols[0][0]:
            inner_text.append(t)

    cols = self._add_columns(cols, inner_text, self.row_tol)
    cols = self._join_columns(cols, text_x_min, text_x_max)
    return cols, rows, None, None

View File

@ -2,11 +2,15 @@
from __future__ import division
import warnings
from .base import TextBaseParser
from ..core import TextEdges
from ..utils import (
bbox_from_str,
text_in_bbox
bbox_from_textlines,
text_in_bbox,
text_in_bbox_per_axis,
)
@ -124,3 +128,86 @@ class Stream(TextBaseParser):
for area_str in self.table_areas:
table_bbox[bbox_from_str(area_str)] = None
self.table_bbox = table_bbox
def _generate_columns_and_rows(self, bbox, table_idx):
    """Work out the column and row boundaries for one table area.

    The textlines found inside ``bbox`` are stored on ``self.t_bbox``. The
    horizontal ones are grouped into rows. Columns are either built from
    the user-supplied separators in ``self.columns`` or guessed from the
    most frequent number of textlines per row.

    Parameters
    ----------
    bbox
        Table area, forwarded unchanged to ``text_in_bbox_per_axis``.
    table_idx : int
        Position of the table area; selects the matching entry of
        ``self.columns`` and is reported (1-based) in warnings.

    Returns
    -------
    tuple
        ``(cols, rows, None, None)``.

    """
    # select elements which lie within table_bbox
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )

    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    )

    # FRHTODO:
    # This algorithm takes the horizontal textlines in the bbox, and groups
    # them into rows based on their bottom y0.
    # That's wrong: it misses the vertical items, and misses out on all
    # the alignment identification work we've done earlier.
    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(r) for r in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # user has to input boundary columns too
        # take (0, pdf_width) by default
        # similar to else condition
        # len can't be 1
        cols = [float(c) for c in self.columns[table_idx].split(",")]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
    elif not elements:
        # No row could be built (e.g. the area only holds vertical text).
        # max() on an empty set would raise ValueError below, so fall back
        # to a single column spanning the horizontal extent of the text.
        cols = [(text_x_min, text_x_max)]
    else:
        # calculate mode of the list of number of elements in
        # each row to guess the number of columns
        ncols = max(set(elements), key=elements.count)
        if ncols == 1:
            # if mode is 1, the page usually contains no tables,
            # but there can be cases where the list is skewed:
            # remove all 1s from the list and, if anything is left,
            # use the mode of the remaining counts
            elements = [count for count in elements if count != 1]
            if elements:
                ncols = max(set(elements), key=elements.count)
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        cols = [
            (t.x0, t.x1)
            for r in rows_grouped
            if len(r) == ncols
            for t in r
        ]
        cols = self._merge_columns(
            sorted(cols),
            column_tol=self.column_tol
        )

        # Every textline of the area, whatever its direction; built once
        # instead of once per column gap.
        candidates = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # textlines lying strictly inside the gap between two columns
        inner_text = []
        for i in range(1, len(cols)):
            left = cols[i - 1][1]
            right = cols[i][0]
            inner_text.extend(
                t for t in candidates if t.x0 > left and t.x1 < right
            )
        # textlines lying outside of the first / last column
        inner_text.extend(
            t
            for t in candidates
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        )
        cols = self._add_columns(cols, inner_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)

    return cols, rows, None, None

View File

@ -8,7 +8,20 @@ except ImportError:
else:
_HAS_MPL = True
from .utils import (bbox_from_str, get_textline_coords)
from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
from pdfminer.layout import (
LTTextLineVertical,
)
def extend_axe_lim(ax, bbox, margin=10):
    """Grow the limits of ax, if needed, so that they contain bbox.

    Parameters
    ----------
    ax
        Object exposing get_xlim / get_ylim / set_xlim / set_ylim.
    bbox
        Indexable as (x0, y0, x1, y1).
    margin
        Padding added around bbox before comparing with the current
        limits (default 10).

    """
    x_low, x_high = ax.get_xlim()
    y_low, y_high = ax.get_ylim()

    # The current limits are only ever widened, never shrunk.
    new_x_low = min(x_low, bbox[0] - margin)
    new_x_high = max(x_high, bbox[2] + margin)
    ax.set_xlim(new_x_low, new_x_high)

    new_y_low = min(y_low, bbox[1] - margin)
    new_y_high = max(y_high, bbox[3] + margin)
    ax.set_ylim(new_y_low, new_y_high)
def draw_labeled_bbox(
@ -17,6 +30,8 @@ def draw_labeled_bbox(
linestyle="solid",
label_pos="top,left"
):
"""Utility drawing function to draw a box with an associated text label
"""
ax.add_patch(
patches.Rectangle(
(bbox[0], bbox[1]),
@ -83,32 +98,55 @@ def draw_parse_constraints(table, ax):
Parameters
----------
table : camelot.core.Table
ax : matplotlib.axes.Axes (optional)
ax : matplotlib.axes.Axes
ax : matplotlib.axes.Axes (optional)
"""
if table.parse_details:
# Display a bbox per region
for region_str in table.parse_details["table_regions"] or []:
zone_constraints = {
"region": "table_regions",
"area": "table_areas",
}
for zone_name, zone_id in zone_constraints.items():
# Display a bbox per region / area
for zone_str in table.parse_details[zone_id] or []:
draw_labeled_bbox(
ax, bbox_from_str(region_str),
"region: ({region_str})".format(region_str=region_str),
ax, bbox_from_str(zone_str),
"{zone_name}: ({zone_str})".format(
zone_name=zone_name,
zone_str=zone_str
),
color="purple",
linestyle="dotted",
linewidth=1,
label_pos="bottom,right"
)
# Display a bbox per area
for area_str in table.parse_details["table_areas"] or []:
draw_labeled_bbox(
ax, bbox_from_str(area_str),
"area: ({area_str})".format(area_str=area_str),
color="pink",
linestyle="dotted",
linewidth=1,
label_pos="bottom,right"
def draw_text(table, ax):
    """Draw one translucent box per textline: horizontal in blue,
    vertical in red, then widen the axes to show all of the text.

    Parameters
    ----------
    table : camelot.core.Table
    ax : matplotlib.axes.Axes

    """
    text_bbox = bbox_from_textlines(table.textlines)
    for textline in table.textlines:
        if isinstance(textline, LTTextLineVertical):
            box_color = "red"
        else:
            box_color = "blue"
        box = patches.Rectangle(
            (textline.x0, textline.y0),
            textline.x1 - textline.x0,
            textline.y1 - textline.y0,
            color=box_color,
            alpha=0.2
        )
        ax.add_patch(box)
    # Make sure the axes limits include the extent of all the textlines.
    extend_axe_lim(ax, text_bbox)
def prepare_plot(table, ax=None, to_pdf_scale=True):
@ -188,20 +226,7 @@ class PlotMethods():
"""
ax = prepare_plot(table, ax)
xs, ys = [], []
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]),
t[2] - t[0],
t[3] - t[1],
alpha=0.5
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
draw_text(table, ax)
return ax.get_figure()
@staticmethod
@ -255,18 +280,8 @@ class PlotMethods():
else:
table_bbox = {table._bbox: None}
xs, ys = [], []
if not _FOR_LATTICE:
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue",
alpha=0.5
)
)
draw_text(table, ax)
for t in table_bbox.keys():
ax.add_patch(
@ -276,10 +291,8 @@ class PlotMethods():
)
)
if not _FOR_LATTICE:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
extend_axe_lim(ax, t)
return ax.get_figure()
@staticmethod
@ -297,19 +310,7 @@ class PlotMethods():
"""
ax = prepare_plot(table, ax)
xs, ys = [], []
for t in table._text:
xs.extend([t[0], t[2]])
ys.extend([t[1], t[3]])
ax.add_patch(
patches.Rectangle(
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
color="blue",
alpha=0.2
)
)
ax.set_xlim(min(xs) - 10, max(xs) + 10)
ax.set_ylim(min(ys) - 10, max(ys) + 10)
draw_text(table, ax)
if table.flavor == "hybrid":
for network in table.parse_details["network_searches"]:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 105 KiB

After

Width:  |  Height:  |  Size: 103 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

After

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

After

Width:  |  Height:  |  Size: 90 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

After

Width:  |  Height:  |  Size: 111 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 71 KiB

After

Width:  |  Height:  |  Size: 59 KiB