Prep for vertical text improvements
plot.text shows vertical text in red; _generate_columns_and_rows is split between hybrid and stream (pull/153/head)
|
|
@ -171,11 +171,10 @@ class TextAlignments():
|
|||
idx_insert = None
|
||||
if idx_closest is None:
|
||||
idx_insert = 0
|
||||
elif np.isclose(
|
||||
alignment_array[idx_closest].coord,
|
||||
coord,
|
||||
atol=0.5
|
||||
):
|
||||
# Note: np.isclose is slow!
|
||||
elif coord - 0.5 < \
|
||||
alignment_array[idx_closest].coord < \
|
||||
coord + 0.5:
|
||||
self._update_alignment(
|
||||
alignment_array[idx_closest],
|
||||
coord,
|
||||
|
|
@ -461,6 +460,7 @@ class Table():
|
|||
self._image_path = None # Temporary file to hold an image of the pdf
|
||||
|
||||
self._text = [] # List of text box coordinates
|
||||
self.textlines = [] # List of actual textlines on the page
|
||||
|
||||
def __repr__(self):
    """Return a short description made of the class name and the shape."""
    class_name = self.__class__.__name__
    return "<{} shape={}>".format(class_name, self.shape)
|
||||
|
|
|
|||
|
|
@ -8,13 +8,11 @@ import pandas as pd
|
|||
|
||||
from ..utils import (
|
||||
bbox_from_str,
|
||||
bbox_from_textlines,
|
||||
compute_accuracy,
|
||||
compute_whitespace,
|
||||
get_text_objects,
|
||||
get_table_index,
|
||||
text_in_bbox,
|
||||
text_in_bbox_per_axis,
|
||||
)
|
||||
from ..core import Table
|
||||
|
||||
|
|
@ -243,6 +241,7 @@ class BaseParser():
|
|||
[(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
|
||||
_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
|
||||
table._text = _text
|
||||
table.textlines = self.horizontal_text + self.vertical_text
|
||||
|
||||
|
||||
class TextBaseParser(BaseParser):
|
||||
|
|
@ -454,84 +453,6 @@ class TextBaseParser(BaseParser):
|
|||
raise ValueError("Length of table_areas and columns"
|
||||
" should be equal")
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
    """Guess the column and row boundaries of the table located in bbox.

    Parameters
    ----------
    bbox : tuple
        Table bounding box, handed over to text_in_bbox_per_axis.
    table_idx : int
        Index of the table being processed. Used to pick the matching
        entry of self.columns and to number the warning message.

    Returns
    -------
    tuple
        (cols, rows, None, None): cols is a list of (left, right)
        column boundaries, rows is the result of self._join_rows.

    """
    # select elements which lie within table_bbox
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )

    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    )
    # Rows are built from the horizontal textlines only.
    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(r) for r in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # user has to input boundary columns too
        # take (0, pdf_width) by default
        # similar to else condition
        # len can't be 1
        cols = [float(c) for c in self.columns[table_idx].split(",")]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        # Pair each boundary with the next one: (left, right) per column
        cols = list(zip(cols, cols[1:]))
    elif not elements:
        # No row was grouped (presumably an area holding only vertical
        # text -- verify against _group_rows). max() on an empty set
        # raises ValueError, so fall back to a single column spanning
        # the whole text extent.
        cols = [(text_x_min, text_x_max)]
    else:
        # calculate mode of the list of number of elements in
        # each row to guess the number of columns
        ncols = max(set(elements), key=elements.count)
        if ncols == 1:
            # if mode is 1, the page usually contains no tables
            # but there can be cases where the list can be skewed,
            # try to remove all 1s from list in this case and
            # see if the list contains elements, if yes, then use
            # the mode after removing 1s
            elements = [n for n in elements if n != 1]
            if elements:
                ncols = max(set(elements), key=elements.count)
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        # Column candidates come from the rows holding exactly ncols
        # textlines.
        cols = [
            (t.x0, t.x1)
            for r in rows_grouped
            if len(r) == ncols
            for t in r
        ]
        cols = self._merge_columns(
            sorted(cols),
            column_tol=self.column_tol
        )
        # Flatten the textlines once instead of once per column gap.
        all_text = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # Textlines lying strictly between two consecutive columns
        inner_text = []
        for i in range(1, len(cols)):
            left = cols[i - 1][1]
            right = cols[i][0]
            inner_text.extend(
                t for t in all_text if t.x0 > left and t.x1 < right
            )
        # Textlines lying outside of the first / last column
        outer_text = [
            t
            for t in all_text
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        ]
        inner_text.extend(outer_text)
        cols = self._add_columns(cols, inner_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)

    return cols, rows, None, None
|
||||
|
||||
def record_parse_metadata(self, table):
|
||||
"""Record data about the origin of the table
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from __future__ import division
|
|||
import copy
|
||||
import math
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from .base import TextBaseParser
|
||||
from ..core import (
|
||||
|
|
@ -20,7 +21,8 @@ from ..utils import (
|
|||
text_in_bbox,
|
||||
bbox_from_textlines,
|
||||
distance_tl_to_bbox,
|
||||
find_columns_coordinates
|
||||
find_columns_coordinates,
|
||||
text_in_bbox_per_axis,
|
||||
)
|
||||
|
||||
# maximum number of columns over which a header can spread
|
||||
|
|
@ -574,3 +576,91 @@ class Hybrid(TextBaseParser):
|
|||
lambda tl: tl not in textlines_processed,
|
||||
textlines
|
||||
))
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
    """Derive column and row boundaries for the table located in bbox.

    Parameters
    ----------
    bbox : tuple
        Table bounding box, handed over to text_in_bbox_per_axis.
    table_idx : int
        Index of the table being processed. Used to pick the matching
        entry of self.columns and to number the warning message.

    Returns
    -------
    tuple
        (cols, rows, None, None)

    """
    # Keep only the textlines located inside the table bbox
    self.t_bbox = text_in_bbox_per_axis(
        bbox, self.horizontal_text, self.vertical_text
    )

    # Blank textlines are dropped; only horizontal textlines are used
    # for now, vertical ones are left out (see FRHTODO below).
    all_tls = [
        tl
        for tl in self.t_bbox["horizontal"]
        if len(tl.get_text().strip()) > 0
    ]
    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        all_tls
    )
    # FRHTODO:
    # This algorithm takes the horizontal textlines in the bbox, and groups
    # them into rows based on their bottom y0.
    # That's wrong: it misses the vertical items, and misses out on all
    # the alignment identification work we've done earlier.
    rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    row_sizes = [len(row) for row in rows_grouped]

    if self.columns is None or self.columns[table_idx] == "":
        # No user-defined columns: the most frequent number of
        # textlines per row is the guess for the number of columns.
        ncols = max(set(row_sizes), key=row_sizes.count)
        if ncols == 1:
            # A mode of 1 usually means there is no table, but the
            # distribution may just be skewed: retry without the 1s.
            row_sizes = [size for size in row_sizes if size != 1]
            if not row_sizes:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
            else:
                ncols = max(set(row_sizes), key=row_sizes.count)

        candidates = []
        for row in rows_grouped:
            if len(row) == ncols:
                candidates.extend((tl.x0, tl.x1) for tl in row)
        cols = self._merge_columns(
            sorted(candidates), column_tol=self.column_tol
        )

        every_tl = []
        for direction in self.t_bbox:
            every_tl.extend(self.t_bbox[direction])

        # Textlines sitting strictly inside a gap between two columns...
        stray_text = []
        for previous_col, next_col in zip(cols, cols[1:]):
            gap_start = previous_col[1]
            gap_end = next_col[0]
            stray_text.extend(
                tl for tl in every_tl
                if tl.x0 > gap_start and tl.x1 < gap_end
            )
        # ...followed by those lying beyond the outermost columns
        first_left = cols[0][0]
        last_right = cols[-1][1]
        stray_text.extend(
            tl for tl in every_tl
            if tl.x0 > last_right or tl.x1 < first_left
        )
        cols = self._add_columns(cols, stray_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)
    else:
        # User-defined separators: the text extent provides the two
        # outer boundaries, consecutive values are paired into columns.
        boundaries = [text_x_min]
        boundaries.extend(
            float(c) for c in self.columns[table_idx].split(",")
        )
        boundaries.append(text_x_max)
        cols = list(zip(boundaries, boundaries[1:]))

    return cols, rows, None, None
|
||||
|
|
|
|||
|
|
@ -2,11 +2,15 @@
|
|||
|
||||
from __future__ import division
|
||||
|
||||
import warnings
|
||||
|
||||
from .base import TextBaseParser
|
||||
from ..core import TextEdges
|
||||
from ..utils import (
|
||||
bbox_from_str,
|
||||
text_in_bbox
|
||||
bbox_from_textlines,
|
||||
text_in_bbox,
|
||||
text_in_bbox_per_axis,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -124,3 +128,86 @@ class Stream(TextBaseParser):
|
|||
for area_str in self.table_areas:
|
||||
table_bbox[bbox_from_str(area_str)] = None
|
||||
self.table_bbox = table_bbox
|
||||
|
||||
def _generate_columns_and_rows(self, bbox, table_idx):
    """Guess the column and row boundaries of the table located in bbox.

    Parameters
    ----------
    bbox : tuple
        Table bounding box, handed over to text_in_bbox_per_axis.
    table_idx : int
        Index of the table being processed. Used to pick the matching
        entry of self.columns and to number the warning message.

    Returns
    -------
    tuple
        (cols, rows, None, None): cols is a list of (left, right)
        column boundaries, rows is the result of self._join_rows.

    """
    # select elements which lie within table_bbox
    self.t_bbox = text_in_bbox_per_axis(
        bbox,
        self.horizontal_text,
        self.vertical_text
    )

    text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
        self.t_bbox["horizontal"] + self.t_bbox["vertical"]
    )
    # FRHTODO:
    # This algorithm takes the horizontal textlines in the bbox, and groups
    # them into rows based on their bottom y0.
    # That's wrong: it misses the vertical items, and misses out on all
    # the alignment identification work we've done earlier.
    rows_grouped = self._group_rows(
        self.t_bbox["horizontal"], row_tol=self.row_tol)
    rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
    elements = [len(r) for r in rows_grouped]

    if self.columns is not None and self.columns[table_idx] != "":
        # user has to input boundary columns too
        # take (0, pdf_width) by default
        # similar to else condition
        # len can't be 1
        cols = [float(c) for c in self.columns[table_idx].split(",")]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        # Pair each boundary with the next one: (left, right) per column
        cols = list(zip(cols, cols[1:]))
    elif not elements:
        # No row was grouped (presumably an area holding only vertical
        # text -- verify against _group_rows). max() on an empty set
        # raises ValueError, so fall back to a single column spanning
        # the whole text extent.
        cols = [(text_x_min, text_x_max)]
    else:
        # calculate mode of the list of number of elements in
        # each row to guess the number of columns
        ncols = max(set(elements), key=elements.count)
        if ncols == 1:
            # if mode is 1, the page usually contains no tables
            # but there can be cases where the list can be skewed,
            # try to remove all 1s from list in this case and
            # see if the list contains elements, if yes, then use
            # the mode after removing 1s
            elements = [n for n in elements if n != 1]
            if elements:
                ncols = max(set(elements), key=elements.count)
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        # Column candidates come from the rows holding exactly ncols
        # textlines.
        cols = [
            (t.x0, t.x1)
            for r in rows_grouped
            if len(r) == ncols
            for t in r
        ]
        cols = self._merge_columns(
            sorted(cols),
            column_tol=self.column_tol
        )
        # Flatten the textlines once instead of once per column gap.
        all_text = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # Textlines lying strictly between two consecutive columns
        inner_text = []
        for i in range(1, len(cols)):
            left = cols[i - 1][1]
            right = cols[i][0]
            inner_text.extend(
                t for t in all_text if t.x0 > left and t.x1 < right
            )
        # Textlines lying outside of the first / last column
        outer_text = [
            t
            for t in all_text
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        ]
        inner_text.extend(outer_text)
        cols = self._add_columns(cols, inner_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)

    return cols, rows, None, None
|
||||
|
|
|
|||
|
|
@ -8,7 +8,20 @@ except ImportError:
|
|||
else:
|
||||
_HAS_MPL = True
|
||||
|
||||
from .utils import (bbox_from_str, get_textline_coords)
|
||||
from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
|
||||
|
||||
from pdfminer.layout import (
|
||||
LTTextLineVertical,
|
||||
)
|
||||
|
||||
|
||||
def extend_axe_lim(ax, bbox, margin=10):
    """Ensure the ax limits include the input bbox

    The limits are only ever widened, and the direction of each axis is
    preserved: an inverted axis (first limit greater than the second)
    stays inverted.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
    bbox : tuple
        (x0, y0, x1, y1) box to include in the view.
    margin : number, optional (default: 10)
        Extra space kept around the bbox.

    """
    def _widen(limits, low, high):
        # Return limits enlarged so that they cover [low, high].
        start, end = limits
        if start > end:
            # Inverted axis: start is the upper bound, end the lower one
            return max(start, high), min(end, low)
        return min(start, low), max(end, high)

    ax.set_xlim(
        *_widen(ax.get_xlim(), bbox[0] - margin, bbox[2] + margin)
    )
    ax.set_ylim(
        *_widen(ax.get_ylim(), bbox[1] - margin, bbox[3] + margin)
    )
|
||||
|
||||
|
||||
def draw_labeled_bbox(
|
||||
|
|
@ -17,6 +30,8 @@ def draw_labeled_bbox(
|
|||
linestyle="solid",
|
||||
label_pos="top,left"
|
||||
):
|
||||
"""Utility drawing function to draw a box with an associated text label
|
||||
"""
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(bbox[0], bbox[1]),
|
||||
|
|
@ -83,32 +98,55 @@ def draw_parse_constraints(table, ax):
|
|||
Parameters
|
||||
----------
|
||||
table : camelot.core.Table
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
ax : matplotlib.axes.Axes
|
||||
ax : matplotlib.axes.Axes (optional)
|
||||
|
||||
"""
|
||||
if table.parse_details:
|
||||
# Display a bbox per region
|
||||
for region_str in table.parse_details["table_regions"] or []:
|
||||
zone_constraints = {
|
||||
"region": "table_regions",
|
||||
"area": "table_areas",
|
||||
}
|
||||
for zone_name, zone_id in zone_constraints.items():
|
||||
# Display a bbox per region / area
|
||||
for zone_str in table.parse_details[zone_id] or []:
|
||||
draw_labeled_bbox(
|
||||
ax, bbox_from_str(region_str),
|
||||
"region: ({region_str})".format(region_str=region_str),
|
||||
ax, bbox_from_str(zone_str),
|
||||
"{zone_name}: ({zone_str})".format(
|
||||
zone_name=zone_name,
|
||||
zone_str=zone_str
|
||||
),
|
||||
color="purple",
|
||||
linestyle="dotted",
|
||||
linewidth=1,
|
||||
label_pos="bottom,right"
|
||||
)
|
||||
# Display a bbox per area
|
||||
for area_str in table.parse_details["table_areas"] or []:
|
||||
draw_labeled_bbox(
|
||||
ax, bbox_from_str(area_str),
|
||||
"area: ({area_str})".format(area_str=area_str),
|
||||
color="pink",
|
||||
linestyle="dotted",
|
||||
linewidth=1,
|
||||
label_pos="bottom,right"
|
||||
|
||||
|
||||
def draw_text(table, ax):
    """Draw text, horizontal in blue, vertical in red

    One translucent rectangle is added per textline, then the ax limits
    are extended so that all of them are visible. Nothing is drawn when
    the table has no textlines.

    Parameters
    ----------
    table : camelot.core.Table
    ax : matplotlib.axes.Axes

    """
    textlines = table.textlines
    if not textlines:
        # No text, hence no bounding box to compute and nothing to draw.
        return
    bbox = bbox_from_textlines(textlines)
    for t in textlines:
        color = "red" if isinstance(t, LTTextLineVertical) else "blue"
        ax.add_patch(
            patches.Rectangle(
                (t.x0, t.y0),
                t.x1 - t.x0,
                t.y1 - t.y0,
                color=color,
                alpha=0.2
            )
        )
    extend_axe_lim(ax, bbox)
|
||||
|
||||
|
||||
def prepare_plot(table, ax=None, to_pdf_scale=True):
|
||||
|
|
@ -188,20 +226,7 @@ class PlotMethods():
|
|||
|
||||
"""
|
||||
ax = prepare_plot(table, ax)
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]),
|
||||
t[2] - t[0],
|
||||
t[3] - t[1],
|
||||
alpha=0.5
|
||||
)
|
||||
)
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
draw_text(table, ax)
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -255,18 +280,8 @@ class PlotMethods():
|
|||
else:
|
||||
table_bbox = {table._bbox: None}
|
||||
|
||||
xs, ys = [], []
|
||||
if not _FOR_LATTICE:
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
||||
color="blue",
|
||||
alpha=0.5
|
||||
)
|
||||
)
|
||||
draw_text(table, ax)
|
||||
|
||||
for t in table_bbox.keys():
|
||||
ax.add_patch(
|
||||
|
|
@ -276,10 +291,8 @@ class PlotMethods():
|
|||
)
|
||||
)
|
||||
if not _FOR_LATTICE:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
extend_axe_lim(ax, t)
|
||||
|
||||
return ax.get_figure()
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -297,19 +310,7 @@ class PlotMethods():
|
|||
|
||||
"""
|
||||
ax = prepare_plot(table, ax)
|
||||
xs, ys = [], []
|
||||
for t in table._text:
|
||||
xs.extend([t[0], t[2]])
|
||||
ys.extend([t[1], t[3]])
|
||||
ax.add_patch(
|
||||
patches.Rectangle(
|
||||
(t[0], t[1]), t[2] - t[0], t[3] - t[1],
|
||||
color="blue",
|
||||
alpha=0.2
|
||||
)
|
||||
)
|
||||
ax.set_xlim(min(xs) - 10, max(xs) + 10)
|
||||
ax.set_ylim(min(ys) - 10, max(ys) + 10)
|
||||
draw_text(table, ax)
|
||||
|
||||
if table.flavor == "hybrid":
|
||||
for network in table.parse_details["network_searches"]:
|
||||
|
|
|
|||
|
Before Width: | Height: | Size: 105 KiB After Width: | Height: | Size: 103 KiB |
|
Before Width: | Height: | Size: 100 KiB After Width: | Height: | Size: 88 KiB |
|
Before Width: | Height: | Size: 100 KiB After Width: | Height: | Size: 90 KiB |
|
Before Width: | Height: | Size: 113 KiB After Width: | Height: | Size: 101 KiB |
|
Before Width: | Height: | Size: 103 KiB After Width: | Height: | Size: 101 KiB |
|
Before Width: | Height: | Size: 113 KiB After Width: | Height: | Size: 111 KiB |
|
Before Width: | Height: | Size: 71 KiB After Width: | Height: | Size: 59 KiB |