Prep for vertical text improvements

plot.text shows vertical text in red. _generate_columns_and_rows is split between hybrid and stream.
2020-04-28 11:46:12 -07:00 · 2020-04-28 11:46:12 -07:00 · 6add19ae27
parent c51c24a416
commit 6add19ae27
12 changed files with 250 additions and 151 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -171,11 +171,10 @@ class TextAlignments():
            idx_insert = None
            if idx_closest is None:
                idx_insert = 0
-            elif np.isclose(
+            # Note: np.isclose is slow!
-                alignment_array[idx_closest].coord,
+            elif coord - 0.5 < \
-                coord,
+                    alignment_array[idx_closest].coord < \
-                atol=0.5
+                    coord + 0.5:
            ):
                self._update_alignment(
                    alignment_array[idx_closest],
                    coord,
@ -461,6 +460,7 @@ class Table():
        self._image_path = None  # Temporary file to hold an image of the pdf
        self._text = []      # List of text box coordinates
        self.textlines = []  # List of actual textlines on the page
    def __repr__(self):
        """Return a short debug string of the form ``<ClassName shape=...>``.

        The class name is looked up on the instance, so subclasses report
        their own name; the shape is whatever ``self.shape`` evaluates to.
        """
        class_name = self.__class__.__name__
        shape = self.shape
        return "<{} shape={}>".format(class_name, shape)
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -8,13 +8,11 @@ import pandas as pd
 from ..utils import (
    bbox_from_str,
    bbox_from_textlines,
    compute_accuracy,
    compute_whitespace,
    get_text_objects,
    get_table_index,
    text_in_bbox,
    text_in_bbox_per_axis,
 )
 from ..core import Table
@ -243,6 +241,7 @@ class BaseParser():
            [(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
        table.textlines = self.horizontal_text + self.vertical_text
 class TextBaseParser(BaseParser):
@ -454,84 +453,6 @@ class TextBaseParser(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")
    def _generate_columns_and_rows(self, bbox, table_idx):
        """Compute the column and row boundaries of the table inside bbox.

        Rows come from grouping the horizontal textlines found in the bbox.
        Columns are either built from the user-supplied separators in
        ``self.columns`` or guessed from the rows holding the most common
        number of textlines.

        As a side effect, the textlines selected for the bbox are stored in
        ``self.t_bbox`` (keyed by "horizontal" / "vertical").

        Parameters
        ----------
        bbox : tuple
            Table bounding box, forwarded to ``text_in_bbox_per_axis``.
        table_idx : int
            Position of the table in ``self.columns``; also used (1-based)
            in the "No tables found" warning.

        Returns
        -------
        cols : list
            Column boundaries. (left, right) tuples when built from user
            separators or from the fallback; otherwise whatever
            ``self._join_columns`` returns.
        rows : list
            Row boundaries as returned by ``self._join_rows``.
        None
            Placeholder, always None.
        None
            Placeholder, always None.

        """
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )

        # The text extent uses both directions, the rows only horizontal
        # textlines: rows_grouped can therefore be empty while the extent
        # is still defined (e.g. a bbox holding only vertical text).
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
        )

        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        # number of textlines found in each row
        elements = [len(r) for r in rows_grouped]

        if self.columns is not None and self.columns[table_idx] != "":
            # user has to input boundary columns too
            # take (0, pdf_width) by default
            # similar to else condition
            # len can't be 1
            cols = [float(c) for c in self.columns[table_idx].split(",")]
            cols.insert(0, text_x_min)
            cols.append(text_x_max)
            # turn the sorted separators into consecutive (left, right) pairs
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        elif not elements:
            # No row to infer columns from: max() below would raise
            # ValueError on the empty list, so fall back to a single column
            # spanning the whole text extent.
            cols = [(text_x_min, text_x_max)]
        else:
            # calculate mode of the list of number of elements in
            # each row to guess the number of columns
            ncols = max(set(elements), key=elements.count)
            if ncols == 1:
                # if mode is 1, the page usually contains no tables
                # but there can be cases where the list can be skewed,
                # try to remove all 1s from list in this case and
                # see if the list contains elements, if yes, then use
                # the mode after removing 1s
                elements = list(filter(lambda x: x != 1, elements))
                if elements:
                    ncols = max(set(elements), key=elements.count)
                else:
                    # only single-textline rows: warn, but still carry on
                    # with ncols == 1
                    warnings.warn(
                        "No tables found in table area {}"
                        .format(table_idx + 1)
                    )
            # x-extents of every textline in the rows that have exactly
            # ncols textlines
            cols = [
                (t.x0, t.x1)
                for r in rows_grouped
                if len(r) == ncols
                for t in r
            ]
            cols = self._merge_columns(
                sorted(cols),
                column_tol=self.column_tol
            )
            # textlines lying strictly inside the gap between two
            # consecutive columns
            inner_text = []
            for i in range(1, len(cols)):
                left = cols[i - 1][1]
                right = cols[i][0]
                inner_text.extend(
                    [
                        t
                        for direction in self.t_bbox
                        for t in self.t_bbox[direction]
                        if t.x0 > left and t.x1 < right
                    ]
                )
            # textlines lying entirely left of the first column or right
            # of the last one
            outer_text = [
                t
                for direction in self.t_bbox
                for t in self.t_bbox[direction]
                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
            ]
            inner_text.extend(outer_text)
            cols = self._add_columns(cols, inner_text, self.row_tol)
            cols = self._join_columns(cols, text_x_min, text_x_max)

        return cols, rows, None, None
    def record_parse_metadata(self, table):
        """Record data about the origin of the table
        """
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -6,6 +6,7 @@ from __future__ import division
 import copy
 import math
 import numpy as np
 import warnings
 from .base import TextBaseParser
 from ..core import (
@ -20,7 +21,8 @@ from ..utils import (
    text_in_bbox,
    bbox_from_textlines,
    distance_tl_to_bbox,
-    find_columns_coordinates
+    find_columns_coordinates,
    text_in_bbox_per_axis,
 )
 # maximum number of columns over which a header can spread
@ -574,3 +576,91 @@ class Hybrid(TextBaseParser):
                lambda tl: tl not in textlines_processed,
                textlines
            ))
    def _generate_columns_and_rows(self, bbox, table_idx):
        """Compute the column and row boundaries of the table inside bbox.

        Only the non-blank horizontal textlines of the bbox are used to
        compute the text extent and to build the rows. Columns are either
        built from the user-supplied separators in ``self.columns`` or
        guessed from the rows holding the most common number of textlines.

        As a side effect, the textlines selected for the bbox are stored in
        ``self.t_bbox`` (keyed by "horizontal" / "vertical").

        Parameters
        ----------
        bbox : tuple
            Table bounding box, forwarded to ``text_in_bbox_per_axis``.
        table_idx : int
            Position of the table in ``self.columns``; also used (1-based)
            in the "No tables found" warning.

        Returns
        -------
        cols : list
            Column boundaries. (left, right) tuples when built from user
            separators or from the fallback; otherwise whatever
            ``self._join_columns`` returns.
        rows : list
            Row boundaries as returned by ``self._join_rows``.
        None
            Placeholder, always None.
        None
            Placeholder, always None.

        """
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )

        # Keep only textlines whose text is not blank once stripped.
        all_tls = [
            tl
            for tl in self.t_bbox["horizontal"]  # + self.t_bbox["vertical"]
            if len(tl.get_text().strip()) > 0
        ]
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            all_tls
        )
        # FRHTODO:
        # This algorithm takes the horizontal textlines in the bbox, and groups
        # them into rows based on their bottom y0.
        # That's wrong: it misses the vertical items, and misses out on all
        # the alignment identification work we've done earlier.
        rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        # number of textlines found in each row
        elements = [len(r) for r in rows_grouped]

        if self.columns is not None and self.columns[table_idx] != "":
            # user has to input boundary columns too
            # take (0, pdf_width) by default
            # similar to else condition
            # len can't be 1
            cols = [float(c) for c in self.columns[table_idx].split(",")]
            cols.insert(0, text_x_min)
            cols.append(text_x_max)
            # turn the sorted separators into consecutive (left, right) pairs
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        elif not elements:
            # No row to infer columns from: max() below would raise
            # ValueError on the empty list, so fall back to a single column
            # spanning the whole text extent.
            cols = [(text_x_min, text_x_max)]
        else:
            # calculate mode of the list of number of elements in
            # each row to guess the number of columns
            ncols = max(set(elements), key=elements.count)
            if ncols == 1:
                # if mode is 1, the page usually contains no tables
                # but there can be cases where the list can be skewed,
                # try to remove all 1s from list in this case and
                # see if the list contains elements, if yes, then use
                # the mode after removing 1s
                elements = list(filter(lambda x: x != 1, elements))
                if elements:
                    ncols = max(set(elements), key=elements.count)
                else:
                    # only single-textline rows: warn, but still carry on
                    # with ncols == 1
                    warnings.warn(
                        "No tables found in table area {}"
                        .format(table_idx + 1)
                    )
            # x-extents of every textline in the rows that have exactly
            # ncols textlines
            cols = [
                (t.x0, t.x1)
                for r in rows_grouped
                if len(r) == ncols
                for t in r
            ]
            cols = self._merge_columns(
                sorted(cols),
                column_tol=self.column_tol
            )
            # textlines (of either direction) lying strictly inside the gap
            # between two consecutive columns
            inner_text = []
            for i in range(1, len(cols)):
                left = cols[i - 1][1]
                right = cols[i][0]
                inner_text.extend(
                    [
                        t
                        for direction in self.t_bbox
                        for t in self.t_bbox[direction]
                        if t.x0 > left and t.x1 < right
                    ]
                )
            # textlines lying entirely left of the first column or right
            # of the last one
            outer_text = [
                t
                for direction in self.t_bbox
                for t in self.t_bbox[direction]
                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
            ]
            inner_text.extend(outer_text)
            cols = self._add_columns(cols, inner_text, self.row_tol)
            cols = self._join_columns(cols, text_x_min, text_x_max)

        return cols, rows, None, None
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -2,11 +2,15 @@
 from __future__ import division
 import warnings
 from .base import TextBaseParser
 from ..core import TextEdges
 from ..utils import (
    bbox_from_str,
-    text_in_bbox
+    bbox_from_textlines,
    text_in_bbox,
    text_in_bbox_per_axis,
 )
@ -124,3 +128,86 @@ class Stream(TextBaseParser):
            for area_str in self.table_areas:
                table_bbox[bbox_from_str(area_str)] = None
        self.table_bbox = table_bbox
    def _generate_columns_and_rows(self, bbox, table_idx):
        """Compute the column and row boundaries of the table inside bbox.

        Rows come from grouping the horizontal textlines found in the bbox.
        Columns are either built from the user-supplied separators in
        ``self.columns`` or guessed from the rows holding the most common
        number of textlines.

        As a side effect, the textlines selected for the bbox are stored in
        ``self.t_bbox`` (keyed by "horizontal" / "vertical").

        Parameters
        ----------
        bbox : tuple
            Table bounding box, forwarded to ``text_in_bbox_per_axis``.
        table_idx : int
            Position of the table in ``self.columns``; also used (1-based)
            in the "No tables found" warning.

        Returns
        -------
        cols : list
            Column boundaries. (left, right) tuples when built from user
            separators or from the fallback; otherwise whatever
            ``self._join_columns`` returns.
        rows : list
            Row boundaries as returned by ``self._join_rows``.
        None
            Placeholder, always None.
        None
            Placeholder, always None.

        """
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )

        # The text extent uses both directions, the rows only horizontal
        # textlines: rows_grouped can therefore be empty while the extent
        # is still defined (e.g. a bbox holding only vertical text).
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
        )
        # FRHTODO:
        # This algorithm takes the horizontal textlines in the bbox, and groups
        # them into rows based on their bottom y0.
        # That's wrong: it misses the vertical items, and misses out on all
        # the alignment identification work we've done earlier.
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        # number of textlines found in each row
        elements = [len(r) for r in rows_grouped]

        if self.columns is not None and self.columns[table_idx] != "":
            # user has to input boundary columns too
            # take (0, pdf_width) by default
            # similar to else condition
            # len can't be 1
            cols = [float(c) for c in self.columns[table_idx].split(",")]
            cols.insert(0, text_x_min)
            cols.append(text_x_max)
            # turn the sorted separators into consecutive (left, right) pairs
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        elif not elements:
            # No row to infer columns from: max() below would raise
            # ValueError on the empty list, so fall back to a single column
            # spanning the whole text extent.
            cols = [(text_x_min, text_x_max)]
        else:
            # calculate mode of the list of number of elements in
            # each row to guess the number of columns
            ncols = max(set(elements), key=elements.count)
            if ncols == 1:
                # if mode is 1, the page usually contains no tables
                # but there can be cases where the list can be skewed,
                # try to remove all 1s from list in this case and
                # see if the list contains elements, if yes, then use
                # the mode after removing 1s
                elements = list(filter(lambda x: x != 1, elements))
                if elements:
                    ncols = max(set(elements), key=elements.count)
                else:
                    # only single-textline rows: warn, but still carry on
                    # with ncols == 1
                    warnings.warn(
                        "No tables found in table area {}"
                        .format(table_idx + 1)
                    )
            # x-extents of every textline in the rows that have exactly
            # ncols textlines
            cols = [
                (t.x0, t.x1)
                for r in rows_grouped
                if len(r) == ncols
                for t in r
            ]
            cols = self._merge_columns(
                sorted(cols),
                column_tol=self.column_tol
            )
            # textlines lying strictly inside the gap between two
            # consecutive columns
            inner_text = []
            for i in range(1, len(cols)):
                left = cols[i - 1][1]
                right = cols[i][0]
                inner_text.extend(
                    [
                        t
                        for direction in self.t_bbox
                        for t in self.t_bbox[direction]
                        if t.x0 > left and t.x1 < right
                    ]
                )
            # textlines lying entirely left of the first column or right
            # of the last one
            outer_text = [
                t
                for direction in self.t_bbox
                for t in self.t_bbox[direction]
                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
            ]
            inner_text.extend(outer_text)
            cols = self._add_columns(cols, inner_text, self.row_tol)
            cols = self._join_columns(cols, text_x_min, text_x_max)

        return cols, rows, None, None
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -8,7 +8,20 @@ except ImportError:
 else:
    _HAS_MPL = True
-from .utils import (bbox_from_str, get_textline_coords)
+from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
 from pdfminer.layout import (
    LTTextLineVertical,
 )
 def extend_axe_lim(ax, bbox, margin=10):
    """Widen the limits of ax so that bbox, padded by margin, is visible.

    The current limits are never shrunk: each bound only moves outwards
    when the padded bbox extends past it.

    Parameters
    ----------
    ax : object
        Axes-like object exposing get_xlim / get_ylim / set_xlim / set_ylim.
    bbox : sequence
        Indexed as (x_min, y_min, x_max, y_max).
    margin : number, optional (default: 10)
        Padding added around bbox on every side.

    """
    x_low, x_high = ax.get_xlim()
    y_low, y_high = ax.get_ylim()

    # horizontal extent first, then vertical
    x_low = min(x_low, bbox[0] - margin)
    x_high = max(x_high, bbox[2] + margin)
    ax.set_xlim(x_low, x_high)

    y_low = min(y_low, bbox[1] - margin)
    y_high = max(y_high, bbox[3] + margin)
    ax.set_ylim(y_low, y_high)
 def draw_labeled_bbox(
@ -17,6 +30,8 @@ def draw_labeled_bbox(
    linestyle="solid",
    label_pos="top,left"
 ):
    """Utility drawing function to draw a box with an associated text label
    """
    ax.add_patch(
        patches.Rectangle(
            (bbox[0], bbox[1]),
@ -83,32 +98,55 @@ def draw_parse_constraints(table, ax):
    Parameters
    ----------
    table : camelot.core.Table
    ax : matplotlib.axes.Axes (optional)
-    ax : matplotlib.axes.Axes
+    ax : matplotlib.axes.Axes (optional)
    """
    if table.parse_details:
-        # Display a bbox per region
+        zone_constraints = {
-        for region_str in table.parse_details["table_regions"] or []:
+            "region": "table_regions",
            "area": "table_areas",
        }
        for zone_name, zone_id in zone_constraints.items():
            # Display a bbox per region / area
            for zone_str in table.parse_details[zone_id] or []:
                draw_labeled_bbox(
-                ax, bbox_from_str(region_str),
+                    ax, bbox_from_str(zone_str),
-                "region: ({region_str})".format(region_str=region_str),
+                    "{zone_name}: ({zone_str})".format(
                        zone_name=zone_name,
                        zone_str=zone_str
                    ),
                    color="purple",
                    linestyle="dotted",
                    linewidth=1,
                    label_pos="bottom,right"
                )
-        # Display a bbox per area
+
-        for area_str in table.parse_details["table_areas"] or []:
+
-            draw_labeled_bbox(
+def draw_text(table, ax):
-                ax, bbox_from_str(area_str),
+    """Draw text, horizontal in blue, vertical in red
-                "area: ({area_str})".format(area_str=area_str),
+
-                color="pink",
+    Parameters
-                linestyle="dotted",
+    ----------
-                linewidth=1,
+    table : camelot.core.Table
-                label_pos="bottom,right"
+    ax : matplotlib.axes.Axes (optional)
    ax : matplotlib.axes.Axes
    """
    bbox = bbox_from_textlines(table.textlines)
    for t in table.textlines:
        color = "red" if isinstance(t, LTTextLineVertical) else "blue"
        ax.add_patch(
            patches.Rectangle(
                    (t.x0, t.y0),
                    t.x1 - t.x0,
                    t.y1 - t.y0,
                    color=color,
                    alpha=0.2
                )
            )
    extend_axe_lim(ax, bbox)
 def prepare_plot(table, ax=None, to_pdf_scale=True):
@ -188,20 +226,7 @@ class PlotMethods():
        """
        ax = prepare_plot(table, ax)
-        xs, ys = [], []
+        draw_text(table, ax)
        for t in table._text:
            xs.extend([t[0], t[2]])
            ys.extend([t[1], t[3]])
            ax.add_patch(
                patches.Rectangle(
                        (t[0], t[1]),
                        t[2] - t[0],
                        t[3] - t[1],
                        alpha=0.5
                    )
                )
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
        return ax.get_figure()
    @staticmethod
@ -255,18 +280,8 @@ class PlotMethods():
        else:
            table_bbox = {table._bbox: None}
        xs, ys = [], []
        if not _FOR_LATTICE:
-            for t in table._text:
+            draw_text(table, ax)
                xs.extend([t[0], t[2]])
                ys.extend([t[1], t[3]])
                ax.add_patch(
                    patches.Rectangle(
                        (t[0], t[1]), t[2] - t[0], t[3] - t[1],
                        color="blue",
                        alpha=0.5
                    )
                )
        for t in table_bbox.keys():
            ax.add_patch(
@ -276,10 +291,8 @@ class PlotMethods():
                )
            )
            if not _FOR_LATTICE:
-                xs.extend([t[0], t[2]])
+                extend_axe_lim(ax, t)
-                ys.extend([t[1], t[3]])
+
                ax.set_xlim(min(xs) - 10, max(xs) + 10)
                ax.set_ylim(min(ys) - 10, max(ys) + 10)
        return ax.get_figure()
    @staticmethod
@ -297,19 +310,7 @@ class PlotMethods():
        """
        ax = prepare_plot(table, ax)
-        xs, ys = [], []
+        draw_text(table, ax)
        for t in table._text:
            xs.extend([t[0], t[2]])
            ys.extend([t[1], t[3]])
            ax.add_patch(
                patches.Rectangle(
                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
                    color="blue",
                    alpha=0.2
                )
            )
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
        if table.flavor == "hybrid":
            for network in table.parse_details["network_searches"]:
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png
--- a/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/files/baseline_plots/test_stream_contour_plot.png
+++ b/tests/files/baseline_plots/test_stream_contour_plot.png
--- a/tests/files/baseline_plots/test_stream_textedge_plot.png
+++ b/tests/files/baseline_plots/test_stream_textedge_plot.png
--- a/tests/files/baseline_plots/test_text_plot.png
+++ b/tests/files/baseline_plots/test_text_plot.png