Prep for vertical text improvements

plot.text now shows vertical text in red. _generate_columns_and_rows is moved out of TextBaseParser and split between the hybrid and stream parsers.
2020-04-28 11:46:12 -07:00 · 2020-04-28 11:46:12 -07:00 · 6add19ae27
parent c51c24a416
commit 6add19ae27
12 changed files with 250 additions and 151 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -171,11 +171,10 @@ class TextAlignments():
            idx_insert = None
            if idx_closest is None:
                idx_insert = 0
-            elif np.isclose(
-                alignment_array[idx_closest].coord,
-                coord,
-                atol=0.5
-            ):
+            # Note: np.isclose is slow!
+            elif coord - 0.5 < \
+                    alignment_array[idx_closest].coord < \
+                    coord + 0.5:
                self._update_alignment(
                    alignment_array[idx_closest],
                    coord,
@ -461,6 +460,7 @@ class Table():
        self._image_path = None  # Temporary file to hold an image of the pdf

        self._text = []      # List of text box coordinates
+        self.textlines = []  # List of actual textlines on the page

    def __repr__(self):
        return "<{} shape={}>".format(self.__class__.__name__, self.shape)
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -8,13 +8,11 @@ import pandas as pd

 from ..utils import (
    bbox_from_str,
-    bbox_from_textlines,
    compute_accuracy,
    compute_whitespace,
    get_text_objects,
    get_table_index,
    text_in_bbox,
-    text_in_bbox_per_axis,
 )
 from ..core import Table

@ -243,6 +241,7 @@ class BaseParser():
            [(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.vertical_text])
        table._text = _text
+        table.textlines = self.horizontal_text + self.vertical_text


 class TextBaseParser(BaseParser):
@ -454,84 +453,6 @@ class TextBaseParser(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

-    def _generate_columns_and_rows(self, bbox, table_idx):
-        # select elements which lie within table_bbox
-        self.t_bbox = text_in_bbox_per_axis(
-            bbox,
-            self.horizontal_text,
-            self.vertical_text
-        )
-
-        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
-            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
-        )
-        rows_grouped = self._group_rows(
-            self.t_bbox["horizontal"], row_tol=self.row_tol)
-        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
-        elements = [len(r) for r in rows_grouped]
-
-        if self.columns is not None and self.columns[table_idx] != "":
-            # user has to input boundary columns too
-            # take (0, pdf_width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns[table_idx].split(",")
-            cols = [float(c) for c in cols]
-            cols.insert(0, text_x_min)
-            cols.append(text_x_max)
-            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
-        else:
-            # calculate mode of the list of number of elements in
-            # each row to guess the number of columns
-            ncols = max(set(elements), key=elements.count)
-            if ncols == 1:
-                # if mode is 1, the page usually contains not tables
-                # but there can be cases where the list can be skewed,
-                # try to remove all 1s from list in this case and
-                # see if the list contains elements, if yes, then use
-                # the mode after removing 1s
-                elements = list(filter(lambda x: x != 1, elements))
-                if elements:
-                    ncols = max(set(elements), key=elements.count)
-                else:
-                    warnings.warn(
-                        "No tables found in table area {}"
-                        .format(table_idx + 1)
-                    )
-            cols = [
-                (t.x0, t.x1)
-                for r in rows_grouped
-                if len(r) == ncols
-                for t in r
-            ]
-            cols = self._merge_columns(
-                sorted(cols),
-                column_tol=self.column_tol
-            )
-            inner_text = []
-            for i in range(1, len(cols)):
-                left = cols[i - 1][1]
-                right = cols[i][0]
-                inner_text.extend(
-                    [
-                        t
-                        for direction in self.t_bbox
-                        for t in self.t_bbox[direction]
-                        if t.x0 > left and t.x1 < right
-                    ]
-                )
-            outer_text = [
-                t
-                for direction in self.t_bbox
-                for t in self.t_bbox[direction]
-                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
-            ]
-            inner_text.extend(outer_text)
-            cols = self._add_columns(cols, inner_text, self.row_tol)
-            cols = self._join_columns(cols, text_x_min, text_x_max)
-
-        return cols, rows, None, None
-
    def record_parse_metadata(self, table):
        """Record data about the origin of the table
        """
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -6,6 +6,7 @@ from __future__ import division
 import copy
 import math
 import numpy as np
+import warnings

 from .base import TextBaseParser
 from ..core import (
@ -20,7 +21,8 @@ from ..utils import (
    text_in_bbox,
    bbox_from_textlines,
    distance_tl_to_bbox,
-    find_columns_coordinates
+    find_columns_coordinates,
+    text_in_bbox_per_axis,
 )

 # maximum number of columns over which a header can spread
@ -574,3 +576,91 @@ class Hybrid(TextBaseParser):
                lambda tl: tl not in textlines_processed,
                textlines
            ))
+
+    def _generate_columns_and_rows(self, bbox, table_idx):
+        # select elements which lie within table_bbox
+        self.t_bbox = text_in_bbox_per_axis(
+            bbox,
+            self.horizontal_text,
+            self.vertical_text
+        )
+
+        all_tls = list(
+            filter(
+                lambda tl: len(tl.get_text().strip()) > 0,
+                self.t_bbox["horizontal"]  # + self.t_bbox["vertical"]
+            )
+        )
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
+            all_tls
+        )
+        # FRHTODO:
+        # This algorithm takes the horizontal textlines in the bbox, and groups
+        # them into rows based on their bottom y0.
+        # That's wrong: it misses the vertical items, and misses out on all
+        # the alignment identification work we've done earlier.
+        rows_grouped = self._group_rows(all_tls, row_tol=self.row_tol)
+        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
+        elements = [len(r) for r in rows_grouped]
+
+        if self.columns is not None and self.columns[table_idx] != "":
+            # user has to input boundary columns too
+            # take (0, pdf_width) by default
+            # similar to else condition
+            # len can't be 1
+            cols = self.columns[table_idx].split(",")
+            cols = [float(c) for c in cols]
+            cols.insert(0, text_x_min)
+            cols.append(text_x_max)
+            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        else:
+            # calculate mode of the list of number of elements in
+            # each row to guess the number of columns
+            ncols = max(set(elements), key=elements.count)
+            if ncols == 1:
+                # if mode is 1, the page usually contains no tables
+                # but there can be cases where the list can be skewed,
+                # try to remove all 1s from list in this case and
+                # see if the list contains elements, if yes, then use
+                # the mode after removing 1s
+                elements = list(filter(lambda x: x != 1, elements))
+                if elements:
+                    ncols = max(set(elements), key=elements.count)
+                else:
+                    warnings.warn(
+                        "No tables found in table area {}"
+                        .format(table_idx + 1)
+                    )
+            cols = [
+                (t.x0, t.x1)
+                for r in rows_grouped
+                if len(r) == ncols
+                for t in r
+            ]
+            cols = self._merge_columns(
+                sorted(cols),
+                column_tol=self.column_tol
+            )
+            inner_text = []
+            for i in range(1, len(cols)):
+                left = cols[i - 1][1]
+                right = cols[i][0]
+                inner_text.extend(
+                    [
+                        t
+                        for direction in self.t_bbox
+                        for t in self.t_bbox[direction]
+                        if t.x0 > left and t.x1 < right
+                    ]
+                )
+            outer_text = [
+                t
+                for direction in self.t_bbox
+                for t in self.t_bbox[direction]
+                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
+            ]
+            inner_text.extend(outer_text)
+            cols = self._add_columns(cols, inner_text, self.row_tol)
+            cols = self._join_columns(cols, text_x_min, text_x_max)
+
+        return cols, rows, None, None
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -2,11 +2,15 @@

 from __future__ import division

+import warnings
+
 from .base import TextBaseParser
 from ..core import TextEdges
 from ..utils import (
    bbox_from_str,
-    text_in_bbox
+    bbox_from_textlines,
+    text_in_bbox,
+    text_in_bbox_per_axis,
 )


@ -124,3 +128,86 @@ class Stream(TextBaseParser):
            for area_str in self.table_areas:
                table_bbox[bbox_from_str(area_str)] = None
        self.table_bbox = table_bbox
+
+    def _generate_columns_and_rows(self, bbox, table_idx):
+        # select elements which lie within table_bbox
+        self.t_bbox = text_in_bbox_per_axis(
+            bbox,
+            self.horizontal_text,
+            self.vertical_text
+        )
+
+        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
+            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
+        )
+        # FRHTODO:
+        # This algorithm takes the horizontal textlines in the bbox, and groups
+        # them into rows based on their bottom y0.
+        # That's wrong: it misses the vertical items, and misses out on all
+        # the alignment identification work we've done earlier.
+        rows_grouped = self._group_rows(
+            self.t_bbox["horizontal"], row_tol=self.row_tol)
+        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
+        elements = [len(r) for r in rows_grouped]
+
+        if self.columns is not None and self.columns[table_idx] != "":
+            # user has to input boundary columns too
+            # take (0, pdf_width) by default
+            # similar to else condition
+            # len can't be 1
+            cols = self.columns[table_idx].split(",")
+            cols = [float(c) for c in cols]
+            cols.insert(0, text_x_min)
+            cols.append(text_x_max)
+            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
+        else:
+            # calculate mode of the list of number of elements in
+            # each row to guess the number of columns
+            ncols = max(set(elements), key=elements.count)
+            if ncols == 1:
+                # if mode is 1, the page usually contains no tables
+                # but there can be cases where the list can be skewed,
+                # try to remove all 1s from list in this case and
+                # see if the list contains elements, if yes, then use
+                # the mode after removing 1s
+                elements = list(filter(lambda x: x != 1, elements))
+                if elements:
+                    ncols = max(set(elements), key=elements.count)
+                else:
+                    warnings.warn(
+                        "No tables found in table area {}"
+                        .format(table_idx + 1)
+                    )
+            cols = [
+                (t.x0, t.x1)
+                for r in rows_grouped
+                if len(r) == ncols
+                for t in r
+            ]
+            cols = self._merge_columns(
+                sorted(cols),
+                column_tol=self.column_tol
+            )
+            inner_text = []
+            for i in range(1, len(cols)):
+                left = cols[i - 1][1]
+                right = cols[i][0]
+                inner_text.extend(
+                    [
+                        t
+                        for direction in self.t_bbox
+                        for t in self.t_bbox[direction]
+                        if t.x0 > left and t.x1 < right
+                    ]
+                )
+            outer_text = [
+                t
+                for direction in self.t_bbox
+                for t in self.t_bbox[direction]
+                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
+            ]
+            inner_text.extend(outer_text)
+            cols = self._add_columns(cols, inner_text, self.row_tol)
+            cols = self._join_columns(cols, text_x_min, text_x_max)
+
+        return cols, rows, None, None
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -8,7 +8,20 @@ except ImportError:
 else:
    _HAS_MPL = True

-from .utils import (bbox_from_str, get_textline_coords)
+from .utils import (bbox_from_str, bbox_from_textlines, get_textline_coords)
+
+from pdfminer.layout import (
+    LTTextLineVertical,
+)
+
+
+def extend_axe_lim(ax, bbox, margin=10):
+    """Ensure the ax limits include the input bbox
+    """
+    x0, x1 = ax.get_xlim()
+    y0, y1 = ax.get_ylim()
+    ax.set_xlim(min(x0, bbox[0] - margin), max(x1, bbox[2] + margin))
+    ax.set_ylim(min(y0, bbox[1] - margin), max(y1, bbox[3] + margin))


 def draw_labeled_bbox(
@ -17,6 +30,8 @@ def draw_labeled_bbox(
    linestyle="solid",
    label_pos="top,left"
 ):
+    """Utility drawing function to draw a box with an associated text label
+    """
    ax.add_patch(
        patches.Rectangle(
            (bbox[0], bbox[1]),
@ -83,32 +98,55 @@ def draw_parse_constraints(table, ax):
    Parameters
    ----------
    table : camelot.core.Table
-    ax : matplotlib.axes.Axes (optional)

-    ax : matplotlib.axes.Axes
+    ax : matplotlib.axes.Axes (optional)

    """
    if table.parse_details:
-        # Display a bbox per region
-        for region_str in table.parse_details["table_regions"] or []:
+        zone_constraints = {
+            "region": "table_regions",
+            "area": "table_areas",
+        }
+        for zone_name, zone_id in zone_constraints.items():
+            # Display a bbox per region / area
+            for zone_str in table.parse_details[zone_id] or []:
                draw_labeled_bbox(
-                ax, bbox_from_str(region_str),
-                "region: ({region_str})".format(region_str=region_str),
+                    ax, bbox_from_str(zone_str),
+                    "{zone_name}: ({zone_str})".format(
+                        zone_name=zone_name,
+                        zone_str=zone_str
+                    ),
                    color="purple",
                    linestyle="dotted",
                    linewidth=1,
                    label_pos="bottom,right"
                )
-        # Display a bbox per area
-        for area_str in table.parse_details["table_areas"] or []:
-            draw_labeled_bbox(
-                ax, bbox_from_str(area_str),
-                "area: ({area_str})".format(area_str=area_str),
-                color="pink",
-                linestyle="dotted",
-                linewidth=1,
-                label_pos="bottom,right"
+
+
+def draw_text(table, ax):
+    """Draw text, horizontal in blue, vertical in red
+
+    Parameters
+    ----------
+    table : camelot.core.Table
+    ax : matplotlib.axes.Axes
+        Axes the text boxes are drawn on; its limits are extended to
+        include the bounding box of all the textlines.
+
+    """
+    bbox = bbox_from_textlines(table.textlines)
+    for t in table.textlines:
+        color = "red" if isinstance(t, LTTextLineVertical) else "blue"
+        ax.add_patch(
+            patches.Rectangle(
+                    (t.x0, t.y0),
+                    t.x1 - t.x0,
+                    t.y1 - t.y0,
+                    color=color,
+                    alpha=0.2
                )
+            )
+    extend_axe_lim(ax, bbox)


 def prepare_plot(table, ax=None, to_pdf_scale=True):
@ -188,20 +226,7 @@ class PlotMethods():

        """
        ax = prepare_plot(table, ax)
-        xs, ys = [], []
-        for t in table._text:
-            xs.extend([t[0], t[2]])
-            ys.extend([t[1], t[3]])
-            ax.add_patch(
-                patches.Rectangle(
-                        (t[0], t[1]),
-                        t[2] - t[0],
-                        t[3] - t[1],
-                        alpha=0.5
-                    )
-                )
-        ax.set_xlim(min(xs) - 10, max(xs) + 10)
-        ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        draw_text(table, ax)
        return ax.get_figure()

    @staticmethod
@ -255,18 +280,8 @@ class PlotMethods():
        else:
            table_bbox = {table._bbox: None}

-        xs, ys = [], []
        if not _FOR_LATTICE:
-            for t in table._text:
-                xs.extend([t[0], t[2]])
-                ys.extend([t[1], t[3]])
-                ax.add_patch(
-                    patches.Rectangle(
-                        (t[0], t[1]), t[2] - t[0], t[3] - t[1],
-                        color="blue",
-                        alpha=0.5
-                    )
-                )
+            draw_text(table, ax)

        for t in table_bbox.keys():
            ax.add_patch(
@ -276,10 +291,8 @@ class PlotMethods():
                )
            )
            if not _FOR_LATTICE:
-                xs.extend([t[0], t[2]])
-                ys.extend([t[1], t[3]])
-                ax.set_xlim(min(xs) - 10, max(xs) + 10)
-                ax.set_ylim(min(ys) - 10, max(ys) + 10)
+                extend_axe_lim(ax, t)
+
        return ax.get_figure()

    @staticmethod
@ -297,19 +310,7 @@ class PlotMethods():

        """
        ax = prepare_plot(table, ax)
-        xs, ys = [], []
-        for t in table._text:
-            xs.extend([t[0], t[2]])
-            ys.extend([t[1], t[3]])
-            ax.add_patch(
-                patches.Rectangle(
-                    (t[0], t[1]), t[2] - t[0], t[3] - t[1],
-                    color="blue",
-                    alpha=0.2
-                )
-            )
-        ax.set_xlim(min(xs) - 10, max(xs) + 10)
-        ax.set_ylim(min(ys) - 10, max(ys) + 10)
+        draw_text(table, ax)

        if table.flavor == "hybrid":
            for network in table.parse_details["network_searches"]:
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_table_areas_text_plot.png
--- a/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_table_regions_textedge_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/files/baseline_plots/test_stream_contour_plot.png
+++ b/tests/files/baseline_plots/test_stream_contour_plot.png
--- a/tests/files/baseline_plots/test_stream_textedge_plot.png
+++ b/tests/files/baseline_plots/test_stream_textedge_plot.png
--- a/tests/files/baseline_plots/test_text_plot.png
+++ b/tests/files/baseline_plots/test_text_plot.png