Improve hybrid plotting

* Plot info is passed through debug_info
* Display each text edge
2020-04-20 16:54:06 -07:00 · 2020-04-20 16:54:06 -07:00 · 1ccaa0630d
parent e0e3ff4e07
commit 1ccaa0630d
9 changed files with 118 additions and 70 deletions
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -40,10 +40,13 @@ class PDFHandler():
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
+    debug : bool, optional (default: False)
+        Whether the parser should store debug information during parsing.

    """

-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, debug=False):
+        self.debug = debug
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
@ -193,7 +196,7 @@ class PDFHandler():
        tables = []

        parser_obj = PARSERS[flavor]
-        parser = parser_obj(**kwargs)
+        parser = parser_obj(debug=self.debug, **kwargs)

        # Read the layouts/dimensions of each of the pages we need to
        # parse. This might require creating a temporary .pdf.
@ -204,8 +207,8 @@ class PDFHandler():
            )
            parser.prepare_page_parse(source_file, layout, dimensions,
                                      page_idx, layout_kwargs)
-            rootname = os.path.basename(parser.rootname)
            if not suppress_stdout:
+                rootname = os.path.basename(parser.rootname)
                logger.info(
                    "Processing {rootname}".format(rootname=rootname))
            t = parser.extract_tables()
--- a/camelot/io.py
+++ b/camelot/io.py
@ -13,6 +13,7 @@ def read_pdf(
    flavor="lattice",
    suppress_stdout=False,
    layout_kwargs=None,
+    debug=False,
    **kwargs
 ):
    """Read PDF and return extracted tables.
@ -110,7 +111,7 @@ def read_pdf(
            warnings.simplefilter("ignore")

        validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
        kwargs = remove_extra(kwargs, flavor=flavor)
        tables = p.parse(
            flavor=flavor,
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -5,7 +5,9 @@ import warnings

 from ..utils import (
    get_text_objects,
-    get_table_index
+    get_table_index,
+    text_in_bbox,
+    bbox_from_str,
 )
 from ..core import Table

@ -65,7 +67,39 @@ class BaseParser(object):
            self.debug_info["table_regions"] = self.table_regions
            self.debug_info["table_areas"] = self.table_areas

+    def _apply_regions_filter(self, textlines):
+        """If regions have been specified, filter textlines to these regions.
+
+        Parameters
+        ----------
+        textlines : list
+            list of textlines to be filtered
+
+        Returns
+        -------
+        filtered_textlines : list
+            textlines within the regions specified (all of them if none are)
+        """
+        filtered_textlines = []
+        if self.table_regions is None:
+            filtered_textlines.extend(textlines)
+        else:
+            for region_str in self.table_regions:
+                region_text = text_in_bbox(
+                    bbox_from_str(region_str),
+                    textlines
+                )
+                filtered_textlines.extend(region_text)
+        return filtered_textlines
+
    def _document_has_no_text(self):
+        """Detects image only documents and warns.
+
+        Returns
+        -------
+        has_no_text : bool
+            Whether the document doesn't have any text at all.
+        """
        if not self.horizontal_text:
            rootname = os.path.basename(self.rootname)
            if self.images:
@ -81,6 +115,7 @@ class BaseParser(object):
            return True
        return False

+    def _initialize_new_table(self, table_idx, cols, rows):
        """Initialize new table object, ready to be populated

        Parameters
@ -97,7 +132,6 @@ class BaseParser(object):
        table : camelot.core.Table

        """
-    def _initialize_new_table(self, table_idx, cols, rows):
        table = Table(cols, rows)
        table.page = self.page
        table.order = table_idx + 1
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -2,7 +2,7 @@
 from __future__ import division

 import numpy as np
-
+import copy
 import warnings

 from .base import BaseParser
@ -459,7 +459,6 @@ class TextEdges2(object):
        or horizontally. There needs to be connections across both
        dimensions.
        """
-        singleton_textlines = []
        removed_singletons = True
        while removed_singletons:
            removed_singletons = False
@ -471,7 +470,6 @@ class TextEdges2(object):
                        tl = te.textlines[i]
                        alignments = self._textlines_alignments[tl]
                        if alignments.max_h() <= 1 or alignments.max_v() <= 1:
-                            singleton_textlines.append(tl)
                            del te.textlines[i]
                            removed_singletons = True
            self._textlines_alignments = {}
@ -612,33 +610,27 @@ class TextEdges2(object):
        self._register_all_text_lines(textlines)
        self._compute_alignment_counts()

-    def plotFRHAlignments(self, table, plt):
+    def plot_alignments(self, ax):
        """Displays a visualization of the alignments as currently computed.
        """
-        fig = plt.figure()
-        ax = fig.add_subplot(111, aspect="equal")
-        img = table.get_pdf_image()
-        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
-
-        tls_by_alignment_score = sorted(
+        # FRHTODO: This is too busy and doesn't plot lines
+        most_aligned_tl = sorted(
            self._textlines_alignments.keys(),
            key=lambda textline:
            self._textlines_alignments[textline].alignment_score(),
            reverse=True
-        )
+        )[0]

-        for tl, alignments in self._textlines_alignments.items():
-            color = "red"
-            if tl == tls_by_alignment_score[0]:
-                color = "blue"
        ax.add_patch(
            patches.Rectangle(
-                    (tl.x0, tl.y0),
-                    tl.x1 - tl.x0, tl.y1 - tl.y0,
-                    color=color,
+                (most_aligned_tl.x0, most_aligned_tl.y0),
+                most_aligned_tl.x1 - most_aligned_tl.x0,
+                most_aligned_tl.y1 - most_aligned_tl.y0,
+                color="red",
                alpha=0.5
            )
        )
+        for tl, alignments in self._textlines_alignments.items():
            ax.text(
                tl.x0 - 5,
                tl.y0 - 5,
@ -749,6 +741,7 @@ class Hybrid(BaseParser):
        edge_tol=50,
        row_tol=2,
        column_tol=0,
+        debug=False,
        **kwargs
    ):
        super().__init__(
@ -758,6 +751,7 @@ class Hybrid(BaseParser):
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
+            debug=debug
        )
        self.columns = columns
        self._validate_columns()
@ -971,8 +965,7 @@ class Hybrid(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

-    # FRHTODO: get debug_info to work again
-    def _generate_table_bbox(self, debug_info=None):
+    def _generate_table_bbox(self):
        if self.table_areas is not None:
            table_bbox = {}
            for area_str in self.table_areas:
@ -981,32 +974,30 @@ class Hybrid(BaseParser):
            return

        all_textlines = self.horizontal_text + self.vertical_text
-        textlines = []
-        if self.table_regions is None:
-            textlines = all_textlines
-        else:
-            # filter text
-            for region_str in self.table_regions:
-                region_text = text_in_bbox(
-                    bbox_from_str(region_str),
-                    all_textlines
-                )
-                textlines.extend(region_text)
+        textlines = self._apply_regions_filter(all_textlines)

        textlines_processed = {}
        self.table_bbox = {}
-        if debug_info is not None:
-            debug_info_bbox_searches = []
-            debug_info["bboxes_searches"] = debug_info_bbox_searches
+        if self.debug_info is not None:
+            debug_info_edges_searches = []
+            self.debug_info["edges_searches"] = debug_info_edges_searches
+            debug_info_bboxes_searches = []
+            self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
        else:
-            debug_info_bbox_searches = None
+            debug_info_edges_searches = None
+            debug_info_bboxes_searches = None

        while True:
            self.textedges = TextEdges2()
            self.textedges.generate(textlines)
            self.textedges._remove_unconnected_edges()
+            if debug_info_edges_searches is not None:
+                # Preserve the current edge calculation for display debugging
+                debug_info_edges_searches.append(
+                    copy.deepcopy(self.textedges)
+                )
            bbox = self.textedges._build_bbox_candidate(
-                debug_info_bbox_searches
+                debug_info_bboxes_searches
            )
            if bbox is None:
                break
@ -1040,8 +1031,10 @@ class Hybrid(BaseParser):
                average_tl_height
            )

-            if debug_info is not None:
-                debug_info["col_searches"].append({
+            if self.debug_info is not None:
+                if "col_searches" not in self.debug_info:
+                    self.debug_info["col_searches"] = []
+                self.debug_info["col_searches"].append({
                    "core_bbox": bbox,
                    "cols_anchors": cols_anchors,
                    "expanded_bbox": expanded_bbox
@ -1148,13 +1141,13 @@ class Hybrid(BaseParser):

        return table

-    def extract_tables(self, debug_info=None):
+    def extract_tables(self):
        if self._document_has_no_text():
            return []

        # Identify plausible areas within the doc where tables lie,
        # populate table_bbox keys with these areas.
-        self._generate_table_bbox(debug_info)
+        self._generate_table_bbox()

        _tables = []
        # sort tables based on y-coord
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -164,6 +164,20 @@ class PlotMethods(object):
            ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig

+    @staticmethod
+    def draw_pdf(table, ax):
+        """Draw the content of the table's source pdf into the passed subplot
+
+        Parameters
+        ----------
+        table : camelot.core.Table
+            Table whose source pdf image is drawn.
+        ax : matplotlib.axes.Axes
+            Subplot the pdf image is drawn into.
+        """
+        img = table.get_pdf_image()
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
+
    @staticmethod
    def textedge(table):
        """Generates a plot for relevant textedges.
@ -179,6 +193,7 @@ class PlotMethods(object):
        """
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
+        PlotMethods.draw_pdf(table, ax)
        xs, ys = [], []
        for t in table._text:
            xs.extend([t[0], t[2]])
@ -193,11 +208,13 @@ class PlotMethods(object):
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)

+        if table.flavor == "hybrid":
+            # FRHTODO: Clean this up
+            table.debug_info["edges_searches"][0].plot_alignments(ax)
+        else:
            for te in table._textedges:
                ax.plot([te.x, te.x], [te.y0, te.y1])

-        img = table.get_pdf_image()
-        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig

    @staticmethod
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_grid_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_grid_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@ -129,5 +129,5 @@ def test_stream_textedge_plot():
    baseline_dir="files/baseline_plots", remove_text=True)
 def test_hybrid_textedge_plot():
    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor='hybrid')
+    tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
    return camelot.plot(tables[0], kind='textedge')