Improve hybrid plotting

* Plot info is now passed through debug_info
* Display each text edge
2020-04-20 16:54:06 -07:00 · 2020-04-20 16:54:06 -07:00 · 1ccaa0630d
parent e0e3ff4e07
commit 1ccaa0630d
9 changed files with 118 additions and 70 deletions
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -40,10 +40,13 @@ class PDFHandler():
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.
    debug : bool, optional (default: False)
        Whether the parser should store debug information during parsing.
    """
-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, debug=False):
        self.debug = debug
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
@ -193,7 +196,7 @@ class PDFHandler():
        tables = []
        parser_obj = PARSERS[flavor]
-        parser = parser_obj(**kwargs)
+        parser = parser_obj(debug=self.debug, **kwargs)
        # Read the layouts/dimensions of each of the pages we need to
        # parse. This might require creating a temporary .pdf.
@ -204,8 +207,8 @@ class PDFHandler():
            )
            parser.prepare_page_parse(source_file, layout, dimensions,
                                      page_idx, layout_kwargs)
            rootname = os.path.basename(parser.rootname)
            if not suppress_stdout:
                rootname = os.path.basename(parser.rootname)
                logger.info(
                    "Processing {rootname}".format(rootname=rootname))
            t = parser.extract_tables()
--- a/camelot/io.py
+++ b/camelot/io.py
@ -13,6 +13,7 @@ def read_pdf(
    flavor="lattice",
    suppress_stdout=False,
    layout_kwargs=None,
    debug=False,
    **kwargs
 ):
    """Read PDF and return extracted tables.
@ -110,7 +111,7 @@ def read_pdf(
            warnings.simplefilter("ignore")
        validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, debug=debug)
        kwargs = remove_extra(kwargs, flavor=flavor)
        tables = p.parse(
            flavor=flavor,
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -5,7 +5,9 @@ import warnings
 from ..utils import (
    get_text_objects,
-    get_table_index
+    get_table_index,
    text_in_bbox,
    bbox_from_str,
 )
 from ..core import Table
@ -65,7 +67,39 @@ class BaseParser(object):
            self.debug_info["table_regions"] = self.table_regions
            self.debug_info["table_areas"] = self.table_areas
    def _apply_regions_filter(self, textlines):
        """If regions have been specified, filter textlines to these regions.

        When ``self.table_regions`` is None no filtering happens and every
        textline is returned (in a new list). Otherwise each region string
        is converted with ``bbox_from_str`` and the textlines that
        ``text_in_bbox`` reports for that bbox are collected, region by
        region, in the order of ``self.table_regions``.

        Parameters
        ----------
        textlines : list
            list of textlines to be filtered

        Returns
        -------
        filtered_textlines : list
            list of textlines within the regions specified

        """
        filtered_textlines = []
        if self.table_regions is None:
            # No regions specified: keep everything, but return a new list
            # rather than the caller's own object.
            filtered_textlines.extend(textlines)
        else:
            for region_str in self.table_regions:
                region_text = text_in_bbox(
                    bbox_from_str(region_str),
                    textlines
                )
                # Results are concatenated without de-duplication, so a
                # textline reported for several regions is added once per
                # region.
                filtered_textlines.extend(region_text)
        return filtered_textlines
    def _document_has_no_text(self):
        """Detects image only documents and warns.
        Returns
        -------
        has_no_text : bool
            Whether the document doesn't have any text at all.
        """
        if not self.horizontal_text:
            rootname = os.path.basename(self.rootname)
            if self.images:
@ -81,23 +115,23 @@ class BaseParser(object):
            return True
        return False
    """Initialize new table object, ready to be populated
    Parameters
    ----------
    table_idx : int
        Index of this table within the pdf page analyzed
    cols : list
        list of coordinate boundaries tuples (left, right)
    rows : list
        list of coordinate boundaries tuples (bottom, top)
    Returns
    -------
    table : camelot.core.Table
    """
    def _initialize_new_table(self, table_idx, cols, rows):
        """Initialize new table object, ready to be populated
        Parameters
        ----------
        table_idx : int
            Index of this table within the pdf page analyzed
        cols : list
            list of coordinate boundaries tuples (left, right)
        rows : list
            list of coordinate boundaries tuples (bottom, top)
        Returns
        -------
        table : camelot.core.Table
        """
        table = Table(cols, rows)
        table.page = self.page
        table.order = table_idx + 1
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -2,7 +2,7 @@
 from __future__ import division
 import numpy as np
-
+import copy
 import warnings
 from .base import BaseParser
@ -459,7 +459,6 @@ class TextEdges2(object):
        or horizontally. There needs to be connections across both
        dimensions.
        """
        singleton_textlines = []
        removed_singletons = True
        while removed_singletons:
            removed_singletons = False
@ -471,7 +470,6 @@ class TextEdges2(object):
                        tl = te.textlines[i]
                        alignments = self._textlines_alignments[tl]
                        if alignments.max_h() <= 1 or alignments.max_v() <= 1:
                            singleton_textlines.append(tl)
                            del te.textlines[i]
                            removed_singletons = True
            self._textlines_alignments = {}
@ -612,33 +610,27 @@ class TextEdges2(object):
        self._register_all_text_lines(textlines)
        self._compute_alignment_counts()
-    def plotFRHAlignments(self, table, plt):
+    def plot_alignments(self, ax):
        """Displays a visualization of the alignments as currently computed.
        """
-        fig = plt.figure()
+        # FRHTODO: This is too busy and doesn't plot lines
-        ax = fig.add_subplot(111, aspect="equal")
+        most_aligned_tl = sorted(
        img = table.get_pdf_image()
        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        tls_by_alignment_score = sorted(
            self._textlines_alignments.keys(),
            key=lambda textline:
            self._textlines_alignments[textline].alignment_score(),
            reverse=True
-        )
+        )[0]
-        for tl, alignments in self._textlines_alignments.items():
+        ax.add_patch(
-            color = "red"
+            patches.Rectangle(
-            if tl == tls_by_alignment_score[0]:
+                (most_aligned_tl.x0, most_aligned_tl.y0),
-                color = "blue"
+                most_aligned_tl.x1 - most_aligned_tl.x0,
-            ax.add_patch(
+                most_aligned_tl.y1 - most_aligned_tl.y0,
-                patches.Rectangle(
+                color="red",
-                    (tl.x0, tl.y0),
+                alpha=0.5
                    tl.x1 - tl.x0, tl.y1 - tl.y0,
                    color=color,
                    alpha=0.5
                )
            )
        )
        for tl, alignments in self._textlines_alignments.items():
            ax.text(
                tl.x0 - 5,
                tl.y0 - 5,
@ -749,6 +741,7 @@ class Hybrid(BaseParser):
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        debug=False,
        **kwargs
    ):
        super().__init__(
@ -758,6 +751,7 @@ class Hybrid(BaseParser):
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
            debug=debug
        )
        self.columns = columns
        self._validate_columns()
@ -971,8 +965,7 @@ class Hybrid(BaseParser):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")
-    # FRHTODO: get debug_info to work again
+    def _generate_table_bbox(self):
    def _generate_table_bbox(self, debug_info=None):
        if self.table_areas is not None:
            table_bbox = {}
            for area_str in self.table_areas:
@ -981,32 +974,30 @@ class Hybrid(BaseParser):
            return
        all_textlines = self.horizontal_text + self.vertical_text
-        textlines = []
+        textlines = self._apply_regions_filter(all_textlines)
        if self.table_regions is None:
            textlines = all_textlines
        else:
            # filter text
            for region_str in self.table_regions:
                region_text = text_in_bbox(
                    bbox_from_str(region_str),
                    all_textlines
                )
                textlines.extend(region_text)
        textlines_processed = {}
        self.table_bbox = {}
-        if debug_info is not None:
+        if self.debug_info is not None:
-            debug_info_bbox_searches = []
+            debug_info_edges_searches = []
-            debug_info["bboxes_searches"] = debug_info_bbox_searches
+            self.debug_info["edges_searches"] = debug_info_edges_searches
            debug_info_bboxes_searches = []
            self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
        else:
-            debug_info_bbox_searches = None
+            debug_info_edges_searches = None
            debug_info_bboxes_searches = None
        while True:
            self.textedges = TextEdges2()
            self.textedges.generate(textlines)
            self.textedges._remove_unconnected_edges()
            if debug_info_edges_searches is not None:
                # Preserve the current edge calculation for display debugging
                debug_info_edges_searches.append(
                    copy.deepcopy(self.textedges)
                )
            bbox = self.textedges._build_bbox_candidate(
-                debug_info_bbox_searches
+                debug_info_bboxes_searches
            )
            if bbox is None:
                break
@ -1040,8 +1031,10 @@ class Hybrid(BaseParser):
                average_tl_height
            )
-            if debug_info is not None:
+            if self.debug_info is not None:
-                debug_info["col_searches"].append({
+                if "col_searches" not in self.debug_info:
                    self.debug_info["col_searches"] = []
                self.debug_info["col_searches"].append({
                    "core_bbox": bbox,
                    "cols_anchors": cols_anchors,
                    "expanded_bbox": expanded_bbox
@ -1148,13 +1141,13 @@ class Hybrid(BaseParser):
        return table
-    def extract_tables(self, debug_info=None):
+    def extract_tables(self):
        if self._document_has_no_text():
            return []
        # Identify plausible areas within the doc where tables lie,
        # populate table_bbox keys with these areas.
-        self._generate_table_bbox(debug_info)
+        self._generate_table_bbox()
        _tables = []
        # sort tables based on y-coord
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -164,6 +164,20 @@ class PlotMethods(object):
            ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig
    @staticmethod
    def draw_pdf(table, ax):
        """Draw the content of the table's source pdf into the passed subplot

        Parameters
        ----------
        table : camelot.core.Table
            Table providing the page image (``get_pdf_image()``) and the
            page dimensions (``pdf_size``).
        ax : matplotlib.axes.Axes
            Axes the page image is drawn on.

        """
        img = table.get_pdf_image()
        # The extent maps the image onto x in [0, pdf_size[0]] and
        # y in [0, pdf_size[1]]; presumably this keeps overlays expressed in
        # pdf coordinates aligned with the page -- TODO confirm.
        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
    @staticmethod
    def textedge(table):
        """Generates a plot for relevant textedges.
@ -179,6 +193,7 @@ class PlotMethods(object):
        """
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect="equal")
        PlotMethods.draw_pdf(table, ax)
        xs, ys = [], []
        for t in table._text:
            xs.extend([t[0], t[2]])
@ -193,11 +208,13 @@ class PlotMethods(object):
        ax.set_xlim(min(xs) - 10, max(xs) + 10)
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
-        for te in table._textedges:
+        if table.flavor == "hybrid":
-            ax.plot([te.x, te.x], [te.y0, te.y1])
+            # FRHTODO: Clean this up
            table.debug_info["edges_searches"][0].plot_alignments(ax)
        else:
            for te in table._textedges:
                ax.plot([te.x, te.x], [te.y0, te.y1])
        img = table.get_pdf_image()
        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
        return fig
    @staticmethod
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_grid_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_grid_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png
--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@ -129,5 +129,5 @@ def test_stream_textedge_plot():
    baseline_dir="files/baseline_plots", remove_text=True)
 def test_hybrid_textedge_plot():
    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor='hybrid')
+    tables = camelot.read_pdf(filename, debug=True, flavor='hybrid')
    return camelot.plot(tables[0], kind='textedge')