Further refactoring

2020-04-24 21:11:31 -07:00 · 2020-04-24 21:11:31 -07:00 · bb842f21b9
parent f42557ab8b
commit bb842f21b9
8 changed files with 430 additions and 699 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -15,8 +15,6 @@ from .utils import (
    get_index_closest_point,
    get_textline_coords,
    build_file_path_in_temp_dir,
    compute_accuracy,
    compute_whitespace,
    export_pdf_as_png
 )
@ -141,9 +139,9 @@ class TextAlignments(object):
    def __init__(self, alignment_names):
        # For each possible alignment, list of tuples coordinate/textlines
-        self._textedges = {}
+        self._text_alignments = {}
        for alignment_name in alignment_names:
-            self._textedges[alignment_name] = []
+            self._text_alignments[alignment_name] = []
    @staticmethod
    def _create_new_text_alignment(coord, textline, align):
@ -156,12 +154,12 @@ class TextAlignments(object):
        """Updates an existing text edge in the current dict.
        """
        coords = get_textline_coords(textline)
-        for alignment, edge_array in self._textedges.items():
+        for alignment_id, alignment_array in self._text_alignments.items():
-            coord = coords[alignment]
+            coord = coords[alignment_id]
            # Find the index of the closest existing element (or 0 if none)
            idx_closest = get_index_closest_point(
-                coord, edge_array, fn=lambda x: x.coord
+                coord, alignment_array, fn=lambda x: x.coord
            )
            # Check if the edges before/after are close enough
@ -169,17 +167,25 @@ class TextAlignments(object):
            idx_insert = None
            if idx_closest is None:
                idx_insert = 0
-            elif np.isclose(edge_array[idx_closest].coord, coord, atol=0.5):
+            elif np.isclose(
-                self._update_edge(edge_array[idx_closest], coord, textline)
+                alignment_array[idx_closest].coord,
-            elif edge_array[idx_closest].coord < coord:
+                coord,
                atol=0.5
            ):
                self._update_edge(
                    alignment_array[idx_closest],
                    coord,
                    textline
                )
            elif alignment_array[idx_closest].coord < coord:
                idx_insert = idx_closest + 1
            else:
                idx_insert = idx_closest
            if idx_insert is not None:
-                new_edge = self._create_new_text_alignment(
+                new_alignment = self._create_new_text_alignment(
-                    coord, textline, alignment
+                    coord, textline, alignment_id
                )
-                edge_array.insert(idx_insert, new_edge)
+                alignment_array.insert(idx_insert, new_alignment)
 class TextEdges(TextAlignments):
@ -201,7 +207,7 @@ class TextEdges(TextAlignments):
        """Adds a new text edge to the current dict.
        """
        te = self._create_new_text_alignment(coord, textline, align)
-        self._textedges[align].append(te)
+        self._text_alignments[align].append(te)
    def _update_edge(self, edge, coord, textline):
        edge.update_coords(coord, textline, self.edge_tol)
@ -221,15 +227,15 @@ class TextEdges(TextAlignments):
        """
        intersections_sum = {
            "left": sum(
-                len(te.textlines) for te in self._textedges["left"]
+                len(te.textlines) for te in self._text_alignments["left"]
                if te.is_valid
            ),
            "right": sum(
-                len(te.textlines) for te in self._textedges["right"]
+                len(te.textlines) for te in self._text_alignments["right"]
                if te.is_valid
            ),
            "middle": sum(
-                len(te.textlines) for te in self._textedges["middle"]
+                len(te.textlines) for te in self._text_alignments["middle"]
                if te.is_valid
            ),
        }
@ -240,7 +246,7 @@ class TextEdges(TextAlignments):
        relevant_align = max(intersections_sum.items(), key=itemgetter(1))[0]
        return list(filter(
            lambda te: te.is_valid,
-            self._textedges[relevant_align])
+            self._text_alignments[relevant_align])
        )
    def get_table_areas(self, textlines, relevant_textedges):
@ -443,9 +449,9 @@ class Table(object):
        self.filename = None
        self.order = None
        self.page = None
-        self.flavor = None      # Flavor of the parser that generated the table
+        self.flavor = None         # Flavor of the parser used
-        self.pdf_size = None    # Dimensions of the original PDF page
+        self.pdf_size = None       # Dimensions of the original PDF page
-        self.debug_info = None  # Field holding debug data
+        self.parse_details = None  # Field holding debug data
        self._image = None
        self._image_path = None  # Temporary file to hold an image of the pdf
@ -485,31 +491,6 @@ class Table(object):
        }
        return report
    def record_parse_metadata(self, parser):
        """Copy onto this table the details of the parse that produced it.

        Parameters
        ----------
        parser : object
            The parser that generated the table.  Its id, filename,
            debug_info, copy_text, pdf dimensions and text lists are read,
            and its compute_parse_errors method is called with this table.
        """
        self.flavor = parser.id
        self.filename = parser.filename
        self.debug_info = parser.debug_info
        self.accuracy = compute_accuracy(
            [[100, parser.compute_parse_errors(self)]]
        )
        # Spanning text must be copied before the data is read below.
        if parser.copy_text is not None:
            self.copy_spanning_text(parser.copy_text)
        table_data = self.data
        self.df = pd.DataFrame(table_data)
        self.shape = self.df.shape
        self.whitespace = compute_whitespace(table_data)
        self.pdf_size = (parser.pdf_width, parser.pdf_height)
        # Bounding boxes of every textline: horizontal first, then vertical.
        self._text = [
            (tl.x0, tl.y0, tl.x1, tl.y1)
            for tl in (*parser.horizontal_text, *parser.vertical_text)
        ]
    def get_pdf_image(self):
        """Compute pdf image and cache it
        """
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -3,11 +3,18 @@
 import os
 import warnings
 import numpy as np
 import pandas as pd
 from ..utils import (
    bbox_from_str,
    bbox_from_textlines,
    compute_accuracy,
    compute_whitespace,
    get_text_objects,
    get_table_index,
    text_in_bbox,
-    bbox_from_str,
+    text_in_bbox_per_axis,
 )
 from ..core import Table
@ -42,7 +49,7 @@ class BaseParser(object):
        self.t_bbox = None
        # For plotting details of parsing algorithms
-        self.debug_info = {} if debug else None
+        self.parse_details = {} if debug else None
    def prepare_page_parse(self, filename, layout, dimensions,
                           page_idx, layout_kwargs):
@ -63,9 +70,9 @@ class BaseParser(object):
        self.pdf_width, self.pdf_height = self.dimensions
        self.rootname, __ = os.path.splitext(self.filename)
-        if self.debug_info is not None:
+        if self.parse_details is not None:
-            self.debug_info["table_regions"] = self.table_regions
+            self.parse_details["table_regions"] = self.table_regions
-            self.debug_info["table_areas"] = self.table_areas
+            self.parse_details["table_areas"] = self.table_areas
    def _apply_regions_filter(self, textlines):
        """If regions have been specified, filter textlines to these regions.
@ -194,6 +201,31 @@ class BaseParser(object):
        return _tables
    def record_parse_metadata(self, table):
        """Store on ``table`` the details of the parse that produced it.

        Parameters
        ----------
        table : camelot.core.Table
            Table to annotate.  Its flavor, filename, parse_details,
            accuracy, df, shape, whitespace, pdf_size and _text attributes
            are set here.
        """
        table.flavor = self.id
        table.filename = self.filename
        table.parse_details = self.parse_details
        table.accuracy = compute_accuracy(
            [[100, self.compute_parse_errors(table)]]
        )
        # Spanning text must be copied before the data is read below.
        if self.copy_text is not None:
            table.copy_spanning_text(self.copy_text)
        table_data = table.data
        table.df = pd.DataFrame(table_data)
        table.shape = table.df.shape
        table.whitespace = compute_whitespace(table_data)
        table.pdf_size = (self.pdf_width, self.pdf_height)
        # Bounding boxes of every textline: horizontal first, then vertical.
        table._text = [
            (tl.x0, tl.y0, tl.x1, tl.y1)
            for tl in (*self.horizontal_text, *self.vertical_text)
        ]
 class TextBaseParser(BaseParser):
    """Base class for all text parsers.
@ -211,15 +243,17 @@ class TextBaseParser(BaseParser):
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        debug=False,
        **kwargs
    ):
        super().__init__(
-            "stream",
+            parser_id,
            table_regions=table_regions,
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
            debug=debug,
        )
        self.columns = columns
        self._validate_columns()
@ -227,4 +261,271 @@ class TextBaseParser(BaseParser):
        self.row_tol = row_tol
        self.column_tol = column_tol
-        self.textedges = None
+    @staticmethod
    def _group_rows(text, row_tol=2):
        """Groups PDFMiner text objects into rows vertically
        within a tolerance.
        Parameters
        ----------
        text : list
            List of PDFMiner text objects.
        row_tol : int, optional (default: 2)
        Returns
        -------
        rows : list
            Two-dimensional list of text objects grouped into rows.
        """
        row_y = None
        rows = []
        temp = []
        non_empty_text = [t for t in text if t.get_text().strip()]
        for t in non_empty_text:
            # is checking for upright necessary?
            # if t.get_text().strip() and all([obj.upright \
            #   for obj in t._objs
            # if type(obj) is LTChar]):
            if row_y is None:
                row_y = t.y0
            elif not np.isclose(row_y, t.y0, atol=row_tol):
                rows.append(sorted(temp, key=lambda t: t.x0))
                temp = []
                # We update the row's bottom as we go, to be forgiving if there
                # is a gradual change across multiple columns.
                row_y = t.y0
            temp.append(t)
        rows.append(sorted(temp, key=lambda t: t.x0))
        return rows
    @staticmethod
    def _merge_columns(l, column_tol=0):
        """Merges column boundaries horizontally if they overlap
        or lie within a tolerance.
        Parameters
        ----------
        l : list
            List of column x-coordinate tuples.
        column_tol : int, optional (default: 0)
        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.
        """
        merged = []
        for higher in l:
            if not merged:
                merged.append(higher)
            else:
                lower = merged[-1]
                if column_tol >= 0:
                    if higher[0] <= lower[1] or np.isclose(
                        higher[0], lower[1], atol=column_tol
                    ):
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
                elif column_tol < 0:
                    if higher[0] <= lower[1]:
                        if np.isclose(higher[0], lower[1],
                                      atol=abs(column_tol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
                            lower_bound = min(lower[0], higher[0])
                            merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
        return merged
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous. For the row to "touch"
        we split the existing gap between them in half.
        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int
        Returns
        -------
        rows : list
            List of continuous row y-coordinate tuples.
        """
        row_boundaries = [
            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
        for i in range(0, len(row_boundaries)-1):
            top_row = row_boundaries[i]
            bottom_row = row_boundaries[i+1]
            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
        row_boundaries[0][0] = text_y_max
        row_boundaries[-1][1] = text_y_min
        return row_boundaries
    @staticmethod
    def _add_columns(cols, text, row_tol):
        """Adds columns to existing list by taking into account
        the text that lies outside the current column x-coordinates.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text : list
            List of PDFMiner text objects.
        ytol : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        if text:
            text = TextBaseParser._group_rows(text, row_tol=row_tol)
            elements = [len(r) for r in text]
            new_cols = [
                (t.x0, t.x1)
                for r in text if len(r) == max(elements)
                for t in r
            ]
            cols.extend(TextBaseParser._merge_columns(sorted(new_cols)))
        return cols
    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
        text_y_max : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        return cols
    def _validate_columns(self):
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")
    def _generate_columns_and_rows(self, bbox, table_idx):
        """Compute the column and row boundaries of one table area.

        As a side effect, ``self.t_bbox`` is replaced with the result of
        ``text_in_bbox_per_axis`` for ``bbox``.

        Parameters
        ----------
        bbox : tuple
            Bounding box of the table area, passed as-is to
            ``text_in_bbox_per_axis``.
            NOTE(review): presumably (x0, y0, x1, y1) -- confirm.
        table_idx : int
            Index of the table; selects the matching entry of
            ``self.columns`` and is reported (1-based) in the warning.

        Returns
        -------
        cols : list
            List of column x-coordinate tuples.
        rows : list
            Row y-coordinates as returned by ``_join_rows``.
        The last two items of the returned 4-tuple are always None.
        """
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )
        # Overall extent of the selected text, both directions combined
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            self.t_bbox["horizontal"] + self.t_bbox["vertical"]
        )
        # Rows are derived from the horizontal text only
        rows_grouped = self._group_rows(
            self.t_bbox["horizontal"], row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        # Number of text objects found in each row
        elements = [len(r) for r in rows_grouped]
        if self.columns is not None and self.columns[table_idx] != "":
            # Columns were supplied for this table as a comma-separated
            # string of x-coordinates.  They are used as the inner
            # separators; the extent of the text supplies the outermost
            # boundaries.
            cols = self.columns[table_idx].split(",")
            cols = [float(c) for c in cols]
            cols.insert(0, text_x_min)
            cols.append(text_x_max)
            cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        else:
            # calculate mode of the list of number of elements in
            # each row to guess the number of columns
            ncols = max(set(elements), key=elements.count)
            if ncols == 1:
                # if the mode is 1, the page usually does not contain
                # tables, but there can be cases where the list is skewed:
                # try to remove all 1s from the list in this case and
                # see if the list still contains elements; if yes, then
                # use the mode after removing 1s
                elements = list(filter(lambda x: x != 1, elements))
                if elements:
                    ncols = max(set(elements), key=elements.count)
                else:
                    # Only a warning: processing continues with ncols == 1
                    warnings.warn(
                        "No tables found in table area {}"
                        .format(table_idx + 1)
                    )
            # Column spans come from the rows having exactly ncols elements
            cols = [
                (t.x0, t.x1)
                for r in rows_grouped
                if len(r) == ncols
                for t in r
            ]
            cols = self._merge_columns(
                sorted(cols),
                column_tol=self.column_tol
            )
            # Text lying strictly inside a gap between two consecutive
            # columns (both horizontal and vertical text are considered)
            inner_text = []
            for i in range(1, len(cols)):
                left = cols[i - 1][1]
                right = cols[i][0]
                inner_text.extend(
                    [
                        t
                        for direction in self.t_bbox
                        for t in self.t_bbox[direction]
                        if t.x0 > left and t.x1 < right
                    ]
                )
            # Text lying entirely right of the last column or left of the
            # first one
            outer_text = [
                t
                for direction in self.t_bbox
                for t in self.t_bbox[direction]
                if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
            ]
            inner_text.extend(outer_text)
            # Text not covered by the current columns may add columns
            cols = self._add_columns(cols, inner_text, self.row_tol)
            cols = self._join_columns(cols, text_x_min, text_x_max)
        return cols, rows, None, None
    def record_parse_metadata(self, table):
        """Record the origin of the table, plus the attributes used when
        plotting it.
        """
        super().record_parse_metadata(table)
        # Plotting support: no segments are recorded, only the table
        # bounding boxes kept by the parser.
        table._segments = None
        table._bbox = self.table_bbox
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        self.record_parse_metadata(table)
        return table
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -5,7 +5,6 @@ from __future__ import division
 import numpy as np
 import copy
 import warnings
 from .base import TextBaseParser
 from ..core import (
@ -17,7 +16,6 @@ from ..core import (
 from ..utils import (
    bbox_from_str,
    text_in_bbox,
    text_in_bbox_per_axis,
    bbox_from_textlines,
    distance_tl_to_bbox,
    find_columns_coordinates
@ -142,11 +140,11 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
 class AlignmentCounter(object):
    """
-    Represents all textlines aligned with a textline for each alignment.
+    For a given textline, represent all other textlines aligned with it.
-    A textline can be vertically aligned with others by having matching left,
+    A textline can be vertically aligned with others if their bbox match on
-    right, or middle edge, and horizontally aligned by having matching top,
+    left, right, or middle coord, and horizontally aligned if they match top,
-    bottom, or center edge.
+    bottom, or center coord.
    """
@ -210,15 +208,15 @@ class AlignmentCounter(object):
 class TextNetworks(TextAlignments):
-    """Text elements connected via both vertical (top, bottom, middle) and
+    """Text elements connected by vertical AND horizontal alignments.
-    horizontal (left, right, and middle) alignments found on the PDF page.
+
    The alignment dict has six keys based on the hor/vert alignments,
    and each key's value is a list of camelot.core.TextAlignment objects.
    """
    def __init__(self):
        super().__init__(ALL_ALIGNMENTS)
-        # For each textline, dictionary "edge type" to
+        # For each textline, dictionary "alignment type" to
        # "number of textlines aligned"
        self._textlines_alignments = {}
@ -226,10 +224,10 @@ class TextNetworks(TextAlignments):
        edge.register_aligned_textline(textline, coord)
    def _register_all_text_lines(self, textlines):
-        """Add all textlines to our edge repository to
+        """Add all textlines to our network repository to
        identify alignments.
        """
-        # Identify all the edge alignments
+        # Identify all the alignments
        for tl in textlines:
            if len(tl.get_text().strip()) > 0:
                self._register_textline(tl)
@ -237,7 +235,7 @@ class TextNetworks(TextAlignments):
    def _compute_alignment_counts(self):
        """Build a dictionary textline -> alignment object.
        """
-        for align_id, textedges in self._textedges.items():
+        for align_id, textedges in self._text_alignments.items():
            for textedge in textedges:
                for textline in textedge.textlines:
                    alignments = self._textlines_alignments.get(
@ -254,8 +252,8 @@ class TextNetworks(TextAlignments):
        the core table.
        """
        h_gaps, v_gaps = [], []
-        for align_id in self._textedges:
+        for align_id in self._text_alignments:
-            edge_array = self._textedges[align_id]
+            edge_array = self._text_alignments[align_id]
            gaps = []
            vertical = align_id in HORIZONTAL_ALIGNMENTS
            sort_function = (lambda tl: tl.y0) \
@ -299,7 +297,7 @@ class TextNetworks(TextAlignments):
        removed_singletons = True
        while removed_singletons:
            removed_singletons = False
-            for alignment_id, textalignments in self._textedges.items():
+            for alignment_id, textalignments in self._text_alignments.items():
                # For each alignment edge, remove items if they are singletons
                # either horizontally or vertically
                for ta in textalignments:
@ -313,7 +311,7 @@ class TextNetworks(TextAlignments):
            self._textlines_alignments = {}
            self._compute_alignment_counts()
-    def _most_connected_textline(self):
+    def most_connected_textline(self):
        """ Retrieve the textline that is most connected across vertical and
        horizontal axis.
@ -340,7 +338,7 @@ class TextNetworks(TextAlignments):
        # alignments across horizontal and vertical axis.
        # It will serve as a reference axis along which to collect the average
        # spacing between rows/cols.
-        most_aligned_tl = self._most_connected_textline()
+        most_aligned_tl = self.most_connected_textline()
        if most_aligned_tl is None:
            return None
@ -378,7 +376,7 @@ class TextNetworks(TextAlignments):
        )
        return gaps_hv
-    def _build_bbox_candidate(self, gaps_hv, debug_info=None):
+    def _build_bbox_candidate(self, gaps_hv, parse_details=None):
        """ Seed the process with the textline with the highest alignment
        score, then expand the bbox with textlines within threshold.
@ -387,7 +385,7 @@ class TextNetworks(TextAlignments):
        gaps_hv : tuple
             The maximum distance allowed to consider surrounding lines/columns
             as part of the same table.
-        debug_info : array (optional)
+        parse_details : array (optional)
            Optional parameter array, in which to store extra information
            to help later visualization of the table creation.
        """
@ -396,23 +394,23 @@ class TextNetworks(TextAlignments):
        # It will serve both as a starting point for the table boundary
        # search, and as a way to estimate the average spacing between
        # rows/cols.
-        most_aligned_tl = self._most_connected_textline()
+        most_aligned_tl = self.most_connected_textline()
        # Calculate the 75th percentile of the horizontal/vertical
        # gaps between textlines.  Use this as a reference for a threshold
        # to not exceed while looking for table boundaries.
        max_h_gap, max_v_gap = gaps_hv[0], gaps_hv[1]
-        if debug_info is not None:
+        if parse_details is not None:
            # Store debug info
-            debug_info_search = {
+            parse_details_search = {
                "max_h_gap": max_h_gap,
                "max_v_gap": max_v_gap,
                "iterations": []
            }
-            debug_info.append(debug_info_search)
+            parse_details.append(parse_details_search)
        else:
-            debug_info_search = None
+            parse_details_search = None
        MINIMUM_TEXTLINES_IN_TABLE = 6
        bbox = (most_aligned_tl.x0, most_aligned_tl.y0,
@ -426,9 +424,9 @@ class TextNetworks(TextAlignments):
        tls_in_bbox = [most_aligned_tl]
        last_bbox = None
        while last_bbox != bbox:
-            if debug_info_search is not None:
+            if parse_details_search is not None:
                # Store debug info
-                debug_info_search["iterations"].append(bbox)
+                parse_details_search["iterations"].append(bbox)
            last_bbox = bbox
            # Go through all remaining textlines, expand our bbox
@ -461,35 +459,6 @@ class TextNetworks(TextAlignments):
        self._register_all_text_lines(textlines)
        self._compute_alignment_counts()
    def plot_alignments(self, ax):
        """Displays a visualization of the alignments as currently computed.
        """
        # FRHTODO: This is too busy and doesn't plot lines
        most_aligned_tl = sorted(
            self._textlines_alignments.keys(),
            key=lambda textline:
            self._textlines_alignments[textline].alignment_score(),
            reverse=True
        )[0]
        ax.add_patch(
            patches.Rectangle(
                (most_aligned_tl.x0, most_aligned_tl.y0),
                most_aligned_tl.x1 - most_aligned_tl.x0,
                most_aligned_tl.y1 - most_aligned_tl.y0,
                color="red",
                alpha=0.5
            )
        )
        for tl, alignments in self._textlines_alignments.items():
            ax.text(
                tl.x0 - 5,
                tl.y0 - 5,
                f"{alignments.max_h_count()}x{alignments.max_v_count()}",
                fontsize=5,
                color="black"
            )
 class Hybrid(TextBaseParser):
    """Hybrid method of parsing looks for spaces between text
@ -555,190 +524,9 @@ class Hybrid(TextBaseParser):
            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
            debug=debug,
        )
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _group_rows(text, row_tol=2):
        """Groups PDFMiner text objects into rows vertically
        within a tolerance.
        Parameters
        ----------
        text : list
            List of PDFMiner text objects.
        row_tol : int, optional (default: 2)
        Returns
        -------
        rows : list
            Two-dimensional list of text objects grouped into rows.
        """
        row_y = None
        rows = []
        temp = []
        non_empty_text = [t for t in text if t.get_text().strip()]
        for t in non_empty_text:
            # is checking for upright necessary?
            # if t.get_text().strip() and all([obj.upright \
            #   for obj in t._objs
            # if type(obj) is LTChar]):
            if row_y is None:
                row_y = t.y0
            elif not np.isclose(row_y, t.y0, atol=row_tol):
                rows.append(sorted(temp, key=lambda t: t.x0))
                temp = []
                # We update the row's bottom as we go, to be forgiving if there
                # is a gradual change across multiple columns.
                row_y = t.y0
            temp.append(t)
        rows.append(sorted(temp, key=lambda t: t.x0))
        return rows
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _merge_columns(l, column_tol=0):
        """Merges column boundaries horizontally if they overlap
        or lie within a tolerance.
        Parameters
        ----------
        l : list
            List of column x-coordinate tuples.
        column_tol : int, optional (default: 0)
        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.
        """
        merged = []
        for higher in l:
            if not merged:
                merged.append(higher)
            else:
                lower = merged[-1]
                if column_tol >= 0:
                    if higher[0] <= lower[1] or np.isclose(
                        higher[0], lower[1], atol=column_tol
                    ):
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
                elif column_tol < 0:
                    if higher[0] <= lower[1]:
                        if np.isclose(higher[0], lower[1],
                                      atol=abs(column_tol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
                            lower_bound = min(lower[0], higher[0])
                            merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
        return merged
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous. For the row to "touch"
        we split the existing gap between them in half.
        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int
        Returns
        -------
        rows : list
            List of continuous row y-coordinate tuples.
        """
        row_boundaries = [
            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
        for i in range(0, len(row_boundaries)-1):
            top_row = row_boundaries[i]
            bottom_row = row_boundaries[i+1]
            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
        row_boundaries[0][0] = text_y_max
        row_boundaries[-1][1] = text_y_min
        return row_boundaries
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _add_columns(cols, text, row_tol):
        """Add columns to existing list by taking into account
        the text that lies outside the current column x-coordinates.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text : list
            List of PDFMiner text objects.
        ytol : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        if text:
            text = Hybrid._group_rows(text, row_tol=row_tol)
            elements = [len(r) for r in text]
            new_cols = [
                (t.x0, t.x1)
                for r in text if len(r) == max(elements)
                for t in r
            ]
            cols.extend(Hybrid._merge_columns(sorted(new_cols)))
        return cols
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
        text_y_max : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        return cols
    # FRHTODO: Check if needed, refactor with Stream
    def _validate_columns(self):
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")
    def _generate_table_bbox(self):
        if self.table_areas is not None:
            table_bbox = {}
@ -756,25 +544,21 @@ class Hybrid(TextBaseParser):
        textlines_processed = {}
        self.table_bbox = {}
-        if self.debug_info is not None:
+        if self.parse_details is not None:
-            debug_info_edges_searches = []
+            parse_details_network_searches = []
-            self.debug_info["edges_searches"] = debug_info_edges_searches
+            self.parse_details["network_searches"] = \
-            debug_info_bboxes_searches = []
+                parse_details_network_searches
-            self.debug_info["bboxes_searches"] = debug_info_bboxes_searches
+            parse_details_bbox_searches = []
            self.parse_details["bbox_searches"] = parse_details_bbox_searches
        else:
-            debug_info_edges_searches = None
+            parse_details_network_searches = None
-            debug_info_bboxes_searches = None
+            parse_details_bbox_searches = None
        while True:
-            self.textedges = TextNetworks()
+            text_network = TextNetworks()
-            self.textedges.generate(textlines)
+            text_network.generate(textlines)
-            self.textedges._remove_unconnected_edges()
+            text_network._remove_unconnected_edges()
-            if debug_info_edges_searches is not None:
+            gaps_hv = text_network._compute_plausible_gaps()
                # Preserve the current edge calculation for display debugging
                debug_info_edges_searches.append(
                    copy.deepcopy(self.textedges)
                )
            gaps_hv = self.textedges._compute_plausible_gaps()
            if gaps_hv is None:
                return None
            # edge_tol instructions override the calculated vertical gap
@ -782,13 +566,19 @@ class Hybrid(TextBaseParser):
                gaps_hv[0],
                gaps_hv[1] if self.edge_tol is None else self.edge_tol
            )
-            bbox = self.textedges._build_bbox_candidate(
+            bbox = text_network._build_bbox_candidate(
                edge_tol_hv,
-                debug_info=debug_info_bboxes_searches
+                parse_details=parse_details_bbox_searches
            )
            if bbox is None:
                break
            if parse_details_network_searches is not None:
                # Preserve the current edge calculation for display debugging
                parse_details_network_searches.append(
                    copy.deepcopy(text_network)
                )
            # Get all the textlines that are at least 50% in the box
            tls_in_bbox = text_in_bbox(bbox, textlines)
@ -808,10 +598,10 @@ class Hybrid(TextBaseParser):
                gaps_hv[1]
            )
-            if self.debug_info is not None:
+            if self.parse_details is not None:
-                if "col_searches" not in self.debug_info:
+                if "col_searches" not in self.parse_details:
-                    self.debug_info["col_searches"] = []
+                    self.parse_details["col_searches"] = []
-                self.debug_info["col_searches"].append({
+                self.parse_details["col_searches"].append({
                    "core_bbox": bbox,
                    "cols_anchors": cols_anchors,
                    "expanded_bbox": expanded_bbox
@ -826,95 +616,3 @@ class Hybrid(TextBaseParser):
                lambda tl: tl not in textlines_processed,
                textlines
            ))
    # FRHTODO: Check if needed, refactor with Stream
    def _generate_columns_and_rows(self, bbox, table_idx):
        """Compute the column and row boundaries of one table.
        Stores the textlines found inside ``bbox`` in ``self.t_bbox``.
        Columns come from ``self.columns[table_idx]`` when the user
        provided a non-empty specification; otherwise they are guessed
        from the rows holding the most common number of textlines.
        Parameters
        ----------
        bbox : tuple
            Bounding box of the table.
        table_idx : int
            Index of the table, used to look up user columns and in
            the warning message.
        Returns
        -------
        tuple
            ``(cols, rows, None, None)``.
        """
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )
        horizontal_tls = self.t_bbox["horizontal"]
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            horizontal_tls + self.t_bbox["vertical"]
        )
        rows_grouped = self._group_rows(horizontal_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        has_user_columns = (
            self.columns is not None and self.columns[table_idx] != ""
        )
        if has_user_columns:
            # The user supplies inner separators only; the outer
            # boundaries are the extents of the text in the box.
            boundaries = [text_x_min]
            boundaries.extend(
                float(c) for c in self.columns[table_idx].split(",")
            )
            boundaries.append(text_x_max)
            return list(zip(boundaries, boundaries[1:])), rows, None, None
        # Guess the number of columns as the most frequent row length.
        row_sizes = [len(row) for row in rows_grouped]
        ncols = max(set(row_sizes), key=row_sizes.count)
        if ncols == 1:
            # A mode of 1 usually means no table, but the distribution
            # can be skewed: retry the mode while ignoring
            # single-element rows.
            multi_sizes = [size for size in row_sizes if size != 1]
            if multi_sizes:
                ncols = max(set(multi_sizes), key=multi_sizes.count)
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        anchors = sorted(
            (t.x0, t.x1)
            for row in rows_grouped
            if len(row) == ncols
            for t in row
        )
        cols = self._merge_columns(anchors, column_tol=self.column_tol)
        all_tls = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # Textlines strictly inside a gap between two adjacent columns...
        stray_text = []
        for left_col, right_col in zip(cols, cols[1:]):
            left = left_col[1]
            right = right_col[0]
            stray_text.extend(
                t for t in all_tls if t.x0 > left and t.x1 < right
            )
        # ...followed by textlines outside the outermost columns.
        stray_text.extend(
            t for t in all_tls
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        )
        cols = self._add_columns(cols, stray_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)
        return cols, rows, None, None
    # FRHTODO: Check if needed, refactor with Stream
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Build the table for the given columns and rows.
        The table is created through ``self._initialize_new_table``,
        all of its edges are set, and parse metadata plus plotting
        attributes are attached to it. ``kwargs`` is accepted but not
        used here.
        Parameters
        ----------
        table_idx : int
        cols : list
        rows : list
        Returns
        -------
        table
            The object returned by ``set_all_edges()``.
        """
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        table.record_parse_metadata(self)
        # for plotting
        table._bbox = self.table_bbox
        table._segments = None
        # NOTE(review): requires self.textedges to have been set before
        # this is called -- confirm where this parser assigns it.
        table._textedges = self.textedges
        return table
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -168,6 +168,15 @@ class Lattice(BaseParser):
            indices.append((r_idx, c_idx, text))
        return indices
    def record_parse_metadata(self, table):
        """Record data about the origin of the table.
        Delegates to the parent implementation first, then attaches
        the attributes used for plotting to ``table``.
        Parameters
        ----------
        table
            Table object to annotate; it is modified in place.
        """
        super().record_parse_metadata(table)
        # for plotting
        table._image = self.pdf_image  # Reuse the image used for calc
        table._bbox_unscaled = self.table_bbox_unscaled
        # Stored as a (vertical, horizontal) pair.
        table._segments = (self.vertical_segments, self.horizontal_segments)
    def _generate_table_bbox(self):
        def scale_areas(areas):
            scaled_areas = []
@ -293,12 +302,5 @@ class Lattice(BaseParser):
        # set spanning cells to True
        table = table.set_span()
-        table.record_parse_metadata(self)
+        self.record_parse_metadata(table)
        # for plotting
        table._image = self.pdf_image  # Reuse the image used for calc
        table._bbox_unscaled = self.table_bbox_unscaled
        table._segments = (self.vertical_segments, self.horizontal_segments)
        table._textedges = None
        return table
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -1,17 +1,12 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 import warnings
 import numpy as np
 from .base import TextBaseParser
 from ..core import TextEdges
 from ..utils import (
    bbox_from_str,
-    bbox_from_textlines,
+    text_in_bbox
    text_in_bbox,
    text_in_bbox_per_axis
 )
@ -79,182 +74,7 @@ class Stream(TextBaseParser):
            row_tol=row_tol,
            column_tol=column_tol,
        )
-
+        self.textedges = []
    @staticmethod
    def _group_rows(text, row_tol=2):
        """Groups PDFMiner text objects into rows vertically
        within a tolerance.
        Parameters
        ----------
        text : list
            List of PDFMiner text objects.
        row_tol : int, optional (default: 2)
        Returns
        -------
        rows : list
            Two-dimensional list of text objects grouped into rows.
        """
        row_y = None
        rows = []
        temp = []
        non_empty_text = [t for t in text if t.get_text().strip()]
        for t in non_empty_text:
            # is checking for upright necessary?
            # if t.get_text().strip() and all([obj.upright \
            #   for obj in t._objs
            # if type(obj) is LTChar]):
            if row_y is None:
                row_y = t.y0
            elif not np.isclose(row_y, t.y0, atol=row_tol):
                rows.append(sorted(temp, key=lambda t: t.x0))
                temp = []
                # We update the row's bottom as we go, to be forgiving if there
                # is a gradual change across multiple columns.
                row_y = t.y0
            temp.append(t)
        rows.append(sorted(temp, key=lambda t: t.x0))
        return rows
    @staticmethod
    def _merge_columns(l, column_tol=0):
        """Merges column boundaries horizontally if they overlap
        or lie within a tolerance.
        Parameters
        ----------
        l : list
            List of column x-coordinate tuples.
        column_tol : int, optional (default: 0)
        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.
        """
        merged = []
        for higher in l:
            if not merged:
                merged.append(higher)
            else:
                lower = merged[-1]
                if column_tol >= 0:
                    if higher[0] <= lower[1] or np.isclose(
                        higher[0], lower[1], atol=column_tol
                    ):
                        upper_bound = max(lower[1], higher[1])
                        lower_bound = min(lower[0], higher[0])
                        merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
                elif column_tol < 0:
                    if higher[0] <= lower[1]:
                        if np.isclose(higher[0], lower[1],
                                      atol=abs(column_tol)):
                            merged.append(higher)
                        else:
                            upper_bound = max(lower[1], higher[1])
                            lower_bound = min(lower[0], higher[0])
                            merged[-1] = (lower_bound, upper_bound)
                    else:
                        merged.append(higher)
        return merged
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous. For the row to "touch"
        we split the existing gap between them in half.
        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int
        Returns
        -------
        rows : list
            List of continuous row y-coordinate tuples.
        """
        row_boundaries = [
            [
                max(t.y1 for t in r),
                min(t.y0 for t in r)
            ]
            for r in rows_grouped
        ]
        for i in range(0, len(row_boundaries)-1):
            top_row = row_boundaries[i]
            bottom_row = row_boundaries[i+1]
            top_row[1] = bottom_row[0] = (top_row[1] + bottom_row[0]) / 2
        row_boundaries[0][0] = text_y_max
        row_boundaries[-1][1] = text_y_min
        return row_boundaries
    @staticmethod
    def _add_columns(cols, text, row_tol):
        """Adds columns to existing list by taking into account
        the text that lies outside the current column x-coordinates.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text : list
            List of PDFMiner text objects.
        ytol : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        if text:
            text = Stream._group_rows(text, row_tol=row_tol)
            elements = [len(r) for r in text]
            new_cols = [
                (t.x0, t.x1)
                for r in text if len(r) == max(elements)
                for t in r
            ]
            cols.extend(Stream._merge_columns(sorted(new_cols)))
        return cols
    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.
        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
        text_y_max : int
        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.
        """
        cols = sorted(cols)
        cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
        cols.insert(0, text_x_min)
        cols.append(text_x_max)
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        return cols
    def _validate_columns(self):
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")
    def _nurminen_table_detection(self, textlines):
        """A general implementation of the table detection algorithm
@ -281,8 +101,13 @@ class Stream(TextBaseParser):
        return table_bbox
    def record_parse_metadata(self, table):
        """Record data about the origin of the table.
        Delegates to the parent implementation, then stores this
        parser's ``textedges`` on the table.
        Parameters
        ----------
        table
            Table object to annotate; it is modified in place.
        """
        super().record_parse_metadata(table)
        # Same object as on the parser, not a copy.
        table._textedges = self.textedges
    def _generate_table_bbox(self):
        self.textedges = []
        if self.table_areas is None:
            hor_text = self.horizontal_text
            if self.table_regions is not None:
@ -300,93 +125,3 @@ class Stream(TextBaseParser):
            for area_str in self.table_areas:
                table_bbox[bbox_from_str(area_str)] = None
        self.table_bbox = table_bbox
    def _generate_columns_and_rows(self, bbox, table_idx):
        """Compute the column and row boundaries of one table.
        Stores the textlines found inside ``bbox`` in ``self.t_bbox``.
        Columns come from ``self.columns[table_idx]`` when the user
        provided a non-empty specification; otherwise they are guessed
        from the rows holding the most common number of textlines.
        Parameters
        ----------
        bbox : tuple
            Bounding box of the table.
        table_idx : int
            Index of the table, used to look up user columns and in
            the warning message.
        Returns
        -------
        tuple
            ``(cols, rows, None, None)``.
        """
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
            bbox,
            self.horizontal_text,
            self.vertical_text
        )
        horizontal_tls = self.t_bbox["horizontal"]
        text_x_min, text_y_min, text_x_max, text_y_max = bbox_from_textlines(
            horizontal_tls + self.t_bbox["vertical"]
        )
        rows_grouped = self._group_rows(horizontal_tls, row_tol=self.row_tol)
        rows = self._join_rows(rows_grouped, text_y_max, text_y_min)
        has_user_columns = (
            self.columns is not None and self.columns[table_idx] != ""
        )
        if has_user_columns:
            # The user supplies inner separators only; the outer
            # boundaries are the extents of the text in the box.
            boundaries = [text_x_min]
            boundaries.extend(
                float(c) for c in self.columns[table_idx].split(",")
            )
            boundaries.append(text_x_max)
            return list(zip(boundaries, boundaries[1:])), rows, None, None
        # Guess the number of columns as the most frequent row length.
        row_sizes = [len(row) for row in rows_grouped]
        ncols = max(set(row_sizes), key=row_sizes.count)
        if ncols == 1:
            # A mode of 1 usually means no table, but the distribution
            # can be skewed: retry the mode while ignoring
            # single-element rows.
            multi_sizes = [size for size in row_sizes if size != 1]
            if multi_sizes:
                ncols = max(set(multi_sizes), key=multi_sizes.count)
            else:
                warnings.warn(
                    "No tables found in table area {}"
                    .format(table_idx + 1)
                )
        anchors = sorted(
            (t.x0, t.x1)
            for row in rows_grouped
            if len(row) == ncols
            for t in row
        )
        cols = self._merge_columns(anchors, column_tol=self.column_tol)
        all_tls = [
            t
            for direction in self.t_bbox
            for t in self.t_bbox[direction]
        ]
        # Textlines strictly inside a gap between two adjacent columns...
        stray_text = []
        for left_col, right_col in zip(cols, cols[1:]):
            left = left_col[1]
            right = right_col[0]
            stray_text.extend(
                t for t in all_tls if t.x0 > left and t.x1 < right
            )
        # ...followed by textlines outside the outermost columns.
        stray_text.extend(
            t for t in all_tls
            if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
        )
        cols = self._add_columns(cols, stray_text, self.row_tol)
        cols = self._join_columns(cols, text_x_min, text_x_max)
        return cols, rows, None, None
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        """Build the table for the given columns and rows.
        The table is created through ``self._initialize_new_table``,
        all of its edges are set, and parse metadata plus plotting
        attributes are attached to it. ``kwargs`` is accepted but not
        used here.
        Parameters
        ----------
        table_idx : int
        cols : list
        rows : list
        Returns
        -------
        table
            The object returned by ``set_all_edges()``.
        """
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        table.record_parse_metadata(self)
        # for plotting
        table._bbox = self.table_bbox
        table._segments = None
        table._textedges = self.textedges
        return table
--- a/camelot/plotting.py
+++ b/camelot/plotting.py
@ -87,9 +87,9 @@ def draw_parse_constraints(table, ax):
    ax : matplotlib.axes.Axes
    """
-    if table.debug_info:
+    if table.parse_details:
        # Display a bbox per region
-        for region_str in table.debug_info["table_regions"] or []:
+        for region_str in table.parse_details["table_regions"] or []:
            draw_labeled_bbox(
                ax, bbox_from_str(region_str),
                "region: ({region_str})".format(region_str=region_str),
@ -99,7 +99,7 @@ def draw_parse_constraints(table, ax):
                label_pos="bottom,right"
            )
        # Display a bbox per area
-        for area_str in table.debug_info["table_areas"] or []:
+        for area_str in table.parse_details["table_areas"] or []:
            draw_labeled_bbox(
                ax, bbox_from_str(area_str),
                "area: ({area_str})".format(area_str=area_str),
@ -294,8 +294,27 @@ class PlotMethods(object):
        ax.set_ylim(min(ys) - 10, max(ys) + 10)
        if table.flavor == "hybrid":
-            # FRHTODO: Clean this up
+            for text_network in table.parse_details["network_searches"]:
-            table.debug_info["edges_searches"][0].plot_alignments(ax)
+                # FRHTODO: This is too busy and doesn't plot lines
                most_connected_tl = text_network.most_connected_textline()
                ax.add_patch(
                    patches.Rectangle(
                        (most_connected_tl.x0, most_connected_tl.y0),
                        most_connected_tl.x1 - most_connected_tl.x0,
                        most_connected_tl.y1 - most_connected_tl.y0,
                        color="red",
                        alpha=0.5
                    )
                )
                for tl, alignments in text_network._textlines_alignments.items():
                    ax.text(
                        tl.x0 - 5,
                        tl.y0 - 5,
                        f"{alignments.max_h_count()}x{alignments.max_v_count()}",
                        fontsize=5,
                        color="black"
                    )
        else:
            for te in table._textedges:
                ax.plot([te.coord, te.coord], [te.y0, te.y1])
@ -372,10 +391,10 @@ class PlotMethods(object):
        draw_pdf(table, ax)
        draw_parse_constraints(table, ax)
-        if table.debug_info is None:
+        if table.parse_details is None:
            return fig
-        debug_info = table.debug_info
+        parse_details = table.parse_details
-        for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
+        for box_id, bbox_search in enumerate(parse_details["bbox_searches"]):
            max_h_gap = bbox_search["max_h_gap"]
            max_v_gap = bbox_search["max_v_gap"]
            iterations = bbox_search["iterations"]
@ -403,7 +422,7 @@ class PlotMethods(object):
                    )
                )
-        for box_id, col_search in enumerate(debug_info["col_searches"]):
+        for box_id, col_search in enumerate(parse_details["col_searches"]):
            draw_labeled_bbox(
                ax, col_search["expanded_bbox"],
                "box body + header #{box_id}".format(
@ -422,10 +441,5 @@ class PlotMethods(object):
                linewidth=2,
                label_pos="bottom,left"
            )
            # self.debug_info["col_searches"].append({
            #     "core_bbox": bbox,
            #     "cols_anchors": cols_anchors,
            #     "expanded_bbox": expanded_bbox
            # })
        return fig
--- a/tests/files/baseline_plots/test_hybrid_contour_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_contour_plot.png
--- a/tests/files/baseline_plots/test_hybrid_textedge_plot.png
+++ b/tests/files/baseline_plots/test_hybrid_textedge_plot.png