Interim check-in, test failing and lots of todos

2020-04-19 18:26:38 -07:00 · 2020-04-19 18:26:38 -07:00 · f5fe92c22e
parent c1c9358778
commit f5fe92c22e
3 changed files with 691 additions and 60 deletions
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -1,15 +1,620 @@
-
-
 # -*- coding: utf-8 -*-
-
 from __future__ import division
-import warnings

 import numpy as np

+import warnings
+
 from .base import BaseParser
-from ..core import TextEdges
-from ..utils import (text_in_bbox, text_in_bbox_per_axis)
+from ..utils import (
+    text_in_bbox,
+    text_in_bbox_per_axis,
+    bbox_from_text,
+    distance_tl_to_bbox,
+    find_columns_coordinates
+)
+
+from matplotlib import patches as patches
+
+# FRHTODO: Move to utils
+# maximum number of columns over which a header can spread
+MAX_COL_SPREAD_IN_HEADER = 3
+
+
+def todo_move_me_expand_area_for_header(area, textlines, col_anchors,
+                                        average_row_height):
+    """The core algorithm is based on fairly strict alignment of text.
+    It works ok for the table body, but might fail on tables' headers
+    since they tend to be in a different font, alignment (e.g. vertical),
+    etc.
+    The section below tries to identify whether what's above the bbox
+    identified so far has the characteristics of a table header:
+    Close to the top of the body, with cells that fit within the bounds
+    identified.
+    """
+    new_area = area
+    (left, bottom, right, top) = area
+    zones = []
+
+    def column_spread(left, right, col_anchors):
+        """Returns the number of columns (splits on the x-axis)
+        crossed by an element covering left to right.
+        """
+        indexLeft = 0
+        while indexLeft < len(col_anchors) \
+                and col_anchors[indexLeft] < left:
+            indexLeft += 1
+        indexRight = indexLeft
+        while indexRight < len(col_anchors) \
+                and col_anchors[indexRight] < right:
+            indexRight += 1
+
+        return indexRight - indexLeft
+
+    keep_searching = True
+    while keep_searching:
+        keep_searching = False
+        # a/ first look for the closest text element above the area.
+        # It will be the anchor for a possible new row.
+        closest_above = None
+        all_above = []
+        for te in textlines:
+            # higher than the table, directly within its bounds
+            if te.y0 > top and te.x0 > left and te.x1 < right:
+                all_above.append(te)
+                if closest_above is None or closest_above.y0 > te.y0:
+                    closest_above = te
+
+        if closest_above and \
+                closest_above.y0 < top + average_row_height:
+            # b/ We have a candidate cell that is within the correct
+            # vertical band, and directly above the table. Starting from
+            # this anchor, we list all the textlines within the same row.
+            tls_in_new_row = []
+            top = closest_above.y1
+            pushed_up = True
+            while pushed_up:
+                pushed_up = False
+                # Iterate and extract elements that fit in the row
+                # from our list
+                for i in range(len(all_above) - 1, -1, -1):
+                    te = all_above[i]
+                    if te.y0 < top:
+                        # The bottom of this element is within our row
+                        # so we add it.
+                        tls_in_new_row.append(te)
+                        all_above.pop(i)
+                        if te.y1 > top:
+                            # If the top of this element raises our row's
+                            # band, we'll need to keep on searching for
+                            # overlapping items
+                            top = te.y1
+                            pushed_up = True
+
+            # Get the x-ranges for all the textlines, and merge the
+            # x-ranges that overlap
+            zones = zones + \
+                list(map(lambda tl: [tl.x0, tl.x1], tls_in_new_row))
+            zones.sort(key=lambda z: z[0])  # Sort by left coordinate
+            # Starting from the right, if two zones overlap horizontally,
+            # merge them
+            merged_something = True
+            while merged_something:
+                merged_something = False
+                for i in range(len(zones) - 1, 0, -1):
+                    zone_right = zones[i]
+                    zone_left = zones[i-1]
+                    if zone_left[1] >= zone_right[0]:
+                        zone_left[1] = max(zone_right[1], zone_left[1])
+                        zones.pop(i)
+                        merged_something = True
+
+            max_spread = max(
+                list(
+                    map(
+                        lambda zone: column_spread(
+                            zone[0], zone[1], col_anchors),
+                        zones
+                    )
+                )
+            )
+            if max_spread <= MAX_COL_SPREAD_IN_HEADER:
+                # Combined, the elements we've identified don't cross more
+                # than the authorized number of columns.
+                # We're trying to avoid
+                # 0: <BAD: Added header spans too broad>
+                # 1: <A1>    <B1>    <C1>    <D1>    <E1>
+                # 2: <A2>    <B2>    <C2>    <D2>    <E2>
+                # if len(zones) > TEXTEDGE_REQUIRED_ELEMENTS:
+                new_area = (left, bottom, right, top)
+
+                # At this stage we've identified a plausible row (or the
+                # beginning of one).
+                keep_searching = True
+
+    return new_area
+
+
+class TextEdge2(object):
+    """Defines a text edge coordinates relative to a left-bottom
+    origin. (PDF coordinate space)
+
+    Parameters
+    ----------
+    coord : float
+        coordinate of the text edge. Depending on the alignment
+        it could be a vertical or horizontal coordinate.
+
+    Attributes
+    ----------
+    textlines: array
+        Array of textlines that demonstrate this alignment.
+    coord: float
+        The coordinate aligned averaged out across textlines.
+
+    """
+
+    def __init__(self, coord, textline):
+        self.coord = coord
+        self.textlines = [textline]
+
+    def __repr__(self):
+        text_inside = " | ".join(
+            map(lambda x: x.get_text(), self.textlines[:2])).replace("\n", "")
+        return f"<TextEdge coord={self.coord} tl={len(self.textlines)} " \
+               f"textlines text='{text_inside}...'>"
+
+    def register_aligned_textline(self, textline, coord):
+        """Updates new textline to this alignment, adapting its average.
+        """
+        # Increase the intersections for this segment, expand it up,
+        # and adjust the x based on the new value
+        self.coord = (self.coord * len(self.textlines) + coord) / \
+            float(len(self.textlines) + 1)
+        self.textlines.append(textline)
+
+
+class Alignments(object):
+    """Represents the number of other textlines aligned with this
+    one across each edge.
+    """
+
+    def __init__(self):
+        # Vertical alignments
+        self.left = 0
+        self.right = 0
+        self.middle = 0
+
+        # Horizontal alignments
+        self.bottom = 0
+        self.top = 0
+        self.center = 0
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        return setattr(self, key, value)
+
+    def max_v(self):
+        """Returns the maximum number of alignments along
+        one of the vertical axis (left/right/middle).
+        """
+        return max(self.left, self.right, self.middle)
+
+    def max_h(self):
+        """Returns the maximum number of alignments along
+        one of the horizontal axis (bottom/top/center).
+        """
+        return max(self.bottom, self.top, self.center)
+
+    def max_v_edge_name(self):
+        """Returns the name of the vertical edge that has the
+        maximum number of alignments.
+        """
+        return max(
+            ["left", "right", "middle"],
+            key=lambda edge_name: self[edge_name]
+        )
+
+    def max_h_edge_name(self):
+        """Returns the name of the horizontal edge that has the
+        maximum number of alignments.
+        """
+        return max(
+            ["bottom", "top", "center"],
+            key=lambda edge_name: self[edge_name]
+        )
+
+    def alignment_score(self):
+        """We define the alignment score of a textline as the product of the
+        number of aligned elements - 1. The -1 is to avoid favoring
+         singletons on a long line.
+        """
+        return (self.max_v()-1) * (self.max_h()-1)
+
+
+class TextEdges2(object):
+    """Defines a dict of vertical (top, bottom, middle) and
+    horizontal (left, right, and middle) text alignments found on
+    the PDF page. The dict has three keys based on the alignments,
+    and each key's value is a list of camelot.core.TextEdge objects.
+    """
+
+    def __init__(self):
+        # For each possible alignment, list of tuples coordinate/textlines
+        self._textedges = {
+            "left": [],
+            "right": [],
+            "middle": [],
+            "bottom": [],
+            "top": [],
+            "center": []
+        }
+        # For each textline, dictionary "edge type" to
+        # "number of textlines aligned"
+        self._textlines_alignments = {}
+
+        # Maximum number of distinct aligned elements in rows/cols
+        self.max_rows = None
+        self.max_cols = None
+
+    @staticmethod
+    def get_textline_coords(textline):
+        """Calculate the coordinates of each alignment
+        for a given textline.
+        """
+        return {
+            "left": textline.x0,
+            "right": textline.x1,
+            "middle": (textline.x0 + textline.x1) / 2.0,
+            "bottom": textline.y0,
+            "top": textline.y1,
+            "center": (textline.y0 + textline.y1) / 2.0,
+        }
+
+    @staticmethod
+    def _get_index_closest_point(coord, edge_array):
+        """Returns the index of the closest point
+        """
+        n = len(edge_array)
+        if n == 0:
+            return None
+        if n == 1:
+            return 0
+
+        left = 0
+        right = n - 1
+        mid = 0
+
+        if coord >= edge_array[n - 1].coord:
+            return n - 1
+        if coord <= edge_array[0].coord:
+            return 0
+
+        while left < right:
+            mid = (left + right) // 2  # find the mid
+            if coord < edge_array[mid].coord:
+                right = mid
+            elif coord > edge_array[mid].coord:
+                left = mid + 1
+            else:
+                return mid
+
+        if edge_array[mid].coord > coord:
+            if mid > 0 and (
+                coord - edge_array[mid-1].coord <
+                    edge_array[mid].coord - coord):
+                return mid-1
+        elif edge_array[mid].coord < coord:
+            if mid < n - 1 and (
+                edge_array[mid+1].coord - coord <
+                    coord - edge_array[mid].coord):
+                return mid+1
+        return mid
+
+    # def insert(self, index, textline, align):
+    #     """Adds a new text edge to the current dict.
+    #     """
+    #     x = self.get_x_coord(textline, align)
+    #     y0 = textline.y0
+    #     y1 = textline.y1
+    #     te = TextEdge(x, y0, y1, align=align)
+    #     self._textedges[align].insert(index, te)
+
+    def _register_textline(self, textline):
+        """Updates an existing text edge in the current dict.
+        """
+        coords = TextEdges2.get_textline_coords(textline)
+        for alignment in self._textedges:
+            edge_array = self._textedges[alignment]
+            coord = coords[alignment]
+
+            # Find the index of the closest existing element (or 0 if none)
+            idx_closest = self._get_index_closest_point(coord, edge_array)
+
+            # Check if the edges before/after are close enough
+            # that it can be considered aligned
+            idx_insert = None
+            if idx_closest is None:
+                idx_insert = 0
+            elif np.isclose(edge_array[idx_closest].coord, coord, atol=0.5):
+                closest_edge = edge_array[idx_closest]
+                closest_edge.register_aligned_textline(textline, coord)
+            elif edge_array[idx_closest].coord < coord:
+                idx_insert = idx_closest + 1
+            else:
+                idx_insert = idx_closest
+            if idx_insert is not None:
+                new_edge = TextEdge2(coord, textline)
+                edge_array.insert(idx_insert, new_edge)
+
+    def _register_all_text_lines(self, textlines):
+        """Add all textlines to our edge repository to
+        identify alignments.
+        """
+        # Identify all the edge alignments
+        for tl in textlines:
+            if len(tl.get_text().strip()) > 0:
+                self._register_textline(tl)
+
+    def _compute_alignment_counts(self):
+        """Build a dictionary textline -> alignment object.
+        """
+        #
+        for edge_name, textedges in self._textedges.items():
+            for textedge in textedges:
+                for textline in textedge.textlines:
+                    textline_alignments = self._textlines_alignments.get(
+                        textline, None)
+                    if textline_alignments is None:
+                        alignments = Alignments()
+                        alignments[edge_name] = len(textedge.textlines)
+                        self._textlines_alignments[textline] = alignments
+                    else:
+                        textline_alignments[edge_name] = len(
+                            textedge.textlines)
+
+        # Finally calculate the overall maximum number of rows/cols
+        self.max_rows = max(
+            map(
+                lambda alignments: alignments.max_h(),
+                self._textlines_alignments.values()
+            ),
+            default=0
+        )
+        self.max_cols = max(
+            map(
+                lambda alignments: alignments.max_v(),
+                self._textlines_alignments.values()
+            ),
+            default=0
+        )
+
+    def _calculate_gaps_thresholds(self, percentile=75):
+        """Identify reasonable gaps between lines and columns based
+        on gaps observed across alignments.
+        This can be used to reject cells as too far away from
+        the core table.
+        """
+        h_gaps, v_gaps = [], []
+        for edge_name in self._textedges:
+            edge_array = self._textedges[edge_name]
+            gaps = []
+            vertical = edge_name in ["left", "right", "middle"]
+            sort_function = (lambda tl: tl.y0) \
+                if vertical \
+                else (lambda tl: tl.x0)
+            for alignments in edge_array:
+                tls = sorted(
+                    alignments.textlines,
+                    key=sort_function,
+                    reverse=True
+                )
+                for i in range(1, len(tls)):
+                    # If the lines are vertically aligned (stacked up), we
+                    # record the vertical gap between them
+                    if vertical:
+                        gap = tls[i-1].y1 - tls[i].y0
+                    else:
+                        gap = tls[i-1].x1 - tls[i].x0
+                    gaps.append(gap)
+            if gaps:
+                if vertical:
+                    v_gaps.append(np.percentile(gaps, percentile))
+                else:
+                    h_gaps.append(np.percentile(gaps, percentile))
+                direction_str = 'vertical' if vertical else 'horizontal'
+                rounded_gaps = list(map(lambda x: round(x, 2), gaps))
+                print(
+                    f"{direction_str} gaps found "
+                    f"for {edge_name}: "
+                    f"{rounded_gaps} "
+                    f"with {percentile}th percentile "
+                    f"{np.percentile(gaps, percentile)}"
+                )
+        return max(h_gaps, default=None), max(v_gaps, default=None)
+
+    def _remove_unconnected_edges(self):
+        """Weed out elements which are only connected to others vertically
+        or horizontally. There needs to be connections across both
+        dimensions.
+        """
+        singleton_textlines = []
+        removed_singletons = True
+        while removed_singletons:
+            removed_singletons = False
+            for edge_type in self._textedges:
+                # For each alignment edge, remove items if they are singletons
+                # either horizontally or vertically
+                for te in self._textedges[edge_type]:
+                    for i in range(len(te.textlines) - 1, -1, -1):
+                        tl = te.textlines[i]
+                        alignments = self._textlines_alignments[tl]
+                        if alignments.max_h() <= 1 or alignments.max_v() <= 1:
+                            singleton_textlines.append(tl)
+                            del te.textlines[i]
+                            removed_singletons = True
+            self._textlines_alignments = {}
+            self._compute_alignment_counts()
+
+    def _build_bbox_candidate(self, debug_info=None):
+        """ Seed the process with the textline with the highest alignment
+        score, then expand the bbox with textlines within threshold.
+
+        Parameters
+        ----------
+        debug_info : array
+            Optional parameter array, in which to store extra information
+            to help later visualization of the table creation.
+        """
+        if self.max_rows <= 1 or self.max_cols <= 1:
+            return None
+        tls_search_space = list(self._textlines_alignments.keys())
+
+        def get_best_textline(textlines):
+            # Find the textline with the highest alignment score
+            return max(
+                textlines,
+                key=lambda textline:
+                    self._textlines_alignments[textline].alignment_score(),
+                default=None
+            )
+
+        # Calculate the 75th percentile of the horizontal/vertical
+        # gaps between textlines.  Use this as a reference for a threshold
+        # to not exceed while looking for table boundaries.
+        gaps_hv = self._calculate_gaps_thresholds(75)
+        if (gaps_hv[0] is None or gaps_hv[1] is None):
+            return None
+        max_h_gap, max_v_gap = gaps_hv[0] * 3, gaps_hv[1] * 3
+
+        if debug_info is not None:
+            # Store debug info
+            debug_info_search = {
+                "max_h_gap": max_h_gap,
+                "max_v_gap": max_v_gap,
+                "iterations": []
+            }
+            debug_info.append(debug_info_search)
+        else:
+            debug_info_search = None
+
+        MINIMUM_TEXTLINES_IN_TABLE = 6
+        tl_most_aligned = get_best_textline(tls_search_space)
+        bbox = (tl_most_aligned.x0, tl_most_aligned.y0,
+                tl_most_aligned.x1, tl_most_aligned.y1)
+        tls_search_space.remove(tl_most_aligned)
+        tls_in_bbox = [tl_most_aligned]
+        last_bbox = None
+        while last_bbox != bbox:
+            if debug_info_search is not None:
+                # Store debug info
+                debug_info_search["iterations"].append(bbox)
+
+            last_bbox = bbox
+            # Go through all remaining textlines, expand our bbox
+            # if a textline is within our proximity tolerance
+            for i in range(len(tls_search_space) - 1, -1, -1):
+                tl = tls_search_space[i]
+                h_distance, v_distance = distance_tl_to_bbox(tl, bbox)
+
+                # Move textline to our bbox and expand the bbox accordingly
+                # if the textline is close.
+                if h_distance < max_h_gap and v_distance < max_v_gap:
+                    tls_in_bbox.append(tl)
+                    bbox = (
+                        min(bbox[0], tl.x0),
+                        min(bbox[1], tl.y0),
+                        max(bbox[2], tl.x1),
+                        max(bbox[3], tl.y1)
+                    )
+                    del tls_search_space[i]
+        if len(tls_in_bbox) > MINIMUM_TEXTLINES_IN_TABLE:
+            return bbox
+        else:
+            print(f"Only {len(tls_in_bbox)}, that's not enough.")
+            return None
+
+    def generate(self, textlines):
+        """Generate the text edge dictionaries based on the
+        input textlines.
+        """
+        self._register_all_text_lines(textlines)
+        self._compute_alignment_counts()
+
+    def plotFRHAlignments(self, table, plt):
+        """Displays a visualization of the alignments as currently computed.
+        """
+        fig = plt.figure()
+        ax = fig.add_subplot(111, aspect="equal")
+        img = table.get_pdf_image()
+        ax.imshow(img, extent=(0, table.pdf_size[0], 0, table.pdf_size[1]))
+
+        tls_by_alignment_score = sorted(
+            self._textlines_alignments.keys(),
+            key=lambda textline:
+            self._textlines_alignments[textline].alignment_score(),
+            reverse=True
+        )
+
+        for tl, alignments in self._textlines_alignments.items():
+            color = "red"
+            if tl == tls_by_alignment_score[0]:
+                color = "blue"
+            ax.add_patch(
+                patches.Rectangle(
+                    (tl.x0, tl.y0),
+                    tl.x1 - tl.x0, tl.y1 - tl.y0,
+                    color=color,
+                    alpha=0.5
+                )
+            )
+            ax.text(
+                tl.x0 - 5,
+                tl.y0 - 5,
+                f"{alignments.max_h()}x{alignments.max_v()}",
+                fontsize=5,
+                color="black"
+            )
+
+    def plotFRHTableSearch(self, plot, debug_info):
+        if debug_info is None:
+            return
+        for box_id, bbox_search in enumerate(debug_info["bboxes_searches"]):
+            max_h_gap = bbox_search["max_h_gap"]
+            max_v_gap = bbox_search["max_v_gap"]
+            iterations = bbox_search["iterations"]
+            for iteration, bbox in enumerate(iterations):
+                final = iteration == len(iterations) - 1
+                plot.add_patch(
+                    patches.Rectangle(
+                        (bbox[0], bbox[1]),
+                        bbox[2] - bbox[0], bbox[3] - bbox[1],
+                        color="red",
+                        linewidth=5 if final else 2,
+                        fill=False
+                    )
+                )
+                plot.text(
+                    bbox[0],
+                    bbox[1],
+                    f"box #{box_id+1} / iter #{iteration}",
+                    fontsize=12,
+                    color="black",
+                    verticalalignment="top",
+                    bbox=dict(facecolor="orange", alpha=0.5)
+                )
+
+                plot.add_patch(
+                    patches.Rectangle(
+                        (bbox[0]-max_h_gap, bbox[1]-max_v_gap),
+                        bbox[2] - bbox[0] + 2 * max_h_gap,
+                        bbox[3] - bbox[1] + 2 * max_v_gap,
+                        color="orange",
+                        fill=False
+                    )
+                )


 class Hybrid(BaseParser):
@ -78,6 +683,7 @@ class Hybrid(BaseParser):
        self.row_tol = row_tol
        self.column_tol = column_tol

+    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _text_bbox(t_bbox):
        """Returns bounding box for the text present on a page.
@ -101,6 +707,7 @@ class Hybrid(BaseParser):
        text_bbox = (xmin, ymin, xmax, ymax)
        return text_bbox

+    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _group_rows(text, row_tol=2):
        """Groups PDFMiner text objects into rows vertically
@ -139,6 +746,7 @@ class Hybrid(BaseParser):
        rows.append(sorted(temp, key=lambda t: t.x0))
        return rows

+    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _merge_columns(l, column_tol=0):
        """Merges column boundaries horizontally if they overlap
@ -184,6 +792,7 @@ class Hybrid(BaseParser):
                        merged.append(higher)
        return merged

+    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _join_rows(rows_grouped, text_y_max, text_y_min):
        """Makes row coordinates continuous. For the row to "touch"
@ -217,6 +826,7 @@ class Hybrid(BaseParser):
        row_boundaries[-1][1] = text_y_min
        return row_boundaries

+    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _add_columns(cols, text, row_tol):
        """Adds columns to existing list by taking into account
@ -247,6 +857,7 @@ class Hybrid(BaseParser):
            cols.extend(Hybrid._merge_columns(sorted(new_cols)))
        return cols

+    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
    def _join_columns(cols, text_x_min, text_x_max):
        """Makes column coordinates continuous.
@ -271,66 +882,81 @@ class Hybrid(BaseParser):
        cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)]
        return cols

+    # FRHTODO: Check is needed, refactor with Stream
    def _validate_columns(self):
        if self.table_areas is not None and self.columns is not None:
            if len(self.table_areas) != len(self.columns):
                raise ValueError("Length of table_areas and columns"
                                 " should be equal")

-    def _nurminen_table_detection(self, textlines):
-        """A general implementation of the table detection algorithm
-        described by Anssi Nurminen's master's thesis.
-        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 # noqa
-
-        Assumes that tables are situated relatively far apart
-        vertically.
-        """
-        # TODO: add support for arabic text #141
-        # sort textlines in reading order
-        textlines.sort(key=lambda x: (-x.y0, x.x0))
-        textedges = TextEdges(edge_tol=self.edge_tol)
-        # generate left, middle and right textedges
-        textedges.generate(textlines)
-        # select relevant edges
-        relevant_textedges = textedges.get_relevant()
-        self.textedges.extend(relevant_textedges)
-        # guess table areas using textlines and relevant edges
-        table_bbox = textedges.get_table_areas(textlines, relevant_textedges)
-        # treat whole page as table area if no table areas found
-        if not table_bbox:
-            table_bbox = {(0, 0, self.pdf_width, self.pdf_height): None}
-
-        return table_bbox
-
-    def _generate_table_bbox(self):
-        self.textedges = []
-        if self.table_areas is None:
-            hor_text = self.horizontal_text
-            if self.table_regions is not None:
-                # filter horizontal text
-                hor_text = []
-                for region in self.table_regions:
-                    x1, y1, x2, y2 = region.split(",")
-                    x1 = float(x1)
-                    y1 = float(y1)
-                    x2 = float(x2)
-                    y2 = float(y2)
-                    region_text = text_in_bbox(
-                        (x1, y2, x2, y1), self.horizontal_text)
-                    hor_text.extend(region_text)
-            # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(hor_text)
+    # FRHTODO: get debug_info to work again
+    def _generate_table_bbox(self, debug_info=None):
+        textlines = self.horizontal_text + self.vertical_text
+        textlines_processed = {}
+        self.table_bbox = {}
+        if debug_info is not None:
+            debug_info_bbox_searches = []
+            debug_info["bboxes_searches"] = debug_info_bbox_searches
        else:
-            table_bbox = {}
-            for area in self.table_areas:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = float(x1)
-                y1 = float(y1)
-                x2 = float(x2)
-                y2 = float(y2)
-                table_bbox[(x1, y2, x2, y1)] = None
-        self.table_bbox = table_bbox
+            debug_info_bbox_searches = None

+        while True:
+            self.textedges = TextEdges2()
+            self.textedges.generate(textlines)
+            self.textedges._remove_unconnected_edges()
+            bbox = self.textedges._build_bbox_candidate(
+                debug_info_bbox_searches
+            )
+            if bbox is None:
+                break
+
+            # Get all the textlines that are at least 50% in the box
+            tls_in_bbox = text_in_bbox(bbox, textlines)
+
+            # and expand the text box to fully contain them
+            bbox = bbox_from_text(tls_in_bbox)
+
+            # FRH: do we need to repeat this?
+            # tls_in_bbox = text_in_bbox(bbox, textlines)
+            cols_anchors = find_columns_coordinates(tls_in_bbox)
+
+            # Apply a heuristic to salvage headers which formatting might be
+            # off compared to the rest of the table.
+
+            # Calculate the average height of each textline
+            # FRHTODO: reuse the gap threshold from earlier?
+            alignments = self.textedges._textlines_alignments.keys()
+            average_tl_height = sum(
+                map(
+                    lambda tl: tl.y1 - tl.y0,
+                    alignments
+                )) / len(alignments)
+
+            expanded_bbox = todo_move_me_expand_area_for_header(
+                bbox,
+                textlines,
+                cols_anchors,
+                average_tl_height
+            )
+
+            if debug_info is not None:
+                debug_info["col_searches"].append({
+                    "core_bbox": bbox,
+                    "cols_anchors": cols_anchors,
+                    "expanded_bbox": expanded_bbox
+                })
+
+            self.table_bbox[expanded_bbox] = None
+
+            # Remember what textlines we processed, and repeat
+            for tl in tls_in_bbox:
+                textlines_processed[tl] = None
+            textlines = list(filter(
+                lambda tl: tl not in textlines_processed,
+                textlines
+            ))
+
+    # FRHTODO: Check is needed, refactor with Stream
    def _generate_columns_and_rows(self, table_idx, tk):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
@ -408,6 +1034,7 @@ class Hybrid(BaseParser):

        return cols, rows

+    # FRHTODO: Check is needed, refactor with Stream
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
@ -420,7 +1047,7 @@ class Hybrid(BaseParser):

        return table

-    def extract_tables(self):
+    def extract_tables(self, debug_info=None):
        if self._document_has_no_text():
            return []

--- a/tests/data.py
+++ b/tests/data.py
@ -823,6 +823,10 @@ data_stream_table_rotated = [
    ],
 ]

+# The streaming algorithm incorrectly includes a header in the result.
+# Trimming the table for the test of hybrid, which doesn't include it.
+data_hybrid_table_rotated = data_stream_table_rotated[1:]
+
 data_stream_two_tables_1 = [
    [
        "Program. Represents arrests reported (not charged) by 12,910 "
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -163,7 +163,7 @@ def test_hybrid():


 def test_hybrid_table_rotated():
-    df = pd.DataFrame(data_stream_table_rotated)
+    df = pd.DataFrame(data_hybrid_table_rotated)

    filename = os.path.join(testdir, "clockwise_table_2.pdf")
    tables = camelot.read_pdf(filename, flavor="hybrid")