From 18581640be206161bac893c25d64e2069a81924c Mon Sep 17 00:00:00 2001
From: Frh <francois.huet+github@gmail.com>
Date: Fri, 24 Apr 2020 15:54:58 -0700
Subject: [PATCH] Common parent TextBaseParser for Stream and Hybrid

---
 camelot/core.py            |  2 +-
 camelot/parsers/base.py    | 59 ++++++++++++++++++++++++++++++++++++++
 camelot/parsers/hybrid.py  | 46 +++++++----------------------
 camelot/parsers/lattice.py | 31 ++++----------------
 camelot/parsers/stream.py  | 41 +++++++-------------------
 5 files changed, 87 insertions(+), 92 deletions(-)

diff --git a/camelot/core.py b/camelot/core.py
index 440b2c9..9921b95 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -136,7 +136,7 @@ class TextEdge(TextAlignment):
 
 
 class TextAlignments(object):
-    """Defines a dict of text edges accross alignment references.
+    """Defines a dict of text edges across reference alignments.
     """
 
     def __init__(self, alignment_names):
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index 9e76c7b..e4b5071 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -169,3 +169,62 @@ class BaseParser(object):
                     for r_idx, c_idx, text in indices:
                         table.cells[r_idx][c_idx].text = text
         return pos_errors
+
+    def extract_tables(self):  # returns one table per table_bbox key
+        if self._document_has_no_text():
+            return []
+
+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
+        self._generate_table_bbox()
+
+        _tables = []
+        # Visit bboxes by descending x[1] (y-coord); table_idx follows it.
+        for table_idx, bbox in enumerate(
+            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
+        ):
+            cols, rows, v_s, h_s = self._generate_columns_and_rows(
+                bbox,
+                table_idx
+            )  # Stream and Hybrid return None for both v_s and h_s
+            table = self._generate_table(
+                table_idx, cols, rows, v_s=v_s, h_s=h_s)
+            table._bbox = bbox  # keep the bbox key this table was built from
+            _tables.append(table)
+
+        return _tables
+
+
+class TextBaseParser(BaseParser):
+    """Common base class for the text-based parsers (Stream and Hybrid).
+    """
+
+    def __init__(
+        self,
+        parser_id,
+        table_regions=None,
+        table_areas=None,
+        columns=None,
+        flag_size=False,
+        split_text=False,
+        strip_text="",
+        edge_tol=50,
+        row_tol=2,
+        column_tol=0,
+        **kwargs  # NOTE(review): accepted but not forwarded to BaseParser
+    ):
+        super().__init__(
+            parser_id,  # forward the subclass id, not a fixed "stream"
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            flag_size=flag_size,
+        )
+        self.columns = columns
+        self._validate_columns()
+        self.edge_tol = edge_tol
+        self.row_tol = row_tol
+        self.column_tol = column_tol
+
+        self.textedges = None  # starts unset; nothing here populates it
diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py
index 869e2d2..898cfc0 100644
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@@ -7,7 +7,7 @@ import numpy as np
 import copy
 import warnings
 
-from .base import BaseParser
+from .base import TextBaseParser
 from ..core import (
     TextAlignments,
     ALL_ALIGNMENTS,
@@ -257,7 +257,7 @@ class TextNetworks(TextAlignments):
         for align_id in self._textedges:
             edge_array = self._textedges[align_id]
             gaps = []
-            vertical = align_id in ["left", "right", "middle"]
+            vertical = align_id in HORIZONTAL_ALIGNMENTS
             sort_function = (lambda tl: tl.y0) \
                 if vertical \
                 else (lambda tl: tl.x0)
@@ -491,7 +491,7 @@ class TextNetworks(TextAlignments):
             )
 
 
-class Hybrid(BaseParser):
+class Hybrid(TextBaseParser):
     """Hybrid method of parsing looks for spaces between text
     to parse the table.
 
@@ -548,18 +548,14 @@ class Hybrid(BaseParser):
             "hybrid",
             table_regions=table_regions,
             table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
             split_text=split_text,
             strip_text=strip_text,
-            flag_size=flag_size,
-            debug=debug
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
         )
-        self.columns = columns
-        self.textedges = None
-
-        self._validate_columns()
-        self.edge_tol = edge_tol
-        self.row_tol = row_tol
-        self.column_tol = column_tol
 
     # FRHTODO: Check if needed, refactor with Stream
     @staticmethod
@@ -832,10 +828,10 @@ class Hybrid(BaseParser):
             ))
 
     # FRHTODO: Check is needed, refactor with Stream
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
         # select elements which lie within table_bbox
         self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
             self.horizontal_text,
             self.vertical_text
         )
@@ -908,7 +904,7 @@ class Hybrid(BaseParser):
             cols = self._add_columns(cols, inner_text, self.row_tol)
             cols = self._join_columns(cols, text_x_min, text_x_max)
 
-        return cols, rows
+        return cols, rows, None, None
 
     # FRHTODO: Check is needed, refactor with Stream
     def _generate_table(self, table_idx, cols, rows, **kwargs):
@@ -922,23 +918,3 @@ class Hybrid(BaseParser):
         table._textedges = self.textedges
 
         return table
-
-    def extract_tables(self):
-        if self._document_has_no_text():
-            return []
-
-        # Identify plausible areas within the doc where tables lie,
-        # populate table_bbox keys with these areas.
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, bbox in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
-            table = self._generate_table(table_idx, cols, rows)
-            table._bbox = bbox
-            _tables.append(table)
-
-        return _tables
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index 2e62846..d6ba65d 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -252,21 +252,21 @@ class Lattice(BaseParser):
             table_bbox, vertical_segments, horizontal_segments, pdf_scalers
         )
 
-    def _generate_columns_and_rows(self, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
         # select elements which lie within table_bbox
         v_s, h_s = segments_in_bbox(
-            tk, self.vertical_segments, self.horizontal_segments
+            bbox, self.vertical_segments, self.horizontal_segments
         )
         self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
             self.horizontal_text,
             self.vertical_text
             )
 
-        cols, rows = zip(*self.table_bbox[tk])
+        cols, rows = zip(*self.table_bbox[bbox])
         cols, rows = list(cols), list(rows)
-        cols.extend([tk[0], tk[2]])
-        rows.extend([tk[1], tk[3]])
+        cols.extend([bbox[0], bbox[2]])
+        rows.extend([bbox[1], bbox[3]])
         # sort horizontal and vertical segments
         cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
         rows = merge_close_lines(
@@ -302,22 +302,3 @@ class Lattice(BaseParser):
         table._textedges = None
 
         return table
-
-    def extract_tables(self):
-        if self._document_has_no_text():
-            return []
-
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, tk in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows, v_s, h_s = self._generate_columns_and_rows(tk)
-            table = self._generate_table(
-                table_idx, cols, rows, v_s=v_s, h_s=h_s)
-            table._bbox = tk
-            _tables.append(table)
-
-        return _tables
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 08aaa41..91e2fde 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -5,7 +5,7 @@ import warnings
 
 import numpy as np
 
-from .base import BaseParser
+from .base import TextBaseParser
 from ..core import TextEdges
 from ..utils import (
     bbox_from_str,
@@ -15,7 +15,7 @@ from ..utils import (
 )
 
 
-class Stream(BaseParser):
+class Stream(TextBaseParser):
     """Stream method of parsing looks for spaces between text
     to parse the table.
 
@@ -71,15 +71,14 @@ class Stream(BaseParser):
             "stream",
             table_regions=table_regions,
             table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
             split_text=split_text,
             strip_text=strip_text,
-            flag_size=flag_size,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
         )
-        self.columns = columns
-        self._validate_columns()
-        self.edge_tol = edge_tol
-        self.row_tol = row_tol
-        self.column_tol = column_tol
 
     @staticmethod
     def _group_rows(text, row_tol=2):
@@ -302,10 +301,10 @@ class Stream(BaseParser):
                 table_bbox[bbox_from_str(area_str)] = None
         self.table_bbox = table_bbox
 
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
         # select elements which lie within table_bbox
         self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
             self.horizontal_text,
             self.vertical_text
         )
@@ -378,7 +377,7 @@ class Stream(BaseParser):
             cols = self._add_columns(cols, inner_text, self.row_tol)
             cols = self._join_columns(cols, text_x_min, text_x_max)
 
-        return cols, rows
+        return cols, rows, None, None
 
     def _generate_table(self, table_idx, cols, rows, **kwargs):
         table = self._initialize_new_table(table_idx, cols, rows)
@@ -391,23 +390,3 @@ class Stream(BaseParser):
         table._textedges = self.textedges
 
         return table
-
-    def extract_tables(self):
-        if self._document_has_no_text():
-            return []
-
-        # Identify plausible areas within the doc where tables lie,
-        # populate table_bbox keys with these areas.
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, bbox in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
-            table = self._generate_table(table_idx, cols, rows)
-            table._bbox = bbox
-            _tables.append(table)
-
-        return _tables