From cff7a9698b6462d140979d722fe738a0f16dbe3a Mon Sep 17 00:00:00 2001
From: Frh <francois.huet+github@gmail.com>
Date: Sun, 19 Apr 2020 13:28:17 -0700
Subject: [PATCH] Further refactor

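Move the common parse-error statistics computation to the base parser
(BaseParser._compute_parse_errors becomes the public compute_parse_errors).
Move the copy_spanning_text logic from the Lattice parser to Table.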
---
 camelot/core.py            | 37 +++++++++++++++++++++
 camelot/parsers/base.py    |  4 ++-
 camelot/parsers/lattice.py | 66 +-------------------------------------
 camelot/parsers/stream.py  |  6 ----
 4 files changed, 41 insertions(+), 72 deletions(-)

diff --git a/camelot/core.py b/camelot/core.py
index 8f7cbaf..94d49e9 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -419,6 +419,12 @@ class Table(object):
         self.flavor = parser.id
         self.filename = parser.filename
         self.debug_info = parser.debug_info
+        pos_errors = parser.compute_parse_errors(self)
+        self.accuracy = compute_accuracy([[100, pos_errors]])
+
+        if parser.copy_text is not None:
+            self.copy_spanning_text(parser.copy_text)
+
         data = self.data
         self.df = pd.DataFrame(data)
         self.shape = self.df.shape
@@ -712,6 +718,37 @@ class Table(object):
         conn.commit()
         conn.close()
 
+    def copy_spanning_text(self, copy_text=None):
+        """Copies over text in empty spanning cells.
+
+        Parameters
+        ----------
+        copy_text : list, optional (default: None)
+            {'h', 'v'}
+            Select one or more strings from above and pass them as a list
+            to specify the direction in which text should be copied over
+            when a cell spans multiple rows or columns.
+            None (the default) copies nothing.
+
+        Returns
+        -------
+        self : camelot.core.Table
+
+        """
+        # "or ()" keeps the documented None default from raising TypeError
+        # when the method is called without arguments.
+        for f in copy_text or ():
+            for i, row in enumerate(self.cells):
+                for j, cell in enumerate(row):
+                    if cell.text.strip() != "":
+                        continue
+                    # Empty spanning cell with no left/top edge takes its neighbour's text.
+                    if f == "h" and cell.hspan and not cell.left:
+                        cell.text = row[j - 1].text
+                    elif f == "v" and cell.vspan and not cell.top:
+                        cell.text = self.cells[i - 1][j].text
+        return self
+
 
 class TableList(object):
     """Defines a list of camelot.core.Table objects. Each table can
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index c50a164..19deceb 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -16,6 +16,7 @@ class BaseParser(object):
         parser_id,
         table_regions=None,
         table_areas=None,
+        copy_text=None,
         split_text=False,
         strip_text="",
         shift_text=None,
@@ -25,6 +26,7 @@ class BaseParser(object):
         self.table_regions = table_regions
         self.table_areas = table_areas
 
+        self.copy_text = copy_text
         self.split_text = split_text
         self.strip_text = strip_text
         self.shift_text = shift_text
@@ -86,7 +88,7 @@ class BaseParser(object):
         """
         return idx
 
-    def _compute_parse_errors(self, table):
+    def compute_parse_errors(self, table):
         pos_errors = []
         # TODO: have a single list in place of two directional ones?
         # sorted on x-coordinate based on reading order i.e. LTR or RTL
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index c294b55..c0f3e9b 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -120,12 +120,12 @@ class Lattice(BaseParser):
             table_areas=table_areas,
             split_text=split_text,
             strip_text=strip_text,
+            copy_text=copy_text,
             shift_text=shift_text or ["l", "t"],
             flag_size=flag_size,
         )
         self.process_background = process_background
         self.line_scale = line_scale
-        self.copy_text = copy_text
         self.line_tol = line_tol
         self.joint_tol = joint_tol
         self.threshold_blocksize = threshold_blocksize
@@ -180,40 +180,6 @@ class Lattice(BaseParser):
             indices.append((r_idx, c_idx, text))
         return indices
 
-
-    @staticmethod
-    def _copy_spanning_text(t, copy_text=None):
-        """Copies over text in empty spanning cells.
-
-        Parameters
-        ----------
-        t : camelot.core.Table
-        copy_text : list, optional (default: None)
-            {'h', 'v'}
-            Select one or more strings from above and pass them as a list
-            to specify the direction in which text should be copied over
-            when a cell spans multiple rows or columns.
-
-        Returns
-        -------
-        t : camelot.core.Table
-
-        """
-        for f in copy_text:
-            if f == "h":
-                for i in range(len(t.cells)):
-                    for j in range(len(t.cells[i])):
-                        if t.cells[i][j].text.strip() == "":
-                            if t.cells[i][j].hspan and not t.cells[i][j].left:
-                                t.cells[i][j].text = t.cells[i][j - 1].text
-            elif f == "v":
-                for i in range(len(t.cells)):
-                    for j in range(len(t.cells[i])):
-                        if t.cells[i][j].text.strip() == "":
-                            if t.cells[i][j].vspan and not t.cells[i][j].top:
-                                t.cells[i][j].text = t.cells[i - 1][j].text
-        return t
-
     def _generate_table_bbox(self):
         def scale_areas(areas):
             scaled_areas = []
@@ -342,37 +308,7 @@ class Lattice(BaseParser):
         # set spanning cells to True
         table = table.set_span()
 
-        pos_errors = []
-        # TODO: have a single list in place of two directional ones?
-        # sorted on x-coordinate based on reading order i.e. LTR or RTL
-        for direction in ["vertical", "horizontal"]:
-            for t in self.t_bbox[direction]:
-                indices, error = get_table_index(
-                    table,
-                    t,
-                    direction,
-                    split_text=self.split_text,
-                    flag_size=self.flag_size,
-                    strip_text=self.strip_text,
-                )
-                if indices[:2] != (-1, -1):
-                    pos_errors.append(error)
-                    indices = Lattice._reduce_index(
-                        table, indices, shift_text=self.shift_text
-                    )
-                    for r_idx, c_idx, text in indices:
-                        table.cells[r_idx][c_idx].text = text
-        # FRHTODO
-        accuracy = compute_accuracy([[100, pos_errors]])
-
-        if self.copy_text is not None:
-            table = Lattice._copy_spanning_text(
-                table,
-                copy_text=self.copy_text
-            )
-
         table.record_parse_metadata(self)
-        table.accuracy = accuracy
 
         # for plotting
         _text = []
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 0d507c5..2df3093 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -419,14 +419,8 @@ class Stream(BaseParser):
     def _generate_table(self, table_idx, cols, rows, **kwargs):
         table = self._initialize_new_table(table_idx, cols, rows)
         table = table.set_all_edges()
-
-        pos_errors = self._compute_parse_errors(table)
-        accuracy = compute_accuracy([[100, pos_errors]])
-
         table.record_parse_metadata(self)
 
-        table.accuracy = accuracy
-
         # for plotting
         _text = []
         _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])