Prep work for new hybrid parser introduction

Refactor parsers by moving common code to the base class. Maintain Python 3.5 compatibility by removing f-strings (f"{}").
2020-04-19 11:32:22 -07:00 · 2020-04-19 11:32:22 -07:00 · 583868756a
parent 697289e409
commit 583868756a
5 changed files with 106 additions and 47 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -413,7 +413,7 @@ class Table(object):
        }
        return report

-    def record_metadata(self, parser):
+    def record_parse_metadata(self, parser):
        """Record data about the origin of the table
        """
        self.flavor = parser.id
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -3,7 +3,8 @@
 import os

 from ..utils import (
-    get_text_objects
+    get_text_objects,
+    get_table_index
 )
 from ..core import Table

@ -11,8 +12,26 @@ from ..core import Table
 class BaseParser(object):
    """Defines a base parser.
    """
-    def __init__(self, parser_id):
+    def __init__(self,
+        parser_id,
+        table_regions=None,
+        table_areas=None,
+        split_text=False,
+        strip_text="",
+        shift_text=None,
+        flag_size=False,
+    ):
        self.id = parser_id
+        self.table_regions = table_regions
+        self.table_areas = table_areas
+
+        self.split_text = split_text
+        self.strip_text = strip_text
+        self.shift_text = shift_text
+
+        self.flag_size = flag_size
+
+        self.t_bbox = None

        # For plotting details of parsing algorithms
        self.debug_info = {}
@ -57,3 +76,38 @@ class BaseParser(object):
        table.page = self.page
        table.order = table_idx + 1
        return table
+
+
+    @staticmethod
+    def _reduce_index(t, idx, shift_text):
+        """Reduces index of a text object if it lies within a spanning
+        cell.  Only useful for some parsers (e.g. Lattice), base method is a
+        noop.
+        """
+        return idx
+
+    def _compute_parse_errors(self, table):
+        pos_errors = []
+        # TODO: have a single list in place of two directional ones?
+        # sorted on x-coordinate based on reading order i.e. LTR or RTL
+        for direction in ["vertical", "horizontal"]:
+            for t in self.t_bbox[direction]:
+                indices, error = get_table_index(
+                    table,
+                    t,
+                    direction,
+                    split_text=self.split_text,
+                    flag_size=self.flag_size,
+                    strip_text=self.strip_text,
+                )
+                if indices[:2] != (-1, -1):
+                    pos_errors.append(error)
+                    indices = type(self)._reduce_index(
+                        table,
+                        indices,
+                        shift_text=self.shift_text
+                    )
+                    for r_idx, c_idx, text in indices:
+                        table.cells[r_idx][c_idx].text = text
+        return pos_errors
+
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -114,16 +114,18 @@ class Lattice(BaseParser):
        resolution=300,
        **kwargs
    ):
-        super().__init__("lattice")
-        self.table_regions = table_regions
-        self.table_areas = table_areas
+        super().__init__(
+            "lattice",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            shift_text=shift_text or ["l", "t"],
+            flag_size=flag_size,
+        )
        self.process_background = process_background
        self.line_scale = line_scale
        self.copy_text = copy_text
-        self.shift_text = shift_text or ["l", "t"]
-        self.split_text = split_text
-        self.flag_size = flag_size
-        self.strip_text = strip_text
        self.line_tol = line_tol
        self.joint_tol = joint_tol
        self.threshold_blocksize = threshold_blocksize
@ -178,6 +180,7 @@ class Lattice(BaseParser):
            indices.append((r_idx, c_idx, text))
        return indices

+
    @staticmethod
    def _copy_spanning_text(t, copy_text=None):
        """Copies over text in empty spanning cells.
@ -368,7 +371,7 @@ class Lattice(BaseParser):
                copy_text=self.copy_text
            )

-        table.record_metadata(self)
+        table.record_parse_metadata(self)
        table.accuracy = accuracy

        # for plotting
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -10,7 +10,7 @@ import pandas as pd

 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
+from ..utils import (text_in_bbox, compute_accuracy,
                     compute_whitespace)


@ -61,22 +61,24 @@ class Stream(BaseParser):
        table_regions=None,
        table_areas=None,
        columns=None,
-        split_text=False,
        flag_size=False,
+        split_text=False,
        strip_text="",
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        **kwargs
    ):
-        super().__init__("stream")
-        self.table_regions = table_regions
-        self.table_areas = table_areas
+        super().__init__(
+            "stream",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            flag_size=flag_size,
+        )
        self.columns = columns
        self._validate_columns()
-        self.split_text = split_text
-        self.flag_size = flag_size
-        self.strip_text = strip_text
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol
@ -418,26 +420,10 @@ class Stream(BaseParser):
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()

-        pos_errors = []
-        # TODO: have a single list in place of two directional ones?
-        # sorted on x-coordinate based on reading order i.e. LTR or RTL
-        for direction in ["vertical", "horizontal"]:
-            for t in self.t_bbox[direction]:
-                indices, error = get_table_index(
-                    table,
-                    t,
-                    direction,
-                    split_text=self.split_text,
-                    flag_size=self.flag_size,
-                    strip_text=self.strip_text,
-                )
-                if indices[:2] != (-1, -1):
-                    pos_errors.append(error)
-                    for r_idx, c_idx, text in indices:
-                        table.cells[r_idx][c_idx].text = text
+        pos_errors = self._compute_parse_errors(table)
        accuracy = compute_accuracy([[100, pos_errors]])

-        table.record_metadata(self)
+        table.record_parse_metadata(self)

        table.accuracy = accuracy

--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1018,7 +1018,11 @@ def export_pdf_as_png(pdf_path, destination_path):
    pdf_path : str
    destination_path : str
    """
-    gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"
+    gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
+        .format(
+            destination_path=destination_path,
+            pdf_path=pdf_path
+        )
    gs_call = gs_call.encode().split()
    null = open(os.devnull, "wb")
    Ghostscript(*gs_call, stdout=null)
@ -1038,19 +1042,28 @@ def compare_tables(left, right):
    differences = []
    if (diff_rows):
        differences.append(
-            f"{abs(diff_rows)} "
-            f"{'more' if diff_rows>0 else 'fewer'} rows"
+            "{diff_rows} {more_fewer} rows".format(
+                diff_rows=abs(diff_rows),
+                more_fewer='more' if diff_rows>0 else 'fewer'
+            )
        )
    if (diff_cols):
        differences.append(
-            f"{abs(diff_cols)} "
-            f"{'more' if diff_cols>0 else 'fewer'} columns"
+            "{diff_cols} {more_fewer} columns".format(
+                diff_cols=abs(diff_cols),
+                more_fewer='more' if diff_cols>0 else 'fewer'
+            )
        )
    if differences:
        differences_str = " and ".join(differences)
-        print(f"Right has {differences_str} than left "
-              f"[{right.shape[0]},{right.shape[1]}] vs "
-              f"[{left.shape[0]},{left.shape[1]}]")
+        print(
+            "Right has {differences_str} than left "
+            "{shape_right} vs {shape_left}".format(
+                differences_str=differences_str,
+                shape_right=[right.shape[0], right.shape[1]],
+                shape_left=[left.shape[0], left.shape[1]]
+            )
+        )

    table1, table2 = [left, right]
    name_table1, name_table2 = ["left", "right"]
@ -1070,8 +1083,11 @@ def compare_tables(left, right):
                    diff_df[name_table2] = lcol
                    diff_df["Match"] = lcol == scol
                    print(
-                        f"Column {i} different:\n"
-                        f"{diff_df}"
+                        "Column {i} different:\n"
+                        "{diff_df}".format(
+                            i=i,
+                            diff_df=diff_df
+                        )
                    )
                    break
            else: