Prep work for new hybrid parser introduction

Refactor parsers by moving common code to the base class. Maintain Python 3.5 compatibility by removing f-strings (f"{}").
2020-04-19 11:32:22 -07:00 · 2020-04-19 11:32:22 -07:00 · 37483ca202
parent 161f71230d
commit 37483ca202
5 changed files with 106 additions and 47 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -413,7 +413,7 @@ class Table(object):
        }
        return report
-    def record_metadata(self, parser):
+    def record_parse_metadata(self, parser):
        """Record data about the origin of the table
        """
        self.flavor = parser.id
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -3,7 +3,8 @@
 import os
 from ..utils import (
-    get_text_objects
+    get_text_objects,
    get_table_index
 )
 from ..core import Table
@ -11,8 +12,26 @@ from ..core import Table
 class BaseParser(object):
    """Defines a base parser.
    """
-    def __init__(self, parser_id):
+    def __init__(self,
        parser_id,
        table_regions=None,
        table_areas=None,
        split_text=False,
        strip_text="",
        shift_text=None,
        flag_size=False,
    ):
        self.id = parser_id
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.split_text = split_text
        self.strip_text = strip_text
        self.shift_text = shift_text
        self.flag_size = flag_size
        self.t_bbox = None
        # For plotting details of parsing algorithms
        self.debug_info = {}
@ -57,3 +76,38 @@ class BaseParser(object):
        table.page = self.page
        table.order = table_idx + 1
        return table
    @staticmethod
    def _reduce_index(t, idx, shift_text):
        """Reduces index of a text object if it lies within a spanning
        cell.

        Only useful for some parsers (e.g. Lattice); this base method is a
        no-op that hands ``idx`` back untouched.

        Parameters
        ----------
        t : object
            Ignored by the base implementation. ``_compute_parse_errors``
            passes the table that is being filled.
        idx : object
            Index data to adjust. ``_compute_parse_errors`` passes the
            indices returned by ``get_table_index``.
        shift_text : object
            Ignored by the base implementation. ``_compute_parse_errors``
            passes ``self.shift_text``.

        Returns
        -------
        idx : object
            The ``idx`` argument, unchanged.

        """
        return idx
    def _compute_parse_errors(self, table):
        """Write text objects into the cells of a table and collect the
        errors reported while locating them.

        Every text object in ``self.t_bbox["vertical"]`` and then
        ``self.t_bbox["horizontal"]`` is passed to ``get_table_index``
        together with the parser's ``split_text``, ``flag_size`` and
        ``strip_text`` settings.  The returned indices go through
        ``_reduce_index`` and the resulting text is stored on
        ``table.cells[r_idx][c_idx].text``.

        ``self.t_bbox`` must have been populated beforehand; it is ``None``
        right after ``__init__``.

        Parameters
        ----------
        table : object
            Table to fill.  It is modified in place (only the ``text``
            attribute of its cells is assigned here).

        Returns
        -------
        pos_errors : list
            The error values returned by ``get_table_index``, one for each
            text object that passed the guard below.

        """
        pos_errors = []
        # TODO: have a single list in place of two directional ones?
        # sorted on x-coordinate based on reading order i.e. LTR or RTL
        for direction in ["vertical", "horizontal"]:
            for t in self.t_bbox[direction]:
                indices, error = get_table_index(
                    table,
                    t,
                    direction,
                    split_text=self.split_text,
                    flag_size=self.flag_size,
                    strip_text=self.strip_text,
                )
                # NOTE(review): `indices` is unpacked further down as an
                # iterable of (row, col, text) triples, so comparing its
                # first two elements to a tuple of two ints looks like it
                # can never be equal -- confirm this guard filters what it
                # is meant to filter.
                if indices[:2] != (-1, -1):
                    pos_errors.append(error)
                    # Looked up on the concrete class so that a subclass
                    # overriding the static method gets its own version.
                    indices = type(self)._reduce_index(
                        table,
                        indices,
                        shift_text=self.shift_text
                    )
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        return pos_errors
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -114,16 +114,18 @@ class Lattice(BaseParser):
        resolution=300,
        **kwargs
    ):
-        super().__init__("lattice")
+        super().__init__(
-        self.table_regions = table_regions
+            "lattice",
-        self.table_areas = table_areas
+            table_regions=table_regions,
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            shift_text=shift_text or ["l", "t"],
            flag_size=flag_size,
        )
        self.process_background = process_background
        self.line_scale = line_scale
        self.copy_text = copy_text
        self.shift_text = shift_text or ["l", "t"]
        self.split_text = split_text
        self.flag_size = flag_size
        self.strip_text = strip_text
        self.line_tol = line_tol
        self.joint_tol = joint_tol
        self.threshold_blocksize = threshold_blocksize
@ -178,6 +180,7 @@ class Lattice(BaseParser):
            indices.append((r_idx, c_idx, text))
        return indices
    @staticmethod
    def _copy_spanning_text(t, copy_text=None):
        """Copies over text in empty spanning cells.
@ -368,7 +371,7 @@ class Lattice(BaseParser):
                copy_text=self.copy_text
            )
-        table.record_metadata(self)
+        table.record_parse_metadata(self)
        table.accuracy = accuracy
        # for plotting
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -10,7 +10,7 @@ import pandas as pd
 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
+from ..utils import (text_in_bbox, compute_accuracy,
                     compute_whitespace)
@ -61,22 +61,24 @@ class Stream(BaseParser):
        table_regions=None,
        table_areas=None,
        columns=None,
        split_text=False,
        flag_size=False,
        split_text=False,
        strip_text="",
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        **kwargs
    ):
-        super().__init__("stream")
+        super().__init__(
-        self.table_regions = table_regions
+            "stream",
-        self.table_areas = table_areas
+            table_regions=table_regions,
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
        )
        self.columns = columns
        self._validate_columns()
        self.split_text = split_text
        self.flag_size = flag_size
        self.strip_text = strip_text
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol
@ -418,26 +420,10 @@ class Stream(BaseParser):
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
-        pos_errors = []
+        pos_errors = self._compute_parse_errors(table)
        # TODO: have a single list in place of two directional ones?
        # sorted on x-coordinate based on reading order i.e. LTR or RTL
        for direction in ["vertical", "horizontal"]:
            for t in self.t_bbox[direction]:
                indices, error = get_table_index(
                    table,
                    t,
                    direction,
                    split_text=self.split_text,
                    flag_size=self.flag_size,
                    strip_text=self.strip_text,
                )
                if indices[:2] != (-1, -1):
                    pos_errors.append(error)
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        accuracy = compute_accuracy([[100, pos_errors]])
-        table.record_metadata(self)
+        table.record_parse_metadata(self)
        table.accuracy = accuracy
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -1018,7 +1018,11 @@ def export_pdf_as_png(pdf_path, destination_path):
    pdf_path : str
    destination_path : str
    """
-    gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"
+    gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
        .format(
            destination_path=destination_path,
            pdf_path=pdf_path
        )
    gs_call = gs_call.encode().split()
    null = open(os.devnull, "wb")
    Ghostscript(*gs_call, stdout=null)
@ -1038,19 +1042,28 @@ def compare_tables(left, right):
    differences = []
    if (diff_rows):
        differences.append(
-            f"{abs(diff_rows)} "
+            "{diff_rows} {more_fewer} rows".format(
-            f"{'more' if diff_rows>0 else 'fewer'} rows"
+                diff_rows=abs(diff_rows),
                more_fewer='more' if diff_rows>0 else 'fewer'
            )
        )
    if (diff_cols):
        differences.append(
-            f"{abs(diff_cols)} "
+            "{diff_cols} {more_fewer} columns".format(
-            f"{'more' if diff_cols>0 else 'fewer'} columns"
+                diff_cols=abs(diff_cols),
                more_fewer='more' if diff_cols>0 else 'fewer'
            )
        )
    if differences:
        differences_str = " and ".join(differences)
-        print(f"Right has {differences_str} than left "
+        print(
-              f"[{right.shape[0]},{right.shape[1]}] vs "
+            "Right has {differences_str} than left "
-              f"[{left.shape[0]},{left.shape[1]}]")
+            "{shape_right} vs {shape_left}".format(
                differences_str=differences_str,
                shape_right=[right.shape[0], right.shape[1]],
                shape_left=[left.shape[0], left.shape[1]]
            )
        )
    table1, table2 = [left, right]
    name_table1, name_table2 = ["left", "right"]
@ -1070,8 +1083,11 @@ def compare_tables(left, right):
                    diff_df[name_table2] = lcol
                    diff_df["Match"] = lcol == scol
                    print(
-                        f"Column {i} different:\n"
+                        "Column {i} different:\n"
-                        f"{diff_df}"
+                        "{diff_df}".format(
                            i=i,
                            diff_df=diff_df
                        )
                    )
                    break
            else: