Common parent TextBaseParser for Stream and Hybrid

2020-04-24 15:54:58 -07:00 · 2020-04-24 15:54:58 -07:00 · 18581640be
parent a401d33fd9
commit 18581640be
5 changed files with 87 additions and 92 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -136,7 +136,7 @@ class TextEdge(TextAlignment):
 class TextAlignments(object):
-    """Defines a dict of text edges accross alignment references.
+    """Defines a dict of text edges across reference alignments.
    """
    def __init__(self, alignment_names):
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -169,3 +169,62 @@ class BaseParser(object):
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        return pos_errors
    def extract_tables(self):
        """Build one table for every table area found in the document.

        When the document has no text, nothing is attempted and an empty
        list is returned. Otherwise the candidate table areas are
        computed, ordered on their y-coord (``bbox[1]``, largest first),
        and each area is turned into a table by the parser-specific
        ``_generate_columns_and_rows`` and ``_generate_table`` hooks.

        Returns
        -------
        list
            The generated tables, in the order the areas were visited.
            Each table has ``_bbox`` set to the area it was built from.

        """
        if self._document_has_no_text():
            return []

        # Populate self.table_bbox: its keys are the plausible areas
        # within the doc where tables lie.
        self._generate_table_bbox()

        # The table index handed to the hooks follows this ordering.
        ordered_areas = sorted(
            self.table_bbox.keys(),
            key=lambda area: area[1],
            reverse=True
        )

        _tables = []
        for table_idx, bbox in enumerate(ordered_areas):
            layout = self._generate_columns_and_rows(bbox, table_idx)
            cols, rows, v_s, h_s = layout
            table = self._generate_table(
                table_idx, cols, rows, v_s=v_s, h_s=h_s
            )
            table._bbox = bbox
            _tables.append(table)
        return _tables
 class TextBaseParser(BaseParser):
    """Base class for all text parsers.
    """
    def __init__(
        self,
        parser_id,
        table_regions=None,
        table_areas=None,
        columns=None,
        flag_size=False,
        split_text=False,
        strip_text="",
        edge_tol=50,
        row_tol=2,
        column_tol=0,
        **kwargs
    ):
        """Initialize the state shared by the text-based parsers.

        Parameters
        ----------
        parser_id : str
            Identifier of the concrete parser (e.g. ``"stream"`` or
            ``"hybrid"``); forwarded to ``BaseParser.__init__``.
        table_regions, table_areas, split_text, strip_text, flag_size
            Forwarded unchanged to ``BaseParser.__init__``.
        columns
            Stored on the instance, then checked by
            ``_validate_columns``.
        edge_tol, row_tol, column_tol
            Tolerances stored on the instance for use by subclasses.
        **kwargs
            Accepted so subclasses can pass extra options through;
            ignored here.

        """
        # Forward the caller's parser_id: hard-coding "stream" here would
        # make every subclass (Hybrid included) identify as Stream.
        super().__init__(
            parser_id,
            table_regions=table_regions,
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            flag_size=flag_size,
        )
        # columns must be set before validation, which reads it.
        self.columns = columns
        self._validate_columns()
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol
        # No text edges are available at construction time.
        self.textedges = None
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@ -7,7 +7,7 @@ import numpy as np
 import copy
 import warnings
-from .base import BaseParser
+from .base import TextBaseParser
 from ..core import (
    TextAlignments,
    ALL_ALIGNMENTS,
@ -257,7 +257,7 @@ class TextNetworks(TextAlignments):
        for align_id in self._textedges:
            edge_array = self._textedges[align_id]
            gaps = []
-            vertical = align_id in ["left", "right", "middle"]
+            vertical = align_id in HORIZONTAL_ALIGNMENTS
            sort_function = (lambda tl: tl.y0) \
                if vertical \
                else (lambda tl: tl.x0)
@ -491,7 +491,7 @@ class TextNetworks(TextAlignments):
            )
-class Hybrid(BaseParser):
+class Hybrid(TextBaseParser):
    """Hybrid method of parsing looks for spaces between text
    to parse the table.
@ -548,18 +548,14 @@ class Hybrid(BaseParser):
            "hybrid",
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
-            flag_size=flag_size,
+            edge_tol=edge_tol,
-            debug=debug
+            row_tol=row_tol,
            column_tol=column_tol,
        )
        self.columns = columns
        self.textedges = None
        self._validate_columns()
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol
    # FRHTODO: Check if needed, refactor with Stream
    @staticmethod
@ -832,10 +828,10 @@ class Hybrid(BaseParser):
            ))
    # FRHTODO: Check if needed, refactor with Stream
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
            self.horizontal_text,
            self.vertical_text
        )
@ -908,7 +904,7 @@ class Hybrid(BaseParser):
            cols = self._add_columns(cols, inner_text, self.row_tol)
            cols = self._join_columns(cols, text_x_min, text_x_max)
-        return cols, rows
+        return cols, rows, None, None
    # FRHTODO: Check if needed, refactor with Stream
    def _generate_table(self, table_idx, cols, rows, **kwargs):
@ -922,23 +918,3 @@ class Hybrid(BaseParser):
        table._textedges = self.textedges
        return table
    def extract_tables(self):
        if self._document_has_no_text():
            return []
        # Identify plausible areas within the doc where tables lie,
        # populate table_bbox keys with these areas.
        self._generate_table_bbox()
        _tables = []
        # sort tables based on y-coord
        for table_idx, bbox in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
            table = self._generate_table(table_idx, cols, rows)
            table._bbox = bbox
            _tables.append(table)
        return _tables
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -252,21 +252,21 @@ class Lattice(BaseParser):
            table_bbox, vertical_segments, horizontal_segments, pdf_scalers
        )
-    def _generate_columns_and_rows(self, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
        # select elements which lie within table_bbox
        v_s, h_s = segments_in_bbox(
-            tk, self.vertical_segments, self.horizontal_segments
+            bbox, self.vertical_segments, self.horizontal_segments
        )
        self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
            self.horizontal_text,
            self.vertical_text
            )
-        cols, rows = zip(*self.table_bbox[tk])
+        cols, rows = zip(*self.table_bbox[bbox])
        cols, rows = list(cols), list(rows)
-        cols.extend([tk[0], tk[2]])
+        cols.extend([bbox[0], bbox[2]])
-        rows.extend([tk[1], tk[3]])
+        rows.extend([bbox[1], bbox[3]])
        # sort horizontal and vertical segments
        cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
        rows = merge_close_lines(
@ -302,22 +302,3 @@ class Lattice(BaseParser):
        table._textedges = None
        return table
    def extract_tables(self):
        if self._document_has_no_text():
            return []
        self._generate_table_bbox()
        _tables = []
        # sort tables based on y-coord
        for table_idx, tk in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
            cols, rows, v_s, h_s = self._generate_columns_and_rows(tk)
            table = self._generate_table(
                table_idx, cols, rows, v_s=v_s, h_s=h_s)
            table._bbox = tk
            _tables.append(table)
        return _tables
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -5,7 +5,7 @@ import warnings
 import numpy as np
-from .base import BaseParser
+from .base import TextBaseParser
 from ..core import TextEdges
 from ..utils import (
    bbox_from_str,
@ -15,7 +15,7 @@ from ..utils import (
 )
-class Stream(BaseParser):
+class Stream(TextBaseParser):
    """Stream method of parsing looks for spaces between text
    to parse the table.
@ -71,15 +71,14 @@ class Stream(BaseParser):
            "stream",
            table_regions=table_regions,
            table_areas=table_areas,
            columns=columns,
            flag_size=flag_size,
            split_text=split_text,
            strip_text=strip_text,
-            flag_size=flag_size,
+            edge_tol=edge_tol,
            row_tol=row_tol,
            column_tol=column_tol,
        )
        self.columns = columns
        self._validate_columns()
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.column_tol = column_tol
    @staticmethod
    def _group_rows(text, row_tol=2):
@ -302,10 +301,10 @@ class Stream(BaseParser):
                table_bbox[bbox_from_str(area_str)] = None
        self.table_bbox = table_bbox
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
        # select elements which lie within table_bbox
        self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
            self.horizontal_text,
            self.vertical_text
        )
@ -378,7 +377,7 @@ class Stream(BaseParser):
            cols = self._add_columns(cols, inner_text, self.row_tol)
            cols = self._join_columns(cols, text_x_min, text_x_max)
-        return cols, rows
+        return cols, rows, None, None
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, cols, rows)
@ -391,23 +390,3 @@ class Stream(BaseParser):
        table._textedges = self.textedges
        return table
    def extract_tables(self):
        if self._document_has_no_text():
            return []
        # Identify plausible areas within the doc where tables lie,
        # populate table_bbox keys with these areas.
        self._generate_table_bbox()
        _tables = []
        # sort tables based on y-coord
        for table_idx, bbox in enumerate(
            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
        ):
            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
            table = self._generate_table(table_idx, cols, rows)
            table._bbox = bbox
            _tables.append(table)
        return _tables