Further refactor

Move common parse error stats computation to the base parser. Move copy_spanning_text logic to the table.
2020-04-19 13:28:17 -07:00 · 2020-04-19 13:28:17 -07:00 · cff7a9698b
parent 583868756a
commit cff7a9698b
4 changed files with 41 additions and 72 deletions
--- a/camelot/core.py
+++ b/camelot/core.py
@ -419,6 +419,12 @@ class Table(object):
        self.flavor = parser.id
        self.filename = parser.filename
        self.debug_info = parser.debug_info
        pos_errors = parser.compute_parse_errors(self)
        self.accuracy = compute_accuracy([[100, pos_errors]])
        if parser.copy_text is not None:
            self.copy_spanning_text(parser.copy_text)
        data = self.data
        self.df = pd.DataFrame(data)
        self.shape = self.df.shape
@ -712,6 +718,37 @@ class Table(object):
        conn.commit()
        conn.close()
    def copy_spanning_text(self, copy_text=None):
        """Copies over text in empty spanning cells.

        Parameters
        ----------
        copy_text : list, optional (default: None)
            {'h', 'v'}
            Select one or more strings from above and pass them as a list
            to specify the direction in which text should be copied over
            when a cell spans multiple rows or columns. When None, nothing
            is copied.

        Returns
        -------
        t : camelot.core.Table
            This table; its cells are modified in place.

        """
        # The declared default is None; treat it as "nothing to copy"
        # rather than failing when trying to iterate over it.
        if copy_text is None:
            return self

        for direction in copy_text:
            if direction == "h":
                for row in self.cells:
                    for j, cell in enumerate(row):
                        # Blank cell flagged as horizontally spanning whose
                        # `left` flag is unset: reuse the text of the cell
                        # immediately before it in the same row.
                        if (
                            cell.text.strip() == ""
                            and cell.hspan
                            and not cell.left
                        ):
                            cell.text = row[j - 1].text
            elif direction == "v":
                for i, row in enumerate(self.cells):
                    for j, cell in enumerate(row):
                        # Blank cell flagged as vertically spanning whose
                        # `top` flag is unset: reuse the text of the cell
                        # at the same column in the previous row.
                        if (
                            cell.text.strip() == ""
                            and cell.vspan
                            and not cell.top
                        ):
                            cell.text = self.cells[i - 1][j].text
        return self
 class TableList(object):
    """Defines a list of camelot.core.Table objects. Each table can
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@ -16,6 +16,7 @@ class BaseParser(object):
        parser_id,
        table_regions=None,
        table_areas=None,
        copy_text=None,
        split_text=False,
        strip_text="",
        shift_text=None,
@ -25,6 +26,7 @@ class BaseParser(object):
        self.table_regions = table_regions
        self.table_areas = table_areas
        self.copy_text = copy_text
        self.split_text = split_text
        self.strip_text = strip_text
        self.shift_text = shift_text
@ -86,7 +88,7 @@ class BaseParser(object):
        """
        return idx
-    def _compute_parse_errors(self, table):
+    def compute_parse_errors(self, table):
        pos_errors = []
        # TODO: have a single list in place of two directional ones?
        # sorted on x-coordinate based on reading order i.e. LTR or RTL
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@ -120,12 +120,12 @@ class Lattice(BaseParser):
            table_areas=table_areas,
            split_text=split_text,
            strip_text=strip_text,
            copy_text=copy_text,
            shift_text=shift_text or ["l", "t"],
            flag_size=flag_size,
        )
        self.process_background = process_background
        self.line_scale = line_scale
        self.copy_text = copy_text
        self.line_tol = line_tol
        self.joint_tol = joint_tol
        self.threshold_blocksize = threshold_blocksize
@ -180,40 +180,6 @@ class Lattice(BaseParser):
            indices.append((r_idx, c_idx, text))
        return indices
    @staticmethod
    def _copy_spanning_text(t, copy_text=None):
        """Copies over text in empty spanning cells.
        Parameters
        ----------
        t : camelot.core.Table
        copy_text : list, optional (default: None)
            {'h', 'v'}
            Select one or more strings from above and pass them as a list
            to specify the direction in which text should be copied over
            when a cell spans multiple rows or columns.
        Returns
        -------
        t : camelot.core.Table
        """
        for f in copy_text:
            if f == "h":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].text.strip() == "":
                            if t.cells[i][j].hspan and not t.cells[i][j].left:
                                t.cells[i][j].text = t.cells[i][j - 1].text
            elif f == "v":
                for i in range(len(t.cells)):
                    for j in range(len(t.cells[i])):
                        if t.cells[i][j].text.strip() == "":
                            if t.cells[i][j].vspan and not t.cells[i][j].top:
                                t.cells[i][j].text = t.cells[i - 1][j].text
        return t
    def _generate_table_bbox(self):
        def scale_areas(areas):
            scaled_areas = []
@ -342,37 +308,7 @@ class Lattice(BaseParser):
        # set spanning cells to True
        table = table.set_span()
        pos_errors = []
        # TODO: have a single list in place of two directional ones?
        # sorted on x-coordinate based on reading order i.e. LTR or RTL
        for direction in ["vertical", "horizontal"]:
            for t in self.t_bbox[direction]:
                indices, error = get_table_index(
                    table,
                    t,
                    direction,
                    split_text=self.split_text,
                    flag_size=self.flag_size,
                    strip_text=self.strip_text,
                )
                if indices[:2] != (-1, -1):
                    pos_errors.append(error)
                    indices = Lattice._reduce_index(
                        table, indices, shift_text=self.shift_text
                    )
                    for r_idx, c_idx, text in indices:
                        table.cells[r_idx][c_idx].text = text
        # FRHTODO
        accuracy = compute_accuracy([[100, pos_errors]])
        if self.copy_text is not None:
            table = Lattice._copy_spanning_text(
                table,
                copy_text=self.copy_text
            )
        table.record_parse_metadata(self)
        table.accuracy = accuracy
        # for plotting
        _text = []
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@ -419,14 +419,8 @@ class Stream(BaseParser):
    def _generate_table(self, table_idx, cols, rows, **kwargs):
        table = self._initialize_new_table(table_idx, cols, rows)
        table = table.set_all_edges()
        pos_errors = self._compute_parse_errors(table)
        accuracy = compute_accuracy([[100, pos_errors]])
        table.record_parse_metadata(self)
        table.accuracy = accuracy
        # for plotting
        _text = []
        _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])