From cff7a9698b6462d140979d722fe738a0f16dbe3a Mon Sep 17 00:00:00 2001 From: Frh Date: Sun, 19 Apr 2020 13:28:17 -0700 Subject: [PATCH] Further refactor Move common parse error stats computation to base parser Move copy_spanning_text logic to the table --- camelot/core.py | 37 +++++++++++++++++++++ camelot/parsers/base.py | 4 ++- camelot/parsers/lattice.py | 66 +------------------------------------- camelot/parsers/stream.py | 6 ---- 4 files changed, 41 insertions(+), 72 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 8f7cbaf..94d49e9 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -419,6 +419,12 @@ class Table(object): self.flavor = parser.id self.filename = parser.filename self.debug_info = parser.debug_info + pos_errors = parser.compute_parse_errors(self) + self.accuracy = compute_accuracy([[100, pos_errors]]) + + if parser.copy_text is not None: + self.copy_spanning_text(parser.copy_text) + data = self.data self.df = pd.DataFrame(data) self.shape = self.df.shape @@ -712,6 +718,37 @@ class Table(object): conn.commit() conn.close() + def copy_spanning_text(self, copy_text=None): + """Copies over text in empty spanning cells. + + Parameters + ---------- + copy_text : list, optional (default: None) + {'h', 'v'} + Select one or more strings from above and pass them as a list + to specify the direction in which text should be copied over + when a cell spans multiple rows or columns. + + Returns + ------- + t : camelot.core.Table + + """ + for f in copy_text: + if f == "h": + for i in range(len(self.cells)): + for j in range(len(self.cells[i])): + if self.cells[i][j].text.strip() == "": + if self.cells[i][j].hspan and not self.cells[i][j].left: + self.cells[i][j].text = self.cells[i][j - 1].text + elif f == "v": + for i in range(len(self.cells)): + for j in range(len(self.cells[i])): + if self.cells[i][j].text.strip() == "": + if self.cells[i][j].vspan and not self.cells[i][j].top: + self.cells[i][j].text = self.cells[i - 1][j].text + return self + class TableList(object): """Defines a list of camelot.core.Table objects. Each table can diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index c50a164..19deceb 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -16,6 +16,7 @@ class BaseParser(object): parser_id, table_regions=None, table_areas=None, + copy_text=None, split_text=False, strip_text="", shift_text=None, @@ -25,6 +26,7 @@ class BaseParser(object): self.table_regions = table_regions self.table_areas = table_areas + self.copy_text = copy_text self.split_text = split_text self.strip_text = strip_text self.shift_text = shift_text @@ -86,7 +88,7 @@ class BaseParser(object): """ return idx - def _compute_parse_errors(self, table): + def compute_parse_errors(self, table): pos_errors = [] # TODO: have a single list in place of two directional ones? # sorted on x-coordinate based on reading order i.e. LTR or RTL diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index c294b55..c0f3e9b 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -120,12 +120,12 @@ class Lattice(BaseParser): table_areas=table_areas, split_text=split_text, strip_text=strip_text, + copy_text=copy_text, shift_text=shift_text or ["l", "t"], flag_size=flag_size, ) self.process_background = process_background self.line_scale = line_scale - self.copy_text = copy_text self.line_tol = line_tol self.joint_tol = joint_tol self.threshold_blocksize = threshold_blocksize @@ -180,40 +180,6 @@ class Lattice(BaseParser): indices.append((r_idx, c_idx, text)) return indices - - @staticmethod - def _copy_spanning_text(t, copy_text=None): - """Copies over text in empty spanning cells. - - Parameters - ---------- - t : camelot.core.Table - copy_text : list, optional (default: None) - {'h', 'v'} - Select one or more strings from above and pass them as a list - to specify the direction in which text should be copied over - when a cell spans multiple rows or columns. - - Returns - ------- - t : camelot.core.Table - - """ - for f in copy_text: - if f == "h": - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].text.strip() == "": - if t.cells[i][j].hspan and not t.cells[i][j].left: - t.cells[i][j].text = t.cells[i][j - 1].text - elif f == "v": - for i in range(len(t.cells)): - for j in range(len(t.cells[i])): - if t.cells[i][j].text.strip() == "": - if t.cells[i][j].vspan and not t.cells[i][j].top: - t.cells[i][j].text = t.cells[i - 1][j].text - return t - def _generate_table_bbox(self): def scale_areas(areas): scaled_areas = [] @@ -342,37 +308,7 @@ class Lattice(BaseParser): # set spanning cells to True table = table.set_span() - pos_errors = [] - # TODO: have a single list in place of two directional ones? - # sorted on x-coordinate based on reading order i.e. LTR or RTL - for direction in ["vertical", "horizontal"]: - for t in self.t_bbox[direction]: - indices, error = get_table_index( - table, - t, - direction, - split_text=self.split_text, - flag_size=self.flag_size, - strip_text=self.strip_text, - ) - if indices[:2] != (-1, -1): - pos_errors.append(error) - indices = Lattice._reduce_index( - table, indices, shift_text=self.shift_text - ) - for r_idx, c_idx, text in indices: - table.cells[r_idx][c_idx].text = text - # FRHTODO - accuracy = compute_accuracy([[100, pos_errors]]) - - if self.copy_text is not None: - table = Lattice._copy_spanning_text( - table, - copy_text=self.copy_text - ) - table.record_parse_metadata(self) - table.accuracy = accuracy # for plotting _text = [] diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 0d507c5..2df3093 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -419,14 +419,8 @@ class Stream(BaseParser): def _generate_table(self, table_idx, cols, rows, **kwargs): table = self._initialize_new_table(table_idx, cols, rows) table = table.set_all_edges() - - pos_errors = self._compute_parse_errors(table) - accuracy = compute_accuracy([[100, pos_errors]]) - table.record_parse_metadata(self) - table.accuracy = accuracy - # for plotting _text = [] _text.extend([(t.x0, t.y0, t.x1, t.y1) for t in self.horizontal_text])