From 18581640be206161bac893c25d64e2069a81924c Mon Sep 17 00:00:00 2001
From: Frh
Date: Fri, 24 Apr 2020 15:54:58 -0700
Subject: [PATCH] Common parent TextBaseParser for Stream and Hybrid

---
Note: TextBaseParser.__init__ forwards its parser_id argument to
BaseParser.__init__ instead of the literal "stream", so that Hybrid
(which passes "hybrid") is not initialised with the Stream parser id.

 camelot/core.py            |  2 +-
 camelot/parsers/base.py    | 59 ++++++++++++++++++++++++++++++++++++++
 camelot/parsers/hybrid.py  | 46 +++++++----------------------
 camelot/parsers/lattice.py | 31 ++++----------------
 camelot/parsers/stream.py  | 41 +++++++-------------------
 5 files changed, 87 insertions(+), 92 deletions(-)

diff --git a/camelot/core.py b/camelot/core.py
index 440b2c9..9921b95 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -136,7 +136,7 @@ class TextEdge(TextAlignment):
 
 
 class TextAlignments(object):
-    """Defines a dict of text edges accross alignment references.
+    """Defines a dict of text edges across reference alignments.
     """
 
     def __init__(self, alignment_names):
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index 9e76c7b..e4b5071 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -169,3 +169,62 @@ class BaseParser(object):
         for r_idx, c_idx, text in indices:
             table.cells[r_idx][c_idx].text = text
         return pos_errors
+
+    def extract_tables(self):
+        if self._document_has_no_text():
+            return []
+
+        # Identify plausible areas within the doc where tables lie,
+        # populate table_bbox keys with these areas.
+        self._generate_table_bbox()
+
+        _tables = []
+        # sort tables based on y-coord
+        for table_idx, bbox in enumerate(
+            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
+        ):
+            cols, rows, v_s, h_s = self._generate_columns_and_rows(
+                bbox,
+                table_idx
+            )
+            table = self._generate_table(
+                table_idx, cols, rows, v_s=v_s, h_s=h_s)
+            table._bbox = bbox
+            _tables.append(table)
+
+        return _tables
+
+
+class TextBaseParser(BaseParser):
+    """Base class for all text parsers.
+    """
+
+    def __init__(
+        self,
+        parser_id,
+        table_regions=None,
+        table_areas=None,
+        columns=None,
+        flag_size=False,
+        split_text=False,
+        strip_text="",
+        edge_tol=50,
+        row_tol=2,
+        column_tol=0,
+        **kwargs
+    ):
+        super().__init__(
+            parser_id,
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            flag_size=flag_size,
+        )
+        self.columns = columns
+        self._validate_columns()
+        self.edge_tol = edge_tol
+        self.row_tol = row_tol
+        self.column_tol = column_tol
+
+        self.textedges = None
diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py
index 869e2d2..898cfc0 100644
--- a/camelot/parsers/hybrid.py
+++ b/camelot/parsers/hybrid.py
@@ -7,7 +7,7 @@ import numpy as np
 import copy
 import warnings
 
-from .base import BaseParser
+from .base import TextBaseParser
 from ..core import (
     TextAlignments,
     ALL_ALIGNMENTS,
@@ -257,7 +257,7 @@ class TextNetworks(TextAlignments):
         for align_id in self._textedges:
             edge_array = self._textedges[align_id]
             gaps = []
-            vertical = align_id in ["left", "right", "middle"]
+            vertical = align_id in HORIZONTAL_ALIGNMENTS
             sort_function = (lambda tl: tl.y0) \
                 if vertical \
                 else (lambda tl: tl.x0)
@@ -491,7 +491,7 @@ class TextNetworks(TextAlignments):
         )
 
 
-class Hybrid(BaseParser):
+class Hybrid(TextBaseParser):
     """Hybrid method of parsing looks for spaces between text
     to parse the table.
 
@@ -548,18 +548,14 @@ class Hybrid(BaseParser):
             "hybrid",
             table_regions=table_regions,
             table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
             split_text=split_text,
             strip_text=strip_text,
-            flag_size=flag_size,
-            debug=debug
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
         )
-        self.columns = columns
-        self.textedges = None
-
-        self._validate_columns()
-        self.edge_tol = edge_tol
-        self.row_tol = row_tol
-        self.column_tol = column_tol
 
     # FRHTODO: Check if needed, refactor with Stream
     @staticmethod
@@ -832,10 +828,10 @@ class Hybrid(BaseParser):
             ))
 
     # FRHTODO: Check is needed, refactor with Stream
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
         # select elements which lie within table_bbox
         self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
             self.horizontal_text,
             self.vertical_text
         )
@@ -908,7 +904,7 @@ class Hybrid(BaseParser):
                 cols = self._add_columns(cols, inner_text, self.row_tol)
                 cols = self._join_columns(cols, text_x_min, text_x_max)
 
-        return cols, rows
+        return cols, rows, None, None
 
     # FRHTODO: Check is needed, refactor with Stream
     def _generate_table(self, table_idx, cols, rows, **kwargs):
@@ -922,23 +918,3 @@ class Hybrid(BaseParser):
         table._textedges = self.textedges
 
         return table
-
-    def extract_tables(self):
-        if self._document_has_no_text():
-            return []
-
-        # Identify plausible areas within the doc where tables lie,
-        # populate table_bbox keys with these areas.
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, bbox in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
-            table = self._generate_table(table_idx, cols, rows)
-            table._bbox = bbox
-            _tables.append(table)
-
-        return _tables
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index 2e62846..d6ba65d 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -252,21 +252,21 @@ class Lattice(BaseParser):
                 table_bbox, vertical_segments, horizontal_segments, pdf_scalers
             )
 
-    def _generate_columns_and_rows(self, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
         # select elements which lie within table_bbox
         v_s, h_s = segments_in_bbox(
-            tk, self.vertical_segments, self.horizontal_segments
+            bbox, self.vertical_segments, self.horizontal_segments
         )
         self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
             self.horizontal_text,
             self.vertical_text
         )
 
-        cols, rows = zip(*self.table_bbox[tk])
+        cols, rows = zip(*self.table_bbox[bbox])
         cols, rows = list(cols), list(rows)
-        cols.extend([tk[0], tk[2]])
-        rows.extend([tk[1], tk[3]])
+        cols.extend([bbox[0], bbox[2]])
+        rows.extend([bbox[1], bbox[3]])
         # sort horizontal and vertical segments
         cols = merge_close_lines(sorted(cols), line_tol=self.line_tol)
         rows = merge_close_lines(
@@ -302,22 +302,3 @@ class Lattice(BaseParser):
         table._textedges = None
 
         return table
-
-    def extract_tables(self):
-        if self._document_has_no_text():
-            return []
-
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, tk in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows, v_s, h_s = self._generate_columns_and_rows(tk)
-            table = self._generate_table(
-                table_idx, cols, rows, v_s=v_s, h_s=h_s)
-            table._bbox = tk
-            _tables.append(table)
-
-        return _tables
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 08aaa41..91e2fde 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -5,7 +5,7 @@ import warnings
 
 import numpy as np
 
-from .base import BaseParser
+from .base import TextBaseParser
 from ..core import TextEdges
 from ..utils import (
     bbox_from_str,
@@ -15,7 +15,7 @@ from ..utils import (
 )
 
 
-class Stream(BaseParser):
+class Stream(TextBaseParser):
     """Stream method of parsing looks for spaces between text
     to parse the table.
 
@@ -71,15 +71,14 @@ class Stream(BaseParser):
             "stream",
             table_regions=table_regions,
             table_areas=table_areas,
+            columns=columns,
+            flag_size=flag_size,
             split_text=split_text,
             strip_text=strip_text,
-            flag_size=flag_size,
+            edge_tol=edge_tol,
+            row_tol=row_tol,
+            column_tol=column_tol,
         )
-        self.columns = columns
-        self._validate_columns()
-        self.edge_tol = edge_tol
-        self.row_tol = row_tol
-        self.column_tol = column_tol
 
     @staticmethod
     def _group_rows(text, row_tol=2):
@@ -302,10 +301,10 @@ class Stream(BaseParser):
                 table_bbox[bbox_from_str(area_str)] = None
         self.table_bbox = table_bbox
 
-    def _generate_columns_and_rows(self, table_idx, tk):
+    def _generate_columns_and_rows(self, bbox, table_idx):
         # select elements which lie within table_bbox
         self.t_bbox = text_in_bbox_per_axis(
-            tk,
+            bbox,
             self.horizontal_text,
             self.vertical_text
         )
@@ -378,7 +377,7 @@ class Stream(BaseParser):
                 cols = self._add_columns(cols, inner_text, self.row_tol)
                 cols = self._join_columns(cols, text_x_min, text_x_max)
 
-        return cols, rows
+        return cols, rows, None, None
 
     def _generate_table(self, table_idx, cols, rows, **kwargs):
         table = self._initialize_new_table(table_idx, cols, rows)
@@ -391,23 +390,3 @@ class Stream(BaseParser):
         table._textedges = self.textedges
 
         return table
-
-    def extract_tables(self):
-        if self._document_has_no_text():
-            return []
-
-        # Identify plausible areas within the doc where tables lie,
-        # populate table_bbox keys with these areas.
-        self._generate_table_bbox()
-
-        _tables = []
-        # sort tables based on y-coord
-        for table_idx, bbox in enumerate(
-            sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
-        ):
-            cols, rows = self._generate_columns_and_rows(table_idx, bbox)
-            table = self._generate_table(table_idx, cols, rows)
-            table._bbox = bbox
-            _tables.append(table)
-
-        return _tables