diff --git a/camelot/core.py b/camelot/core.py
index 1ce71ab..8f7cbaf 100644
--- a/camelot/core.py
+++ b/camelot/core.py
@@ -413,7 +413,7 @@ class Table(object):
         }
         return report
 
-    def record_metadata(self, parser):
+    def record_parse_metadata(self, parser):
         """Record data about the origin of the table
         """
         self.flavor = parser.id
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
index bd41fc3..c50a164 100644
--- a/camelot/parsers/base.py
+++ b/camelot/parsers/base.py
@@ -3,7 +3,8 @@
 import os
 
 from ..utils import (
-    get_text_objects
+    get_text_objects,
+    get_table_index
 )
 from ..core import Table
 
@@ -11,8 +12,26 @@ from ..core import Table
 class BaseParser(object):
     """Defines a base parser.
     """
-    def __init__(self, parser_id):
+    def __init__(self,
+                 parser_id,
+                 table_regions=None,
+                 table_areas=None,
+                 split_text=False,
+                 strip_text="",
+                 shift_text=None,
+                 flag_size=False,
+                 ):
         self.id = parser_id
+        self.table_regions = table_regions
+        self.table_areas = table_areas
+
+        self.split_text = split_text
+        self.strip_text = strip_text
+        self.shift_text = shift_text
+
+        self.flag_size = flag_size
+
+        self.t_bbox = None
 
         # For plotting details of parsing algorithms
         self.debug_info = {}
@@ -57,3 +76,38 @@ class BaseParser(object):
         table.page = self.page
         table.order = table_idx + 1
         return table
+
+
+    @staticmethod
+    def _reduce_index(t, idx, shift_text):
+        """Reduces index of a text object if it lies within a spanning
+        cell. Only useful for some parsers (e.g. Lattice), base method is a
+        noop.
+        """
+        return idx
+
+    def _compute_parse_errors(self, table):
+        pos_errors = []
+        # TODO: have a single list in place of two directional ones?
+        # sorted on x-coordinate based on reading order i.e. LTR or RTL
+        for direction in ["vertical", "horizontal"]:
+            for t in self.t_bbox[direction]:
+                indices, error = get_table_index(
+                    table,
+                    t,
+                    direction,
+                    split_text=self.split_text,
+                    flag_size=self.flag_size,
+                    strip_text=self.strip_text,
+                )
+                if indices[:2] != (-1, -1):
+                    pos_errors.append(error)
+                    indices = type(self)._reduce_index(
+                        table,
+                        indices,
+                        shift_text=self.shift_text
+                    )
+                    for r_idx, c_idx, text in indices:
+                        table.cells[r_idx][c_idx].text = text
+        return pos_errors
+
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index 5bb130b..c294b55 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -114,16 +114,18 @@ class Lattice(BaseParser):
         resolution=300,
         **kwargs
     ):
-        super().__init__("lattice")
-        self.table_regions = table_regions
-        self.table_areas = table_areas
+        super().__init__(
+            "lattice",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            shift_text=shift_text or ["l", "t"],
+            flag_size=flag_size,
+        )
         self.process_background = process_background
         self.line_scale = line_scale
         self.copy_text = copy_text
-        self.shift_text = shift_text or ["l", "t"]
-        self.split_text = split_text
-        self.flag_size = flag_size
-        self.strip_text = strip_text
         self.line_tol = line_tol
         self.joint_tol = joint_tol
         self.threshold_blocksize = threshold_blocksize
@@ -178,6 +180,7 @@ class Lattice(BaseParser):
                 indices.append((r_idx, c_idx, text))
         return indices
 
+
     @staticmethod
     def _copy_spanning_text(t, copy_text=None):
         """Copies over text in empty spanning cells.
@@ -368,7 +371,7 @@ class Lattice(BaseParser):
                 copy_text=self.copy_text
             )
 
-        table.record_metadata(self)
+        table.record_parse_metadata(self)
         table.accuracy = accuracy
 
         # for plotting
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 554e2f8..0d507c5 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -10,7 +10,7 @@ import pandas as pd
 
 from .base import BaseParser
 from ..core import TextEdges
-from ..utils import (text_in_bbox, get_table_index, compute_accuracy,
+from ..utils import (text_in_bbox, compute_accuracy,
                      compute_whitespace)
 
 
@@ -61,22 +61,24 @@ class Stream(BaseParser):
         table_regions=None,
         table_areas=None,
         columns=None,
-        split_text=False,
         flag_size=False,
+        split_text=False,
         strip_text="",
         edge_tol=50,
         row_tol=2,
         column_tol=0,
         **kwargs
     ):
-        super().__init__("stream")
-        self.table_regions = table_regions
-        self.table_areas = table_areas
+        super().__init__(
+            "stream",
+            table_regions=table_regions,
+            table_areas=table_areas,
+            split_text=split_text,
+            strip_text=strip_text,
+            flag_size=flag_size,
+        )
         self.columns = columns
         self._validate_columns()
-        self.split_text = split_text
-        self.flag_size = flag_size
-        self.strip_text = strip_text
         self.edge_tol = edge_tol
         self.row_tol = row_tol
         self.column_tol = column_tol
@@ -418,26 +420,10 @@ class Stream(BaseParser):
         table = self._initialize_new_table(table_idx, cols, rows)
         table = table.set_all_edges()
 
-        pos_errors = []
-        # TODO: have a single list in place of two directional ones?
-        # sorted on x-coordinate based on reading order i.e. LTR or RTL
-        for direction in ["vertical", "horizontal"]:
-            for t in self.t_bbox[direction]:
-                indices, error = get_table_index(
-                    table,
-                    t,
-                    direction,
-                    split_text=self.split_text,
-                    flag_size=self.flag_size,
-                    strip_text=self.strip_text,
-                )
-                if indices[:2] != (-1, -1):
-                    pos_errors.append(error)
-                    for r_idx, c_idx, text in indices:
-                        table.cells[r_idx][c_idx].text = text
+        pos_errors = self._compute_parse_errors(table)
         accuracy = compute_accuracy([[100, pos_errors]])
 
-        table.record_metadata(self)
+        table.record_parse_metadata(self)
         table.accuracy = accuracy
 
 
diff --git a/camelot/utils.py b/camelot/utils.py
index 89b6eee..cc4a58c 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -1018,7 +1018,11 @@ def export_pdf_as_png(pdf_path, destination_path):
     pdf_path : str
     destination_path : str
     """
-    gs_call = f"-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"
+    gs_call = "-q -sDEVICE=png16m -o {destination_path} -r300 {pdf_path}"\
+        .format(
+            destination_path=destination_path,
+            pdf_path=pdf_path
+        )
     gs_call = gs_call.encode().split()
     null = open(os.devnull, "wb")
     Ghostscript(*gs_call, stdout=null)
@@ -1038,19 +1042,28 @@ def compare_tables(left, right):
     differences = []
     if (diff_rows):
         differences.append(
-            f"{abs(diff_rows)} "
-            f"{'more' if diff_rows>0 else 'fewer'} rows"
+            "{diff_rows} {more_fewer} rows".format(
+                diff_rows=abs(diff_rows),
+                more_fewer='more' if diff_rows>0 else 'fewer'
+            )
         )
     if (diff_cols):
         differences.append(
-            f"{abs(diff_cols)} "
-            f"{'more' if diff_cols>0 else 'fewer'} columns"
+            "{diff_cols} {more_fewer} columns".format(
+                diff_cols=abs(diff_cols),
+                more_fewer='more' if diff_cols>0 else 'fewer'
+            )
         )
     if differences:
         differences_str = " and ".join(differences)
-        print(f"Right has {differences_str} than left "
-              f"[{right.shape[0]},{right.shape[1]}] vs "
-              f"[{left.shape[0]},{left.shape[1]}]")
+        print(
+            "Right has {differences_str} than left "
+            "{shape_right} vs {shape_left}".format(
+                differences_str=differences_str,
+                shape_right=[right.shape[0], right.shape[1]],
+                shape_left=[left.shape[0], left.shape[1]]
+            )
+        )
 
     table1, table2 = [left, right]
     name_table1, name_table2 = ["left", "right"]
@@ -1070,8 +1083,11 @@ def compare_tables(left, right):
             diff_df[name_table2] = lcol
             diff_df["Match"] = lcol == scol
             print(
-                f"Column {i} different:\n"
-                f"{diff_df}"
+                "Column {i} different:\n"
+                "{diff_df}".format(
+                    i=i,
+                    diff_df=diff_df
+                )
             )
             break
     else: