diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 8b0fc2b..727e00b 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -11,7 +11,7 @@ from .base import BaseParser from ..core import Table from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, merge_close_values, get_table_index, compute_accuracy, - count_empty, encode_, setup_logging) + count_empty_strings, encode_, setup_logging) from ..image_processing import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) @@ -194,7 +194,7 @@ class Lattice(BaseParser): table.df = pd.DataFrame(data) table.shape = table.df.shape - whitespace, __, __ = count_empty(data) + whitespace, __, __ = count_empty_strings(data) table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 37e9bcf..9d12384 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -8,7 +8,7 @@ import pandas as pd from .base import BaseParser from ..core import Table from ..utils import (text_in_bbox, get_table_index, compute_accuracy, - count_empty, encode_) + count_empty_strings, encode_) logger = setup_logging(__name__) @@ -207,7 +207,7 @@ class Stream(BaseParser): table.df = pd.DataFrame(data) table.shape = table.df.shape - whitespace, __, __ = count_empty(data) + whitespace, __, __ = count_empty_strings(data) table.accuracy = accuracy table.whitespace = whitespace table.order = table_idx + 1 diff --git a/camelot/utils.py b/camelot/utils.py index 7eaad84..c957a4e 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -287,7 +287,7 @@ def merge_close_values(ar, mtol=2): return ret -def flag_on_size(textline, direction): +def flag_font_size(textline, direction): """ Parameters @@ -381,7 +381,7 @@ def split_textline(table, textline, direction, flag_size=True): grouped_chars = [] for key, chars in groupby(cut_text, itemgetter(0, 1)): if flag_size: - grouped_chars.append((key[0], key[1], flag_on_size([t[2] for t in chars], direction))) + grouped_chars.append((key[0], key[1], flag_font_size([t[2] for t in chars], direction))) else: gchars = [t[2].get_text() for t in chars] grouped_chars.append((key[0], key[1], ''.join(gchars).strip('\n'))) @@ -444,7 +444,7 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): return split_textline(table, t, direction, flag_size=flag_size), error else: if flag_size: - return [(r_idx, c_idx, flag_on_size(t._objs, direction))], error + return [(r_idx, c_idx, flag_font_size(t._objs, direction))], error else: return [(r_idx, c_idx, t.get_text().strip('\n'))], error @@ -474,27 +474,7 @@ def compute_accuracy(error_weights): return score -def remove_empty(d): - """ - - Parameters - ---------- - d - - Returns - ------- - - """ - for i, row in enumerate(d): - if row == [''] * len(row): - d.pop(i) - d = zip(*d) - d = [list(row) for row in d if any(row)] - d = zip(*d) - return d - - -def count_empty(d): +def count_empty_strings(d): """ Parameters @@ -529,6 +509,26 @@ def count_empty(d): return empty_p, r_nempty_cells, c_nempty_cells +def remove_empty_strings(d): + """ + + Parameters + ---------- + d + + Returns + ------- + + """ + for i, row in enumerate(d): + if row == [''] * len(row): + d.pop(i) + d = zip(*d) + d = [list(row) for row in d if any(row)] + d = zip(*d) + return d + + def encode_(ar): """