From 04383920b41bbdf461a4868df8054ebb477ab667 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sat, 8 Sep 2018 05:38:43 +0530 Subject: [PATCH] Rename parser keyword arguments --- camelot/core.py | 16 +++++----- camelot/image_processing.py | 26 ++++++++-------- camelot/parsers/lattice.py | 59 ++++++++++++++++++++----------------- camelot/parsers/stream.py | 36 +++++++++++----------- camelot/utils.py | 12 ++++---- 5 files changed, 77 insertions(+), 72 deletions(-) diff --git a/camelot/core.py b/camelot/core.py index 015c533..e3f9bb3 100644 --- a/camelot/core.py +++ b/camelot/core.py @@ -139,14 +139,14 @@ class Table(object): cell.left = cell.right = cell.top = cell.bottom = True return self - def set_edges(self, vertical, horizontal, jtol=2): + def set_edges(self, vertical, horizontal, joint_close_tol=2): """ Parameters ---------- vertical horizontal - jtol + joint_close_tol Returns ------- @@ -156,11 +156,11 @@ class Table(object): # find closest x coord # iterate over y coords and find closest start and end points i = [i for i, t in enumerate(self.cols) - if np.isclose(v[0], t[0], atol=jtol)] + if np.isclose(v[0], t[0], atol=joint_close_tol)] j = [j for j, t in enumerate(self.rows) - if np.isclose(v[3], t[0], atol=jtol)] + if np.isclose(v[3], t[0], atol=joint_close_tol)] k = [k for k, t in enumerate(self.rows) - if np.isclose(v[1], t[0], atol=jtol)] + if np.isclose(v[1], t[0], atol=joint_close_tol)] if not j: continue J = j[0] @@ -207,11 +207,11 @@ class Table(object): # find closest y coord # iterate over x coords and find closest start and end points i = [i for i, t in enumerate(self.rows) - if np.isclose(h[1], t[0], atol=jtol)] + if np.isclose(h[1], t[0], atol=joint_close_tol)] j = [j for j, t in enumerate(self.cols) - if np.isclose(h[0], t[0], atol=jtol)] + if np.isclose(h[0], t[0], atol=joint_close_tol)] k = [k for k, t in enumerate(self.cols) - if np.isclose(h[2], t[0], atol=jtol)] + if np.isclose(h[2], t[0], atol=joint_close_tol)] if not j: continue J = j[0] diff --git a/camelot/image_processing.py b/camelot/image_processing.py index a1526ef..bdd82fb 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -7,13 +7,13 @@ import numpy as np from .utils import merge_tuples -def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): +def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2): """ Parameters ---------- imagename - invert + process_background blocksize c @@ -24,7 +24,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): img = cv2.imread(imagename) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - if invert: + if process_background: threshold = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c) else: @@ -33,14 +33,14 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): return img, threshold -def find_lines(threshold, direction='horizontal', scale=15, iterations=0): +def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0): """ Parameters ---------- threshold direction - scale + line_size_scaling iterations Returns @@ -50,10 +50,10 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0): lines = [] if direction == 'vertical': - size = threshold.shape[0] // scale + size = threshold.shape[0] // line_size_scaling el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) elif direction == 'horizontal': - size = threshold.shape[1] // scale + size = threshold.shape[1] // line_size_scaling el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) elif direction is None: raise ValueError("Specify direction as either 'vertical' or" @@ -148,19 +148,19 @@ def find_table_joints(contours, vertical, horizontal): return tables -def remove_lines(threshold, line_scale=15): +def remove_lines(threshold, line_size_scaling=15): """ Parameters ---------- threshold - line_scale + line_size_scaling Returns ------- """ - size = threshold.shape[0] // line_scale + size = threshold.shape[0] // line_size_scaling vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) dilate_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) @@ -176,19 +176,19 @@ def remove_lines(threshold, line_scale=15): return threshold -def find_cuts(threshold, char_scale=200): +def find_cuts(threshold, char_size_scaling=200): """ Parameters ---------- threshold - char_scale + char_size_scaling Returns ------- """ - size = threshold.shape[0] // char_scale + size = threshold.shape[0] // char_size_scaling char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) threshold = cv2.erode(threshold, char_el) diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index bf4fca3..ba79230 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -10,7 +10,7 @@ import pandas as pd from .base import BaseParser from ..core import Table from ..utils import (scale_image, scale_pdf, segments_in_bbox, text_in_bbox, - merge_close_values, get_table_index, compute_accuracy, + merge_close_lines, get_table_index, compute_accuracy, count_empty_strings, encode_, setup_logging) from ..image_processing import (adaptive_threshold, find_lines, find_table_contours, find_table_joints) @@ -23,23 +23,24 @@ class Lattice(BaseParser): """ """ - def __init__(self, table_area=None, fill=None, mtol=2, jtol=2, - blocksize=15, threshold_constant=-2, scale=15, iterations=0, - invert=False, margins=(1.0, 0.5, 0.1), split_text=False, - flag_size=True, shift_text=['l', 't'], debug=None): + def __init__(self, table_area=None, process_background=False, + line_size_scaling=15, copy_text=None, shift_text=['l', 't'], + split_text=False, flag_size=True, line_close_tol=2, + joint_close_tol=2, blocksize=15, threshold_constant=-2, + iterations=0, margins=(1.0, 0.5, 0.1), debug=None): self.table_area = table_area - self.fill = fill - self.mtol = mtol - self.jtol = jtol - self.blocksize = blocksize - self.threshold_constant = threshold_constant - self.scale = scale - self.iterations = iterations - self.invert = invert - self.char_margin, self.line_margin, self.word_margin = margins + self.process_background = process_background + self.line_size_scaling = line_size_scaling + self.copy_text = copy_text + self.shift_text = shift_text self.split_text = split_text self.flag_size = flag_size - self.shift_text = shift_text + self.line_close_tol = line_close_tol + self.joint_close_tol = joint_close_tol + self.blocksize = blocksize + self.threshold_constant = threshold_constant + self.iterations = iterations + self.char_margin, self.line_margin, self.word_margin = margins self.debug = debug @staticmethod @@ -67,8 +68,8 @@ class Lattice(BaseParser): return indices @staticmethod - def _fill_spanning(t, fill=None): - for f in fill: + def _copy_spanning_text(t, copy_text=None): + for f in copy_text: if f == "h": for i in range(len(t.cells)): for j in range(len(t.cells[i])): @@ -96,7 +97,7 @@ class Lattice(BaseParser): stderr=subprocess.STDOUT) def _generate_table_bbox(self): - self.image, self.threshold = adaptive_threshold(self.imagename, invert=self.invert, + self.image, self.threshold = adaptive_threshold(self.imagename, process_background=self.process_background, blocksize=self.blocksize, c=self.threshold_constant) image_width = self.image.shape[1] image_height = self.image.shape[0] @@ -107,10 +108,12 @@ class Lattice(BaseParser): image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height) pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height) - vertical_mask, vertical_segments = find_lines(self.threshold, - direction='vertical', scale=self.scale, iterations=self.iterations) - horizontal_mask, horizontal_segments = find_lines(self.threshold, - direction='horizontal', scale=self.scale, iterations=self.iterations) + vertical_mask, vertical_segments = find_lines( + self.threshold, direction='vertical', + line_size_scaling=self.line_size_scaling, iterations=self.iterations) + horizontal_mask, horizontal_segments = find_lines( + self.threshold, direction='horizontal', + line_size_scaling=self.line_size_scaling, iterations=self.iterations) if self.table_area is not None: areas = [] @@ -149,8 +152,10 @@ class Lattice(BaseParser): cols.extend([tk[0], tk[2]]) rows.extend([tk[1], tk[3]]) # sort horizontal and vertical segments - cols = merge_close_values(sorted(cols), mtol=self.mtol) - rows = merge_close_values(sorted(rows, reverse=True), mtol=self.mtol) + cols = merge_close_lines( + sorted(cols), line_close_tol=self.line_close_tol) + rows = merge_close_lines( + sorted(rows, reverse=True), line_close_tol=self.line_close_tol) # make grid using x and y coord of shortlisted rows and cols cols = [(cols[i], cols[i + 1]) for i in range(0, len(cols) - 1)] @@ -167,7 +172,7 @@ class Lattice(BaseParser): table = Table(cols, rows) # set table edges to True using ver+hor lines - table = table.set_edges(v_s, h_s, jtol=self.jtol) + table = table.set_edges(v_s, h_s, joint_close_tol=self.joint_close_tol) # set spanning cells to True table = table.set_span() # set table border edges to True @@ -186,8 +191,8 @@ class Lattice(BaseParser): table.cells[r_idx][c_idx].text = text accuracy = compute_accuracy([[100, pos_errors]]) - if self.fill is not None: - table = Lattice._fill_spanning(table, fill=self.fill) + if self.copy_text is not None: + table = Lattice._copy_spanning_text(table, copy_text=self.copy_text) data = table.data data = encode_(data) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 1976505..12f4b6b 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -18,17 +18,17 @@ class Stream(BaseParser): """ """ - def __init__(self, table_area=None, columns=None, ytol=2, mtol=0, - margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, - debug=None): + def __init__(self, table_area=None, columns=None, split_text=False, + flag_size=True, row_close_tol=2, col_close_tol=0, + margins=(1.0, 0.5, 0.1), debug=None): self.table_area = table_area self.columns = columns self._validate_columns() - self.ytol = ytol - self.mtol = mtol - self.char_margin, self.line_margin, self.word_margin = margins self.split_text = split_text self.flag_size = flag_size + self.row_close_tol = row_close_tol + self.col_close_tol = col_close_tol + self.char_margin, self.line_margin, self.word_margin = margins self.debug = debug @staticmethod @@ -41,7 +41,7 @@ class Stream(BaseParser): return text_bbox @staticmethod - def _group_rows(text, ytol=2): + def _group_rows(text, row_close_tol=2): row_y = 0 rows = [] temp = [] @@ -50,7 +50,7 @@ class Stream(BaseParser): # if t.get_text().strip() and all([obj.upright for obj in t._objs if # type(obj) is LTChar]): if t.get_text().strip(): - if not np.isclose(row_y, t.y0, atol=ytol): + if not np.isclose(row_y, t.y0, atol=row_close_tol): rows.append(sorted(temp, key=lambda t: t.x0)) temp = [] row_y = t.y0 @@ -60,24 +60,24 @@ class Stream(BaseParser): return rows @staticmethod - def _merge_columns(l, mtol=0): + def _merge_columns(l, col_close_tol=0): merged = [] for higher in l: if not merged: merged.append(higher) else: lower = merged[-1] - if mtol >= 0: + if col_close_tol >= 0: if (higher[0] <= lower[1] or - np.isclose(higher[0], lower[1], atol=mtol)): + np.isclose(higher[0], lower[1], atol=col_close_tol)): upper_bound = max(lower[1], higher[1]) lower_bound = min(lower[0], higher[0]) merged[-1] = (lower_bound, upper_bound) else: merged.append(higher) - elif mtol < 0: + elif col_close_tol < 0: if higher[0] <= lower[1]: - if np.isclose(higher[0], lower[1], atol=abs(mtol)): + if np.isclose(higher[0], lower[1], atol=abs(col_close_tol)): merged.append(higher) else: upper_bound = max(lower[1], higher[1]) @@ -99,9 +99,9 @@ class Stream(BaseParser): return rows @staticmethod - def _add_columns(cols, text, ytol): + def _add_columns(cols, text, row_close_tol): if text: - text = Stream._group_rows(text, ytol=ytol) + text = Stream._group_rows(text, row_close_tol=row_close_tol) elements = [len(r) for r in text] new_cols = [(t.x0, t.x1) for r in text if len(r) == max(elements) for t in r] @@ -149,7 +149,7 @@ class Stream(BaseParser): self.t_bbox[direction].sort(key=lambda x: (-x.y0, x.x0)) text_x_min, text_y_min, text_x_max, text_y_max = self._text_bbox(self.t_bbox) - rows_grouped = self._group_rows(self.t_bbox['horizontal'], ytol=self.ytol) + rows_grouped = self._group_rows(self.t_bbox['horizontal'], row_close_tol=self.row_close_tol) rows = self._join_rows(rows_grouped, text_y_max, text_y_min) elements = [len(r) for r in rows_grouped] @@ -170,7 +170,7 @@ class Stream(BaseParser): os.path.basename(self.rootname))) cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r] - cols = self._merge_columns(sorted(cols), mtol=self.mtol) + cols = self._merge_columns(sorted(cols), col_close_tol=self.col_close_tol) inner_text = [] for i in range(1, len(cols)): left = cols[i - 1][1] @@ -182,7 +182,7 @@ class Stream(BaseParser): for t in self.t_bbox[direction] if t.x0 > cols[-1][1] or t.x1 < cols[0][0]] inner_text.extend(outer_text) - cols = self._add_columns(cols, inner_text, self.ytol) + cols = self._add_columns(cols, inner_text, self.row_close_tol) cols = self._join_columns(cols, text_x_min, text_x_max) return cols, rows diff --git a/camelot/utils.py b/camelot/utils.py index c957a4e..d132b5a 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -236,13 +236,13 @@ def text_in_bbox(bbox, text): return t_bbox -def remove_close_values(ar, mtol=2): +def remove_close_lines(ar, line_close_tol=2): """ Parameters ---------- ar - mtol + line_close_tol Returns ------- @@ -254,20 +254,20 @@ def remove_close_values(ar, mtol=2): ret.append(a) else: temp = ret[-1] - if np.isclose(temp, a, atol=mtol): + if np.isclose(temp, a, atol=line_close_tol): pass else: ret.append(a) return ret -def merge_close_values(ar, mtol=2): +def merge_close_lines(ar, line_close_tol=2): """ Parameters ---------- ar - mtol + line_close_tol Returns ------- @@ -279,7 +279,7 @@ def merge_close_values(ar, mtol=2): ret.append(a) else: temp = ret[-1] - if np.isclose(temp, a, atol=mtol): + if np.isclose(temp, a, atol=line_close_tol): temp = (temp + a) / 2.0 ret[-1] = temp else: