From bf63432494fd604f98eaf7c7320e4163212d1adf Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 5 Sep 2018 19:04:40 +0530 Subject: [PATCH] Remove docstrings --- camelot/image_processing.py | 135 ------------- camelot/utils.py | 382 ------------------------------------ 2 files changed, 517 deletions(-) diff --git a/camelot/image_processing.py b/camelot/image_processing.py index 1621bea..fc284e4 100644 --- a/camelot/image_processing.py +++ b/camelot/image_processing.py @@ -8,35 +8,6 @@ from .utils import merge_tuples def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): - """Thresholds an image using OpenCV's adaptiveThreshold. - - Parameters - ---------- - imagename : string - Path to image file. - - invert : bool - Whether or not to invert the image. Useful when pdfs have - tables with lines in background. - (optional, default: False) - - blocksize: int - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - - c: float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. - - Returns - ------- - img : object - numpy.ndarray representing the original image. - - threshold : object - numpy.ndarray representing the thresholded image. - """ img = cv2.imread(imagename) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) @@ -50,38 +21,6 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): def find_lines(threshold, direction='horizontal', scale=15, iterations=0): - """Finds horizontal and vertical lines by applying morphological - transformations on an image. - - Parameters - ---------- - threshold : object - numpy.ndarray representing the thresholded image. - - direction : string - Specifies whether to find vertical or horizontal lines. - (default: 'horizontal') - - scale : int - Used to divide the height/width to get a structuring element - for morph transform. - (optional, default: 15) - - iterations : int - Number of iterations for dilation. - (optional, default: 2) - - Returns - ------- - dmask : object - numpy.ndarray representing pixels where vertical/horizontal - lines lie. - - lines : list - List of tuples representing vertical/horizontal lines with - coordinates relative to a left-top origin in - OpenCV's coordinate space. - """ lines = [] if direction == 'vertical': @@ -118,23 +57,6 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0): def find_table_contours(vertical, horizontal): - """Finds table boundaries using OpenCV's findContours. - - Parameters - ---------- - vertical : object - numpy.ndarray representing pixels where vertical lines lie. - - horizontal : object - numpy.ndarray representing pixels where horizontal lines lie. - - Returns - ------- - cont : list - List of tuples representing table boundaries. Each tuple is of - the form (x, y, w, h) where (x, y) -> left-top, w -> width and - h -> height in OpenCV's coordinate space. - """ mask = vertical + horizontal try: @@ -154,30 +76,6 @@ def find_table_contours(vertical, horizontal): def find_table_joints(contours, vertical, horizontal): - """Finds joints/intersections present inside each table boundary. - - Parameters - ---------- - contours : list - List of tuples representing table boundaries. Each tuple is of - the form (x, y, w, h) where (x, y) -> left-top, w -> width and - h -> height in OpenCV's coordinate space. - - vertical : object - numpy.ndarray representing pixels where vertical lines lie. - - horizontal : object - numpy.ndarray representing pixels where horizontal lines lie. - - Returns - ------- - tables : dict - Dict with table boundaries as keys and list of intersections - in that boundary as their value. - - Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb - and (x2, y2) -> rt in OpenCV's coordinate space. - """ joints = np.bitwise_and(vertical, horizontal) tables = {} for c in contours: @@ -202,23 +100,6 @@ def find_table_joints(contours, vertical, horizontal): def remove_lines(threshold, line_scale=15): - """Removes lines from a thresholded image. - - Parameters - ---------- - threshold : object - numpy.ndarray representing the thresholded image. - - line_scale : int - Line scaling factor. - (optional, default: 15) - - Returns - ------- - threshold : object - numpy.ndarray representing the thresholded image - with horizontal and vertical lines removed. - """ size = threshold.shape[0] // line_scale vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) @@ -236,22 +117,6 @@ def remove_lines(threshold, line_scale=15): def find_cuts(threshold, char_scale=200): - """Finds cuts made by text projections on y-axis. - - Parameters - ---------- - threshold : object - numpy.ndarray representing the thresholded image. - - char_scale : int - Char scaling factor. - (optional, default: 200) - - Returns - ------- - y_cuts : list - List of cuts on y-axis. - """ size = threshold.shape[0] // char_scale char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) diff --git a/camelot/utils.py b/camelot/utils.py index 650e62a..df82a8a 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -19,61 +19,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal, def translate(x1, x2): - """Translates x2 by x1. - - Parameters - ---------- - x1 : float - - x2 : float - - Returns - ------- - x2 : float - """ x2 += x1 return x2 def scale(x, s): - """Scales x by scaling factor s. - - Parameters - ---------- - x : float - - s : float - - Returns - ------- - x : float - """ x *= s return x def rotate(x1, y1, x2, y2, angle): - """Rotates point x2, y2 about point x1, y1 by angle. - - Parameters - ---------- - x1 : float - - y1 : float - - x2 : float - - y2 : float - - angle : float - Angle in radians. - - Returns - ------- - xnew : float - - ynew : float - """ s = np.sin(angle) c = np.cos(angle) x2 = translate(-x1, x2) @@ -86,28 +41,6 @@ def rotate(x1, y1, x2, y2, angle): def scale_to_image(k, factors): - """Translates and scales PDFMiner coordinates to OpenCV's coordinate - space. - - Parameters - ---------- - k : tuple - Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate - space. - - factors : tuple - Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the - first two elements are scaling factors and pdf_y is height of - pdf. - - Returns - ------- - knew : tuple - Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate - space. - """ x1, y1, x2, y2 = k scaling_factor_x, scaling_factor_y, pdf_y = factors x1 = scale(x1, scaling_factor_x) @@ -119,34 +52,6 @@ def scale_to_image(k, factors): def scale_to_pdf(tables, v_segments, h_segments, factors): - """Translates and scales OpenCV coordinates to PDFMiner's coordinate - space. - - Parameters - ---------- - tables : dict - Dict with table boundaries as keys and list of intersections - in that boundary as their value. - - v_segments : list - List of vertical line segments. - - h_segments : list - List of horizontal line segments. - - factors : tuple - Tuple (scaling_factor_x, scaling_factor_y, img_y) where the - first two elements are scaling factors and img_y is height of - image. - - Returns - ------- - tables_new : dict - - v_segments_new : dict - - h_segments_new : dict - """ scaling_factor_x, scaling_factor_y, img_y = factors tables_new = {} for k in tables.keys(): @@ -179,12 +84,6 @@ def scale_to_pdf(tables, v_segments, h_segments, factors): def setup_logging(log_filepath): - """Setup logging - Args: - log_filepath (string): Path to log file - Returns: - logging.Logger: Logger object - """ logger = logging.getLogger("app_logger") logger.setLevel(logging.DEBUG) # Log File Handler (Associating one log file per webservice run) @@ -206,27 +105,6 @@ def setup_logging(log_filepath): def get_rotation(lttextlh, lttextlv, ltchar): - """Detects if text in table is vertical or not using the current - transformation matrix (CTM) and returns its orientation. - - Parameters - ---------- - lttextlh : list - List of PDFMiner LTTextLineHorizontal objects. - - lttextlv : list - List of PDFMiner LTTextLineVertical objects. - - ltchar : list - List of PDFMiner LTChar objects. - - Returns - ------- - rotation : string - {'', 'left', 'right'} - '' if text in table is upright, 'left' if rotated 90 degree - anti-clockwise and 'right' if rotated 90 degree clockwise. - """ rotation = '' hlen = len([t for t in lttextlh if t.get_text().strip()]) vlen = len([t for t in lttextlv if t.get_text().strip()]) @@ -238,29 +116,6 @@ def get_rotation(lttextlh, lttextlv, ltchar): def segments_bbox(bbox, v_segments, h_segments): - """Returns all line segments present inside a - table's bounding box. - - Parameters - ---------- - bbox : tuple - Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space. - - v_segments : list - List of vertical line segments. - - h_segments : list - List of vertical horizontal segments. - - Returns - ------- - v_s : list - List of vertical line segments that lie inside table. - - h_s : list - List of horizontal line segments that lie inside table. - """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) v_s = [v for v in v_segments if v[1] > lb[1] - 2 and @@ -271,23 +126,6 @@ def segments_bbox(bbox, v_segments, h_segments): def text_in_bbox(bbox, text): - """Returns all text objects present inside a - table's bounding box. - - Parameters - ---------- - bbox : tuple - Tuple (x1, y1, x2, y2) representing table bounding box where - (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space. - - text : list - List of PDFMiner text objects. - - Returns - ------- - t_bbox : list - List of PDFMiner text objects that lie inside table. - """ lb = (bbox[0], bbox[1]) rt = (bbox[2], bbox[3]) t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 @@ -297,20 +135,6 @@ def text_in_bbox(bbox, text): def remove_close_values(ar, mtol=2): - """Removes values which are within a tolerance of mtol of another value - present in list. - - Parameters - ---------- - ar : list - - mtol : int - (optional, default: 2) - - Returns - ------- - ret : list - """ ret = [] for a in ar: if not ret: @@ -325,20 +149,6 @@ def remove_close_values(ar, mtol=2): def merge_close_values(ar, mtol=2): - """Merges values which are within a tolerance of mtol by calculating - a moving mean. - - Parameters - ---------- - ar : list - - mtol : int - (optional, default: 2) - - Returns - ------- - ret : list - """ ret = [] for a in ar: if not ret: @@ -354,22 +164,6 @@ def merge_close_values(ar, mtol=2): def flag_on_size(textline, direction): - """Flags a super/subscript by enclosing it with . May give - false positives. - - Parameters - ---------- - textline : list - List of PDFMiner LTChar objects. - - direction : string - {'horizontal', 'vertical'} - Direction of the PDFMiner LTTextLine object. - - Returns - ------- - fstring : string - """ if direction == 'horizontal': d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)] elif direction == 'vertical': @@ -396,33 +190,6 @@ def flag_on_size(textline, direction): def split_textline(table, textline, direction, flag_size=True): - """Splits PDFMiner LTTextLine into substrings if it spans across - multiple rows/columns. - - Parameters - ---------- - table : object - camelot.pdf.Pdf - - textline : object - PDFMiner LTTextLine object. - - direction : string - {'horizontal', 'vertical'} - Direction of the PDFMiner LTTextLine object. - - flag_size : bool - Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. - (optional, default: True) - - Returns - ------- - grouped_chars : list - List of tuples of the form (idx, text) where idx is the index - of row/column and text is the an lttextline substring. - """ idx = 0 cut_text = [] bbox = textline.bbox @@ -474,47 +241,6 @@ def split_textline(table, textline, direction, flag_size=True): def get_table_index(table, t, direction, split_text=False, flag_size=True): - """Gets indices of the cell where given text object lies by - comparing their y and x-coordinates. - - Parameters - ---------- - table : object - camelot.table.Table - - t : object - PDFMiner LTTextLine object. - - direction : string - {'horizontal', 'vertical'} - Direction of the PDFMiner LTTextLine object. - - split_text : bool - Whether or not to split a text line if it spans across - multiple cells. - (optional, default: False) - - flag_size : bool - Whether or not to highlight a substring using - if its size is different from rest of the string, useful for - super and subscripts. - (optional, default: True) - - Returns - ------- - indices : list - List of tuples of the form (idx, text) where idx is the index - of row/column and text is the an lttextline substring. - - error : float - Assignment error, percentage of text area that lies outside - a cell. - +-------+ - | | - | [Text bounding box] - | | - +-------+ - """ r_idx, c_idx = [-1] * 2 for r in range(len(table.rows)): if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and @@ -558,20 +284,6 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True): def compute_accuracy(error_weights): - """Calculates score based on weights assigned to various parameters, - and their error percentages. - - Parameters - ---------- - error_weights : list - Two-dimensional list of the form [[p1, e1], [p2, e2], ...] - where pn is the weight assigned to list of errors en. - Sum of pn should be equal to 100. - - Returns - ------- - score : float - """ SCORE_VAL = 100 try: score = 0 @@ -587,16 +299,6 @@ def compute_accuracy(error_weights): def remove_empty(d): - """Removes empty rows and columns from a two-dimensional list. - - Parameters - ---------- - d : list - - Returns - ------- - d : list - """ for i, row in enumerate(d): if row == [''] * len(row): d.pop(i) @@ -607,23 +309,6 @@ def remove_empty(d): def count_empty(d): - """Counts empty rows and columns in a two-dimensional list. - - Parameters - ---------- - d : list - - Returns - ------- - n_empty_rows : list - Number of empty rows. - - n_empty_cols : list - Number of empty columns. - - empty_p : float - Percentage of empty cells. - """ empty_p = 0 r_nempty_cells, c_nempty_cells = [], [] for i in d: @@ -649,41 +334,11 @@ def count_empty(d): def encode_(ar): - """Encodes list of text. - - Parameters - ---------- - ar : list - - Returns - ------- - ar : list - """ ar = [[r.encode('utf-8') for r in row] for row in ar] return ar def get_text_objects(layout, ltype="char", t=None): - """Recursively parses pdf layout to get a list of - text objects. - - Parameters - ---------- - layout : object - PDFMiner LTPage object. - - ltype : string - {'char', 'lh', 'lv'} - Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal, - and LTTextLineVertical objects respectively. - - t : list - - Returns - ------- - t : list - List of PDFMiner text objects. - """ if ltype == "char": LTObject = LTChar elif ltype == "lh": @@ -705,33 +360,6 @@ def get_text_objects(layout, ltype="char", t=None): def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, detect_vertical=True, all_texts=True): - """Returns a PDFMiner LTPage object and page dimension of a single - page pdf. See https://euske.github.io/pdfminer/ to get definitions - of kwargs. - - Parameters - ---------- - pname : string - Path to pdf file. - - char_margin : float - - line_margin : float - - word_margin : float - - detect_vertical : bool - - all_texts : bool - - Returns - ------- - layout : object - PDFMiner LTPage object. - - dim : tuple - pdf page dimension of the form (width, height). - """ with open(pname, 'r') as f: parser = PDFParser(f) document = PDFDocument(parser) @@ -755,16 +383,6 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, def merge_tuples(tuples): - """Merges a list of overlapping tuples. - - Parameters - ---------- - tuples : list - - Returns - ------- - merged : list - """ merged = list(tuples[0]) for s, e in tuples: if s <= merged[1]: