Remove docstrings

2018-09-05 19:04:40 +05:30
parent 08cbababca
commit bf63432494
2 changed files with 0 additions and 517 deletions
@@ -8,35 +8,6 @@ from .utils import merge_tuples


 def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
-    """Thresholds an image using OpenCV's adaptiveThreshold.
-
-    Parameters
-    ----------
-    imagename : string
-        Path to image file.
-
-    invert : bool
-        Whether or not to invert the image. Useful when pdfs have
-        tables with lines in background.
-        (optional, default: False)
-
-    blocksize : int
-        Size of a pixel neighborhood that is used to calculate a
-        threshold value for the pixel: 3, 5, 7, and so on.
-
-    c : float
-        Constant subtracted from the mean or weighted mean
-        of the pixel neighborhood. Normally, it is positive but may be
-        zero or negative as well.
-
-    Returns
-    -------
-    img : object
-        numpy.ndarray representing the original image.
-
-    threshold : object
-        numpy.ndarray representing the thresholded image.
-    """
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

@@ -50,38 +21,6 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):


 def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
-    """Finds horizontal and vertical lines by applying morphological
-    transformations on an image.
-
-    Parameters
-    ----------
-    threshold : object
-        numpy.ndarray representing the thresholded image.
-
-    direction : string
-        Specifies whether to find vertical or horizontal lines.
-        (default: 'horizontal')
-
-    scale : int
-        Used to divide the height/width to get a structuring element
-        for morph transform.
-        (optional, default: 15)
-
-    iterations : int
-        Number of iterations for dilation.
-        (optional, default: 0)
-
-    Returns
-    -------
-    dmask : object
-        numpy.ndarray representing pixels where vertical/horizontal
-        lines lie.
-
-    lines : list
-        List of tuples representing vertical/horizontal lines with
-        coordinates relative to a left-top origin in
-        OpenCV's coordinate space.
-    """
    lines = []

    if direction == 'vertical':
@@ -118,23 +57,6 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0):


 def find_table_contours(vertical, horizontal):
-    """Finds table boundaries using OpenCV's findContours.
-
-    Parameters
-    ----------
-    vertical : object
-        numpy.ndarray representing pixels where vertical lines lie.
-
-    horizontal : object
-        numpy.ndarray representing pixels where horizontal lines lie.
-
-    Returns
-    -------
-    cont : list
-        List of tuples representing table boundaries. Each tuple is of
-        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
-        h -> height in OpenCV's coordinate space.
-    """
    mask = vertical + horizontal

    try:
@@ -154,30 +76,6 @@ def find_table_contours(vertical, horizontal):


 def find_table_joints(contours, vertical, horizontal):
-    """Finds joints/intersections present inside each table boundary.
-
-    Parameters
-    ----------
-    contours : list
-        List of tuples representing table boundaries. Each tuple is of
-        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
-        h -> height in OpenCV's coordinate space.
-
-    vertical : object
-        numpy.ndarray representing pixels where vertical lines lie.
-
-    horizontal : object
-        numpy.ndarray representing pixels where horizontal lines lie.
-
-    Returns
-    -------
-    tables : dict
-        Dict with table boundaries as keys and list of intersections
-        in that boundary as their value.
-
-        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
-        and (x2, y2) -> rt in OpenCV's coordinate space.
-    """
    joints = np.bitwise_and(vertical, horizontal)
    tables = {}
    for c in contours:
@@ -202,23 +100,6 @@ def find_table_joints(contours, vertical, horizontal):


 def remove_lines(threshold, line_scale=15):
-    """Removes lines from a thresholded image.
-
-    Parameters
-    ----------
-    threshold : object
-        numpy.ndarray representing the thresholded image.
-
-    line_scale : int
-        Line scaling factor.
-        (optional, default: 15)
-
-    Returns
-    -------
-    threshold : object
-        numpy.ndarray representing the thresholded image
-        with horizontal and vertical lines removed.
-    """
    size = threshold.shape[0] // line_scale
    vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
@@ -236,22 +117,6 @@ def remove_lines(threshold, line_scale=15):


 def find_cuts(threshold, char_scale=200):
-    """Finds cuts made by text projections on y-axis.
-
-    Parameters
-    ----------
-    threshold : object
-        numpy.ndarray representing the thresholded image.
-
-    char_scale : int
-        Char scaling factor.
-        (optional, default: 200)
-
-    Returns
-    -------
-    y_cuts : list
-        List of cuts on y-axis.
-    """
    size = threshold.shape[0] // char_scale
    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))

@@ -19,61 +19,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,


 def translate(x1, x2):
-    """Translates x2 by x1.
-
-    Parameters
-    ----------
-    x1 : float
-
-    x2 : float
-
-    Returns
-    -------
-    x2 : float
-    """
    x2 += x1
    return x2


 def scale(x, s):
-    """Scales x by scaling factor s.
-
-    Parameters
-    ----------
-    x : float
-
-    s : float
-
-    Returns
-    -------
-    x : float
-    """
    x *= s
    return x


 def rotate(x1, y1, x2, y2, angle):
-    """Rotates point x2, y2 about point x1, y1 by angle.
-
-    Parameters
-    ----------
-    x1 : float
-
-    y1 : float
-
-    x2 : float
-
-    y2 : float
-
-    angle : float
-        Angle in radians.
-
-    Returns
-    -------
-    xnew : float
-
-    ynew : float
-    """
    s = np.sin(angle)
    c = np.cos(angle)
    x2 = translate(-x1, x2)
@@ -86,28 +41,6 @@ def rotate(x1, y1, x2, y2, angle):


 def scale_to_image(k, factors):
-    """Translates and scales PDFMiner coordinates to OpenCV's coordinate
-    space.
-
-    Parameters
-    ----------
-    k : tuple
-        Tuple (x1, y1, x2, y2) representing table bounding box where
-        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate
-        space.
-
-    factors : tuple
-        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
-        first two elements are scaling factors and pdf_y is height of
-        pdf.
-
-    Returns
-    -------
-    knew : tuple
-        Tuple (x1, y1, x2, y2) representing table bounding box where
-        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate
-        space.
-    """
    x1, y1, x2, y2 = k
    scaling_factor_x, scaling_factor_y, pdf_y = factors
    x1 = scale(x1, scaling_factor_x)
@@ -119,34 +52,6 @@ def scale_to_image(k, factors):


 def scale_to_pdf(tables, v_segments, h_segments, factors):
-    """Translates and scales OpenCV coordinates to PDFMiner's coordinate
-    space.
-
-    Parameters
-    ----------
-    tables : dict
-        Dict with table boundaries as keys and list of intersections
-        in that boundary as their value.
-
-    v_segments : list
-        List of vertical line segments.
-
-    h_segments : list
-        List of horizontal line segments.
-
-    factors : tuple
-        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
-        first two elements are scaling factors and img_y is height of
-        image.
-
-    Returns
-    -------
-    tables_new : dict
-
-    v_segments_new : dict
-
-    h_segments_new : dict
-    """
    scaling_factor_x, scaling_factor_y, img_y = factors
    tables_new = {}
    for k in tables.keys():
@@ -179,12 +84,6 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):


 def setup_logging(log_filepath):
-    """Setup logging
-    Args:
-        log_filepath (string): Path to log file
-    Returns:
-        logging.Logger: Logger object
-    """
    logger = logging.getLogger("app_logger")
    logger.setLevel(logging.DEBUG)
    # Log File Handler (Associating one log file per webservice run)
@@ -206,27 +105,6 @@ def setup_logging(log_filepath):


 def get_rotation(lttextlh, lttextlv, ltchar):
-    """Detects if text in table is vertical or not using the current
-    transformation matrix (CTM) and returns its orientation.
-
-    Parameters
-    ----------
-    lttextlh : list
-        List of PDFMiner LTTextLineHorizontal objects.
-
-    lttextlv : list
-        List of PDFMiner LTTextLineVertical objects.
-
-    ltchar : list
-        List of PDFMiner LTChar objects.
-
-    Returns
-    -------
-    rotation : string
-        {'', 'left', 'right'}
-        '' if text in table is upright, 'left' if rotated 90 degrees
-        anti-clockwise and 'right' if rotated 90 degrees clockwise.
-    """
    rotation = ''
    hlen = len([t for t in lttextlh if t.get_text().strip()])
    vlen = len([t for t in lttextlv if t.get_text().strip()])
@@ -238,29 +116,6 @@ def get_rotation(lttextlh, lttextlv, ltchar):


 def segments_bbox(bbox, v_segments, h_segments):
-    """Returns all line segments present inside a
-    table's bounding box.
-
-    Parameters
-    ----------
-    bbox : tuple
-        Tuple (x1, y1, x2, y2) representing table bounding box where
-        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
-
-    v_segments : list
-        List of vertical line segments.
-
-    h_segments : list
-        List of horizontal line segments.
-
-    Returns
-    -------
-    v_s : list
-        List of vertical line segments that lie inside table.
-
-    h_s : list
-        List of horizontal line segments that lie inside table.
-    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
@@ -271,23 +126,6 @@ def segments_bbox(bbox, v_segments, h_segments):


 def text_in_bbox(bbox, text):
-    """Returns all text objects present inside a
-    table's bounding box.
-
-    Parameters
-    ----------
-    bbox : tuple
-        Tuple (x1, y1, x2, y2) representing table bounding box where
-        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
-
-    text : list
-        List of PDFMiner text objects.
-
-    Returns
-    -------
-    t_bbox : list
-        List of PDFMiner text objects that lie inside table.
-    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
@@ -297,20 +135,6 @@ def text_in_bbox(bbox, text):


 def remove_close_values(ar, mtol=2):
-    """Removes values which are within a tolerance of mtol of another value
-    present in list.
-
-    Parameters
-    ----------
-    ar : list
-
-    mtol : int
-        (optional, default: 2)
-
-    Returns
-    -------
-    ret : list
-    """
    ret = []
    for a in ar:
        if not ret:
@@ -325,20 +149,6 @@ def remove_close_values(ar, mtol=2):


 def merge_close_values(ar, mtol=2):
-    """Merges values which are within a tolerance of mtol by calculating
-    a moving mean.
-
-    Parameters
-    ----------
-    ar : list
-
-    mtol : int
-        (optional, default: 2)
-
-    Returns
-    -------
-    ret : list
-    """
    ret = []
    for a in ar:
        if not ret:
@@ -354,22 +164,6 @@ def merge_close_values(ar, mtol=2):


 def flag_on_size(textline, direction):
-    """Flags a super/subscript by enclosing it with <s></s>. May give
-    false positives.
-
-    Parameters
-    ----------
-    textline : list
-        List of PDFMiner LTChar objects.
-
-    direction : string
-        {'horizontal', 'vertical'}
-        Direction of the PDFMiner LTTextLine object.
-
-    Returns
-    -------
-    fstring : string
-    """
    if direction == 'horizontal':
        d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
    elif direction == 'vertical':
@@ -396,33 +190,6 @@ def flag_on_size(textline, direction):


 def split_textline(table, textline, direction, flag_size=True):
-    """Splits PDFMiner LTTextLine into substrings if it spans across
-    multiple rows/columns.
-
-    Parameters
-    ----------
-    table : object
-        camelot.pdf.Pdf
-
-    textline : object
-        PDFMiner LTTextLine object.
-
-    direction : string
-        {'horizontal', 'vertical'}
-        Direction of the PDFMiner LTTextLine object.
-
-    flag_size : bool
-        Whether or not to highlight a substring using <s></s>
-        if its size is different from rest of the string, useful for
-        super and subscripts.
-        (optional, default: True)
-
-    Returns
-    -------
-    grouped_chars : list
-        List of tuples of the form (idx, text) where idx is the index
-        of row/column and text is an lttextline substring.
-    """
    idx = 0
    cut_text = []
    bbox = textline.bbox
@@ -474,47 +241,6 @@ def split_textline(table, textline, direction, flag_size=True):


 def get_table_index(table, t, direction, split_text=False, flag_size=True):
-    """Gets indices of the cell where given text object lies by
-    comparing their y and x-coordinates.
-
-    Parameters
-    ----------
-    table : object
-        camelot.table.Table
-
-    t : object
-        PDFMiner LTTextLine object.
-
-    direction : string
-        {'horizontal', 'vertical'}
-        Direction of the PDFMiner LTTextLine object.
-
-    split_text : bool
-        Whether or not to split a text line if it spans across
-        multiple cells.
-        (optional, default: False)
-
-    flag_size : bool
-        Whether or not to highlight a substring using <s></s>
-        if its size is different from rest of the string, useful for
-        super and subscripts.
-        (optional, default: True)
-
-    Returns
-    -------
-    indices : list
-        List of tuples of the form (idx, text) where idx is the index
-        of row/column and text is an lttextline substring.
-
-    error : float
-        Assignment error, percentage of text area that lies outside
-        a cell.
-        +-------+
-        |       |
-        |   [Text bounding box]
-        |       |
-        +-------+
-    """
    r_idx, c_idx = [-1] * 2
    for r in range(len(table.rows)):
        if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
@@ -558,20 +284,6 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):


 def compute_accuracy(error_weights):
-    """Calculates score based on weights assigned to various parameters,
-    and their error percentages.
-
-    Parameters
-    ----------
-    error_weights : list
-        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
-        where pn is the weight assigned to list of errors en.
-        Sum of pn should be equal to 100.
-
-    Returns
-    -------
-    score : float
-    """
    SCORE_VAL = 100
    try:
        score = 0
@@ -587,16 +299,6 @@ def compute_accuracy(error_weights):


 def remove_empty(d):
-    """Removes empty rows and columns from a two-dimensional list.
-
-    Parameters
-    ----------
-    d : list
-
-    Returns
-    -------
-    d : list
-    """
    for i, row in enumerate(d):
        if row == [''] * len(row):
            d.pop(i)
@@ -607,23 +309,6 @@ def remove_empty(d):


 def count_empty(d):
-    """Counts empty rows and columns in a two-dimensional list.
-
-    Parameters
-    ----------
-    d : list
-
-    Returns
-    -------
-    n_empty_rows : list
-        Number of empty rows.
-
-    n_empty_cols : list
-        Number of empty columns.
-
-    empty_p : float
-        Percentage of empty cells.
-    """
    empty_p = 0
    r_nempty_cells, c_nempty_cells = [], []
    for i in d:
@@ -649,41 +334,11 @@ def count_empty(d):


 def encode_(ar):
-    """Encodes list of text.
-
-    Parameters
-    ----------
-    ar : list
-
-    Returns
-    -------
-    ar : list
-    """
    ar = [[r.encode('utf-8') for r in row] for row in ar]
    return ar


 def get_text_objects(layout, ltype="char", t=None):
-    """Recursively parses pdf layout to get a list of
-    text objects.
-
-    Parameters
-    ----------
-    layout : object
-        PDFMiner LTPage object.
-
-    ltype : string
-        {'char', 'lh', 'lv'}
-        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
-        and LTTextLineVertical objects respectively.
-
-    t : list
-
-    Returns
-    -------
-    t : list
-        List of PDFMiner text objects.
-    """
    if ltype == "char":
        LTObject = LTChar
    elif ltype == "lh":
@@ -705,33 +360,6 @@ def get_text_objects(layout, ltype="char", t=None):

 def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
               detect_vertical=True, all_texts=True):
-    """Returns a PDFMiner LTPage object and page dimension of a single
-    page pdf. See https://euske.github.io/pdfminer/ to get definitions
-    of kwargs.
-
-    Parameters
-    ----------
-    pname : string
-        Path to pdf file.
-
-    char_margin : float
-
-    line_margin : float
-
-    word_margin : float
-
-    detect_vertical : bool
-
-    all_texts : bool
-
-    Returns
-    -------
-    layout : object
-        PDFMiner LTPage object.
-
-    dim : tuple
-        pdf page dimension of the form (width, height).
-    """
    with open(pname, 'r') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
@@ -755,16 +383,6 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,


 def merge_tuples(tuples):
-    """Merges a list of overlapping tuples.
-
-    Parameters
-    ----------
-    tuples : list
-
-    Returns
-    -------
-    merged : list
-    """
    merged = list(tuples[0])
    for s, e in tuples:
        if s <= merged[1]: