Remove docstrings

2018-09-05 19:04:40 +05:30 · 2018-09-05 19:04:40 +05:30 · bf63432494
parent 08cbababca
commit bf63432494
2 changed files with 0 additions and 517 deletions
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -8,35 +8,6 @@ from .utils import merge_tuples
 def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
    """Thresholds an image using OpenCV's adaptiveThreshold.
    Parameters
    ----------
    imagename : string
        Path to image file.
    invert : bool
        Whether or not to invert the image. Useful when pdfs have
        tables with lines in background.
        (optional, default: False)
    blocksize: int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
    c: float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
    Returns
    -------
    img : object
        numpy.ndarray representing the original image.
    threshold : object
        numpy.ndarray representing the thresholded image.
    """
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
@ -50,38 +21,6 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
 def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.
    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
    direction : string
        Specifies whether to find vertical or horizontal lines.
        (default: 'horizontal')
    scale : int
        Used to divide the height/width to get a structuring element
        for morph transform.
        (optional, default: 15)
    iterations : int
        Number of iterations for dilation.
        (optional, default: 0)
    Returns
    -------
    dmask : object
        numpy.ndarray representing pixels where vertical/horizontal
        lines lie.
    lines : list
        List of tuples representing vertical/horizontal lines with
        coordinates relative to a left-top origin in
        OpenCV's coordinate space.
    """
    lines = []
    if direction == 'vertical':
@ -118,23 +57,6 @@ def find_lines(threshold, direction='horizontal', scale=15, iterations=0):
 def find_table_contours(vertical, horizontal):
    """Finds table boundaries using OpenCV's findContours.
    Parameters
    ----------
    vertical : object
        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
    Returns
    -------
    cont : list
        List of tuples representing table boundaries. Each tuple is of
        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
        h -> height in OpenCV's coordinate space.
    """
    mask = vertical + horizontal
    try:
@ -154,30 +76,6 @@ def find_table_contours(vertical, horizontal):
 def find_table_joints(contours, vertical, horizontal):
    """Finds joints/intersections present inside each table boundary.
    Parameters
    ----------
    contours : list
        List of tuples representing table boundaries. Each tuple is of
        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
        h -> height in OpenCV's coordinate space.
    vertical : object
        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
    Returns
    -------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as their value.
        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
        and (x2, y2) -> rt in OpenCV's coordinate space.
    """
    joints = np.bitwise_and(vertical, horizontal)
    tables = {}
    for c in contours:
@ -202,23 +100,6 @@ def find_table_joints(contours, vertical, horizontal):
 def remove_lines(threshold, line_scale=15):
    """Removes lines from a thresholded image.
    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
    line_scale : int
        Line scaling factor.
        (optional, default: 15)
    Returns
    -------
    threshold : object
        numpy.ndarray representing the thresholded image
        with horizontal and vertical lines removed.
    """
    size = threshold.shape[0] // line_scale
    vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
@ -236,22 +117,6 @@ def remove_lines(threshold, line_scale=15):
 def find_cuts(threshold, char_scale=200):
    """Finds cuts made by text projections on y-axis.
    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.
    char_scale : int
        Char scaling factor.
        (optional, default: 200)
    Returns
    -------
    y_cuts : list
        List of cuts on y-axis.
    """
    size = threshold.shape[0] // char_scale
    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -19,61 +19,16 @@ from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
 def translate(x1, x2):
    """Translates x2 by x1.

    Parameters
    ----------
    x1 : float
        Offset to add.
    x2 : float
        Value to be translated.

    Returns
    -------
    x2 : float
        The sum ``x2 + x1``, returned as a new value; the ``x2``
        argument itself is never modified in place.

    """
    # Plain addition instead of ``+=``: augmented assignment would alter a
    # mutable argument (e.g. a numpy array) in the caller's scope.
    return x2 + x1
 def scale(x, s):
    """Scales x by scaling factor s.

    Parameters
    ----------
    x : float
        Value to be scaled.
    s : float
        Scaling factor.

    Returns
    -------
    x : float
        The product ``x * s``, returned as a new value; the ``x``
        argument itself is never modified in place.

    """
    # Plain multiplication instead of ``*=``: augmented assignment would
    # alter a mutable argument (e.g. a numpy array) in the caller's scope.
    return x * s
 def rotate(x1, y1, x2, y2, angle):
    """Rotates point x2, y2 about point x1, y1 by angle.
    Parameters
    ----------
    x1 : float
    y1 : float
    x2 : float
    y2 : float
    angle : float
        Angle in radians.
    Returns
    -------
    xnew : float
    ynew : float
    """
    s = np.sin(angle)
    c = np.cos(angle)
    x2 = translate(-x1, x2)
@ -86,28 +41,6 @@ def rotate(x1, y1, x2, y2, angle):
 def scale_to_image(k, factors):
    """Translates and scales PDFMiner coordinates to OpenCV's coordinate
    space.
    Parameters
    ----------
    k : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner's coordinate
        space.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
        first two elements are scaling factors and pdf_y is height of
        pdf.
    Returns
    -------
    knew : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV's coordinate
        space.
    """
    x1, y1, x2, y2 = k
    scaling_factor_x, scaling_factor_y, pdf_y = factors
    x1 = scale(x1, scaling_factor_x)
@ -119,34 +52,6 @@ def scale_to_image(k, factors):
 def scale_to_pdf(tables, v_segments, h_segments, factors):
    """Translates and scales OpenCV coordinates to PDFMiner's coordinate
    space.
    Parameters
    ----------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as their value.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
        first two elements are scaling factors and img_y is height of
        image.
    Returns
    -------
    tables_new : dict
    v_segments_new : dict
    h_segments_new : dict
    """
    scaling_factor_x, scaling_factor_y, img_y = factors
    tables_new = {}
    for k in tables.keys():
@ -179,12 +84,6 @@ def scale_to_pdf(tables, v_segments, h_segments, factors):
 def setup_logging(log_filepath):
    """Setup logging
    Args:
        log_filepath (string): Path to log file
    Returns:
        logging.Logger: Logger object
    """
    logger = logging.getLogger("app_logger")
    logger.setLevel(logging.DEBUG)
    # Log File Handler (Associating one log file per webservice run)
@ -206,27 +105,6 @@ def setup_logging(log_filepath):
 def get_rotation(lttextlh, lttextlv, ltchar):
    """Detects if text in table is vertical or not using the current
    transformation matrix (CTM) and returns its orientation.
    Parameters
    ----------
    lttextlh : list
        List of PDFMiner LTTextLineHorizontal objects.
    lttextlv : list
        List of PDFMiner LTTextLineVertical objects.
    ltchar : list
        List of PDFMiner LTChar objects.
    Returns
    -------
    rotation : string
        {'', 'left', 'right'}
        '' if text in table is upright, 'left' if rotated 90 degree
        anti-clockwise and 'right' if rotated 90 degree clockwise.
    """
    rotation = ''
    hlen = len([t for t in lttextlh if t.get_text().strip()])
    vlen = len([t for t in lttextlv if t.get_text().strip()])
@ -238,29 +116,6 @@ def get_rotation(lttextlh, lttextlv, ltchar):
 def segments_bbox(bbox, v_segments, h_segments):
    """Returns all line segments present inside a
    table's bounding box.
    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    Returns
    -------
    v_s : list
        List of vertical line segments that lie inside table.
    h_s : list
        List of horizontal line segments that lie inside table.
    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    v_s = [v for v in v_segments if v[1] > lb[1] - 2 and
@ -271,23 +126,6 @@ def segments_bbox(bbox, v_segments, h_segments):
 def text_in_bbox(bbox, text):
    """Returns all text objects present inside a
    table's bounding box.
    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner's coordinate space.
    text : list
        List of PDFMiner text objects.
    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table.
    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    t_bbox = [t for t in text if lb[0] - 2 <= (t.x0 + t.x1) / 2.0
@ -297,20 +135,6 @@ def text_in_bbox(bbox, text):
 def remove_close_values(ar, mtol=2):
    """Removes values which are within a tolerance of mtol of another value
    present in list.
    Parameters
    ----------
    ar : list
    mtol : int
        (optional, default: 2)
    Returns
    -------
    ret : list
    """
    ret = []
    for a in ar:
        if not ret:
@ -325,20 +149,6 @@ def remove_close_values(ar, mtol=2):
 def merge_close_values(ar, mtol=2):
    """Merges values which are within a tolerance of mtol by calculating
    a moving mean.
    Parameters
    ----------
    ar : list
    mtol : int
        (optional, default: 2)
    Returns
    -------
    ret : list
    """
    ret = []
    for a in ar:
        if not ret:
@ -354,22 +164,6 @@ def merge_close_values(ar, mtol=2):
 def flag_on_size(textline, direction):
    """Flags a super/subscript by enclosing it with <s></s>. May give
    false positives.
    Parameters
    ----------
    textline : list
        List of PDFMiner LTChar objects.
    direction : string
        {'horizontal', 'vertical'}
        Direction of the PDFMiner LTTextLine object.
    Returns
    -------
    fstring : string
    """
    if direction == 'horizontal':
        d = [(t.get_text(), np.round(t.height, decimals=6)) for t in textline if not isinstance(t, LTAnno)]
    elif direction == 'vertical':
@ -396,33 +190,6 @@ def flag_on_size(textline, direction):
 def split_textline(table, textline, direction, flag_size=True):
    """Splits PDFMiner LTTextLine into substrings if it spans across
    multiple rows/columns.
    Parameters
    ----------
    table : object
        camelot.pdf.Pdf
    textline : object
        PDFMiner LTTextLine object.
    direction : string
        {'horizontal', 'vertical'}
        Direction of the PDFMiner LTTextLine object.
    flag_size : bool
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
        (optional, default: True)
    Returns
    -------
    grouped_chars : list
        List of tuples of the form (idx, text) where idx is the index
        of row/column and text is an lttextline substring.
    """
    idx = 0
    cut_text = []
    bbox = textline.bbox
@ -474,47 +241,6 @@ def split_textline(table, textline, direction, flag_size=True):
 def get_table_index(table, t, direction, split_text=False, flag_size=True):
    """Gets indices of the cell where given text object lies by
    comparing their y and x-coordinates.
    Parameters
    ----------
    table : object
        camelot.table.Table
    t : object
        PDFMiner LTTextLine object.
    direction : string
        {'horizontal', 'vertical'}
        Direction of the PDFMiner LTTextLine object.
    split_text : bool
        Whether or not to split a text line if it spans across
        multiple cells.
        (optional, default: False)
    flag_size : bool
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string, useful for
        super and subscripts.
        (optional, default: True)
    Returns
    -------
    indices : list
        List of tuples of the form (idx, text) where idx is the index
        of row/column and text is an lttextline substring.
    error : float
        Assignment error, percentage of text area that lies outside
        a cell.
        +-------+
        |       |
        |   [Text bounding box]
        |       |
        +-------+
    """
    r_idx, c_idx = [-1] * 2
    for r in range(len(table.rows)):
        if ((t.y0 + t.y1) / 2.0 < table.rows[r][0] and
@ -558,20 +284,6 @@ def get_table_index(table, t, direction, split_text=False, flag_size=True):
 def compute_accuracy(error_weights):
    """Calculates score based on weights assigned to various parameters,
    and their error percentages.
    Parameters
    ----------
    error_weights : list
        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
        where pn is the weight assigned to list of errors en.
        Sum of pn should be equal to 100.
    Returns
    -------
    score : float
    """
    SCORE_VAL = 100
    try:
        score = 0
@ -587,16 +299,6 @@ def compute_accuracy(error_weights):
 def remove_empty(d):
    """Removes empty rows and columns from a two-dimensional list.
    Parameters
    ----------
    d : list
    Returns
    -------
    d : list
    """
    for i, row in enumerate(d):
        if row == [''] * len(row):
            d.pop(i)
@ -607,23 +309,6 @@ def remove_empty(d):
 def count_empty(d):
    """Counts empty rows and columns in a two-dimensional list.
    Parameters
    ----------
    d : list
    Returns
    -------
    n_empty_rows : list
        Number of empty rows.
    n_empty_cols : list
        Number of empty columns.
    empty_p : float
        Percentage of empty cells.
    """
    empty_p = 0
    r_nempty_cells, c_nempty_cells = [], []
    for i in d:
@ -649,41 +334,11 @@ def count_empty(d):
 def encode_(ar):
    """Encodes every string of a two-dimensional list as UTF-8.

    Parameters
    ----------
    ar : list
        List of rows, each row being a list of strings.

    Returns
    -------
    ar : list
        New list of rows holding the UTF-8 encoded strings.

    """
    encoded_rows = []
    for row in ar:
        encoded_cells = [cell.encode('utf-8') for cell in row]
        encoded_rows.append(encoded_cells)
    return encoded_rows
 def get_text_objects(layout, ltype="char", t=None):
    """Recursively parses pdf layout to get a list of
    text objects.
    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.
    ltype : string
        {'char', 'lh', 'lv'}
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
    t : list
    Returns
    -------
    t : list
        List of PDFMiner text objects.
    """
    if ltype == "char":
        LTObject = LTChar
    elif ltype == "lh":
@ -705,33 +360,6 @@ def get_text_objects(layout, ltype="char", t=None):
 def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
               detect_vertical=True, all_texts=True):
    """Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. See https://euske.github.io/pdfminer/ to get definitions
    of kwargs.
    Parameters
    ----------
    pname : string
        Path to pdf file.
    char_margin : float
    line_margin : float
    word_margin : float
    detect_vertical : bool
    all_texts : bool
    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        pdf page dimension of the form (width, height).
    """
    with open(pname, 'r') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
@ -755,16 +383,6 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
 def merge_tuples(tuples):
    """Merges a list of overlapping tuples.
    Parameters
    ----------
    tuples : list
    Returns
    -------
    merged : list
    """
    merged = list(tuples[0])
    for s, e in tuples:
        if s <= merged[1]: