Clean up notebooks, address review comments

* Improve explanations of network, hybrid, and lattice parsers
* Remove dead code from parser comparison notebook
* Clean up notebook variables to reduce size and make diffs cleaner
* Revert changes that were peripheral to the core changes

pull/153/head
parent 71805f9333
commit 42f8321c8c
@@ -1,10 +1,5 @@
 version = 1
 
-test_patterns = [
-  "tests/**",
-  "test_*.py"
-]
-
 exclude_patterns = [
   "camelot/ext/**"
 ]
@@ -4,10 +4,8 @@ __pycache__/
 
 build/
 dist/
-prof/
 *.egg-info/
 .eggs/
-.tox/
 .coverage
 coverage.xml
 

@@ -19,5 +17,3 @@ htmlcov/
 
 # vscode
 .vscode
-
-.DS_Store
@@ -1,3 +1,4 @@
+sudo: true
 language: python
 cache: pip
 addons:
@@ -4,11 +4,8 @@ import cv2
 import numpy as np
 
 
-def adaptive_threshold(
-        imagename, process_background=False,
-        blocksize=15, c=-2):
+def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.
 
     Parameters
     ----------
     imagename : string

@@ -18,31 +15,24 @@ def adaptive_threshold(
     blocksize : int, optional (default: 15)
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
 
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
 
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
 
     Returns
     -------
     img : object
         numpy.ndarray representing the original image.
     threshold : object
         numpy.ndarray representing the thresholded image.
 
     """
     img = cv2.imread(imagename)
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
 
     if process_background:
         threshold = cv2.adaptiveThreshold(
-            gray,
-            255,
-            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY, blocksize, c
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
         )
     else:
         threshold = cv2.adaptiveThreshold(
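For orientation while reviewing, a minimal sketch of how the helper above can be exercised on its own. The import path (camelot.image_processing) and the sample file name are assumptions; only the signature and the (img, threshold) return values come from the docstring in this diff.

from camelot.image_processing import adaptive_threshold  # assumed module path

# "page.png" is a hypothetical raster export of a PDF page.
img, threshold = adaptive_threshold(
    "page.png", process_background=False, blocksize=15, c=-2
)
# img is the original image, threshold the binarized version that the
# line-finding step below operates on.
print(img.shape, threshold.shape)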
@@ -57,12 +47,10 @@ def adaptive_threshold(
 
 
 def find_lines(
-    threshold, regions=None,
-    direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
 
     Parameters
     ----------
     threshold : object

@@ -76,14 +64,11 @@ def find_lines(
     line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
 
         The larger this value, smaller the detected lines. Making it
         too large will lead to text being detected as lines.
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
 
-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
 
     Returns
     -------
     dmask : object

@@ -93,7 +78,6 @@ def find_lines(
         List of tuples representing vertical/horizontal lines with
         coordinates relative to a left-top origin in
         image coordinate space.
 
     """
     lines = []
 

@@ -104,15 +88,13 @@
         size = threshold.shape[1] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
     elif direction is None:
-        raise ValueError(
-            "Specify direction as either 'vertical' or 'horizontal'"
-        )
+        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
 
     if regions is not None:
         region_mask = np.zeros(threshold.shape)
         for region in regions:
             x, y, w, h = region
-            region_mask[y:y + h, x:x + w] = 1
+            region_mask[y : y + h, x : x + w] = 1
         threshold = np.multiply(threshold, region_mask)
 
     threshold = cv2.erode(threshold, el)
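The hunk above also shows the arithmetic behind line_scale: the structuring element is as wide (or tall) as the page dimension divided by line_scale, so only pixel runs at least that long survive the erode/dilate pass. A standalone illustration with made-up numbers, independent of camelot:

import cv2
import numpy as np

# A blank 300x1700 "page" with a single long horizontal rule (illustrative only).
threshold = np.zeros((300, 1700), dtype=np.uint8)
threshold[150, 100:1500] = 255

line_scale = 15
size = threshold.shape[1] // line_scale  # 1700 // 15 == 113 px minimum line length
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))  # wide, 1-px-tall kernel

eroded = cv2.erode(threshold, el)   # shorter marks (e.g. text strokes) vanish here
restored = cv2.dilate(eroded, el)   # surviving lines are grown back to full length
print(size, int(restored.max()))    # 113 255 -> the long rule is still detected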
@@ -121,14 +103,12 @@
 
     try:
         _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )
     except ValueError:
         # for opencv backward compatibility
         contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
 
     for c in contours:
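The try/except kept intact above guards against the differing return arity of cv2.findContours: OpenCV 3.x returns (image, contours, hierarchy) while 2.x and 4.x return (contours, hierarchy). A small self-contained sketch of the same guard:

import cv2
import numpy as np

mask = np.zeros((100, 100), dtype=np.uint8)
cv2.rectangle(mask, (20, 20), (80, 80), 255, -1)  # one filled square -> one contour

try:
    # OpenCV 3.x: (image, contours, hierarchy)
    _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
    # OpenCV 2.x / 4.x: (contours, hierarchy)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

print(len(contours))  # 1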
@@ -145,21 +125,18 @@ def find_contours(vertical, horizontal):
 
 def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
 
     Parameters
     ----------
     vertical : object
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
 
     Returns
     -------
     cont : list
         List of tuples representing table boundaries. Each tuple is of
         the form (x, y, w, h) where (x, y) -> left-top, w -> width and
         h -> height in image coordinate space.
 
     """
     mask = vertical + horizontal
 

@@ -185,7 +162,6 @@ def find_contours(vertical, horizontal):
 
 def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
 
     Parameters
     ----------
     contours : list

@@ -196,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
 
     Returns
     -------
     tables : dict

@@ -204,13 +179,12 @@
         in that boundary as their value.
         Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
         and (x2, y2) -> rt in image coordinate space.
 
     """
     joints = np.multiply(vertical, horizontal)
     tables = {}
     for c in contours:
         x, y, w, h = c
-        roi = joints[y:y + h, x:x + w]
+        roi = joints[y : y + h, x : x + w]
         try:
             __, jc, __ = cv2.findContours(
                 roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
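Taken together, these four helpers form the image side of the Lattice-style pipeline: threshold the page, extract vertical and horizontal line masks, derive table boundaries from the combined masks, and key each boundary to the line intersections inside it. A sketch of how they chain, assuming the camelot.image_processing import path and a hypothetical page image; the signatures and return shapes are taken from the docstrings in this diff (find_lines is assumed to return the dilated mask followed by the line segments):

from camelot.image_processing import (  # assumed module path
    adaptive_threshold,
    find_contours,
    find_joints,
    find_lines,
)

# "page.png" is a hypothetical raster export of a PDF page.
img, threshold = adaptive_threshold("page.png", process_background=False)

# Line masks and segments for each direction.
v_mask, v_segments = find_lines(threshold, direction="vertical", line_scale=15)
h_mask, h_segments = find_lines(threshold, direction="horizontal", line_scale=15)

# Table boundaries from the combined masks, then the joints inside each boundary.
contours = find_contours(v_mask, h_mask)
tables = find_joints(contours, v_mask, h_mask)

for (x1, y1, x2, y2), joints in tables.items():
    print((x1, y1, x2, y2), len(joints))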
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long