Clean up notebooks, address review comments

* Improve explanations of network, hybrid, and lattice parsers
* Remove dead code from parser comparison notebook
* Clean up notebook variables to reduce size and make diffs cleaner
* Revert changes that were peripheral to the core changes
2020-07-03 18:28:24 -07:00 · 2020-07-03 18:28:24 -07:00 · 42f8321c8c
parent 71805f9333
commit 42f8321c8c
6 changed files with 322 additions and 331 deletions
--- a/.deepsource.toml
+++ b/.deepsource.toml
@ -1,10 +1,5 @@
 version = 1

-test_patterns = [
-  "tests/**",
-  "test_*.py"
-]
-
 exclude_patterns = [
  "camelot/ext/**"
 ]
--- a/.gitignore
+++ b/.gitignore
@ -4,10 +4,8 @@ __pycache__/

 build/
 dist/
-prof/
 *.egg-info/
 .eggs/
-.tox/
 .coverage
 coverage.xml

@ -18,6 +16,4 @@ _build/
 htmlcov/

 # vscode
-.vscode
-
-.DS_Store
+.vscode
--- a/.travis.yml
+++ b/.travis.yml
@ -1,3 +1,4 @@
+sudo: true
 language: python
 cache: pip
 addons:
--- a/camelot/image_processing.py
+++ b/camelot/image_processing.py
@ -4,11 +4,8 @@ import cv2
 import numpy as np


-def adaptive_threshold(
-        imagename, process_background=False,
-        blocksize=15, c=-2):
+def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
    """Thresholds an image using OpenCV's adaptiveThreshold.
-
    Parameters
    ----------
    imagename : string
@ -18,31 +15,24 @@ def adaptive_threshold(
    blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
-
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    c : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
-
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
-
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    Returns
    -------
    img : object
        numpy.ndarray representing the original image.
    threshold : object
        numpy.ndarray representing the thresholded image.
-
    """
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    if process_background:
        threshold = cv2.adaptiveThreshold(
-            gray,
-            255,
-            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY, blocksize, c
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
        )
    else:
        threshold = cv2.adaptiveThreshold(
@ -57,12 +47,10 @@ def adaptive_threshold(


 def find_lines(
-    threshold, regions=None,
-    direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
 ):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.
-
    Parameters
    ----------
    threshold : object
@ -76,14 +64,11 @@ def find_lines(
    line_scale : int, optional (default: 15)
        Factor by which the page dimensions will be divided to get
        smallest length of lines that should be detected.
-
        The larger this value, smaller the detected lines. Making it
        too large will lead to text being detected as lines.
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.
-
-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa
-
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    Returns
    -------
    dmask : object
@ -93,7 +78,6 @@ def find_lines(
        List of tuples representing vertical/horizontal lines with
        coordinates relative to a left-top origin in
        image coordinate space.
-
    """
    lines = []

@ -104,15 +88,13 @@ def find_lines(
        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
-        raise ValueError(
-            "Specify direction as either 'vertical' or 'horizontal'"
-        )
+        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")

    if regions is not None:
        region_mask = np.zeros(threshold.shape)
        for region in regions:
            x, y, w, h = region
-            region_mask[y:y + h, x:x + w] = 1
+            region_mask[y : y + h, x : x + w] = 1
        threshold = np.multiply(threshold, region_mask)

    threshold = cv2.erode(threshold, el)
@ -121,14 +103,12 @@ def find_lines(

    try:
        _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
    except ValueError:
        # for opencv backward compatibility
        contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

    for c in contours:
@ -145,21 +125,18 @@ def find_lines(

 def find_contours(vertical, horizontal):
    """Finds table boundaries using OpenCV's findContours.
-
    Parameters
    ----------
    vertical : object
        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
-
    Returns
    -------
    cont : list
        List of tuples representing table boundaries. Each tuple is of
        the form (x, y, w, h) where (x, y) -> left-top, w -> width and
        h -> height in image coordinate space.
-
    """
    mask = vertical + horizontal

@ -185,7 +162,6 @@ def find_contours(vertical, horizontal):

 def find_joints(contours, vertical, horizontal):
    """Finds joints/intersections present inside each table boundary.
-
    Parameters
    ----------
    contours : list
@ -196,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
        numpy.ndarray representing pixels where vertical lines lie.
    horizontal : object
        numpy.ndarray representing pixels where horizontal lines lie.
-
    Returns
    -------
    tables : dict
@ -204,13 +179,12 @@ def find_joints(contours, vertical, horizontal):
        in that boundary as their value.
        Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
        and (x2, y2) -> rt in image coordinate space.
-
    """
    joints = np.multiply(vertical, horizontal)
    tables = {}
    for c in contours:
        x, y, w, h = c
-        roi = joints[y:y + h, x:x + w]
+        roi = joints[y : y + h, x : x + w]
        try:
            __, jc, __ = cv2.findContours(
                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
@ -229,4 +203,4 @@ def find_joints(contours, vertical, horizontal):
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

-    return tables
+    return tables
--- a/notebook-hybrid-parser.ipynb
+++ b/notebook-hybrid-parser.ipynb
--- a/parser-comparison-notebook.ipynb
+++ b/parser-comparison-notebook.ipynb