Clean up notebooks, address review comments

* Improve explanations of network, hybrid, and lattice parsers
* Remove dead code from parser comparison notebook
* Clean up notebook variables to reduce notebook size and make diffs cleaner
* Revert changes that were peripheral to the core changes
pull/153/head
Frh 2020-07-03 18:28:24 -07:00
parent 71805f9333
commit 42f8321c8c
6 changed files with 322 additions and 331 deletions

@@ -1,10 +1,5 @@
 version = 1
-test_patterns = [
-  "tests/**",
-  "test_*.py"
-]
 exclude_patterns = [
   "camelot/ext/**"
 ]

.gitignore

@@ -4,10 +4,8 @@ __pycache__/
 build/
 dist/
-prof/
 *.egg-info/
 .eggs/
-.tox/
 .coverage
 coverage.xml
@@ -19,5 +17,3 @@ htmlcov/
 # vscode
 .vscode
-.DS_Store

@@ -1,3 +1,4 @@
+sudo: true
 language: python
 cache: pip
 addons:

@@ -4,11 +4,8 @@ import cv2
 import numpy as np
-def adaptive_threshold(
-    imagename, process_background=False,
-    blocksize=15, c=-2):
+def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.
     Parameters
     ----------
     imagename : string
@@ -18,31 +15,24 @@ def adaptive_threshold(
     blocksize : int, optional (default: 15)
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.  # noqa
     Returns
     -------
     img : object
         numpy.ndarray representing the original image.
     threshold : object
         numpy.ndarray representing the thresholded image.
     """
     img = cv2.imread(imagename)
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
     if process_background:
         threshold = cv2.adaptiveThreshold(
-            gray,
-            255,
-            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY, blocksize, c
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
         )
     else:
         threshold = cv2.adaptiveThreshold(
@@ -57,12 +47,10 @@ def adaptive_threshold(
 def find_lines(
-    threshold, regions=None,
-    direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
     Parameters
     ----------
     threshold : object
@@ -76,14 +64,11 @@ def find_lines(
     line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
         The larger this value, smaller the detected lines. Making it
         too large will lead to text being detected as lines.
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.  # noqa
     Returns
     -------
     dmask : object
@@ -93,7 +78,6 @@ def find_lines(
         List of tuples representing vertical/horizontal lines with
         coordinates relative to a left-top origin in
         image coordinate space.
     """
     lines = []
@@ -104,9 +88,7 @@ def find_lines(
         size = threshold.shape[1] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
     elif direction is None:
-        raise ValueError(
-            "Specify direction as either 'vertical' or 'horizontal'"
-        )
+        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
     if regions is not None:
         region_mask = np.zeros(threshold.shape)
@@ -121,14 +103,12 @@ def find_lines(
     try:
         _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )
     except ValueError:
         # for opencv backward compatibility
         contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )
     for c in contours:
@@ -145,21 +125,18 @@ def find_lines(
 def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
     Parameters
     ----------
     vertical : object
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
     Returns
     -------
     cont : list
         List of tuples representing table boundaries. Each tuple is of
         the form (x, y, w, h) where (x, y) -> left-top, w -> width and
         h -> height in image coordinate space.
     """
     mask = vertical + horizontal
@@ -185,7 +162,6 @@ def find_contours(vertical, horizontal):
 def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
     Parameters
     ----------
     contours : list
@@ -196,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
     Returns
     -------
     tables : dict
@@ -204,7 +179,6 @@ def find_joints(contours, vertical, horizontal):
         in that boundary as their value.
         Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
         and (x2, y2) -> rt in image coordinate space.
     """
     joints = np.multiply(vertical, horizontal)
     tables = {}
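
For reference, the helpers touched above are the image-based line and joint detection used by camelot's Lattice parser. A minimal usage sketch, not part of this commit: the page image path is made up, and the import path assumes these functions are exposed from camelot's image_processing module.

# Sketch only: chain the image-processing helpers on a rendered page image.
from camelot.image_processing import (
    adaptive_threshold,
    find_lines,
    find_contours,
    find_joints,
)

# Binarize the page image; returns the original image and the thresholded mask.
img, threshold = adaptive_threshold("page-1.png", process_background=False)

# Detect line segments in each direction with morphological transformations.
vertical_mask, vertical_segments = find_lines(threshold, direction="vertical")
horizontal_mask, horizontal_segments = find_lines(threshold, direction="horizontal")

# Table boundaries come from the combined line masks,
# joints (line intersections) from within each boundary.
contours = find_contours(vertical_mask, horizontal_mask)
tables = find_joints(contours, vertical_mask, horizontal_mask)

for (x1, y1, x2, y2), joints in tables.items():
    print(f"table bbox ({x1}, {y1}, {x2}, {y2}): {len(joints)} joints")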

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long