Clean up notebooks, address review comments

* Improve explanations of network, hybrid, and lattice parsers
* Remove dead code from parser comparison notebook
* Clean up notebook variables to reduce size and make diffs cleaner
* Revert changes that were peripheral to the core changes

pull/153/head
parent 71805f9333
commit 42f8321c8c
@@ -1,10 +1,5 @@
 version = 1
 
-test_patterns = [
-  "tests/**",
-  "test_*.py"
-]
-
 exclude_patterns = [
   "camelot/ext/**"
 ]
@@ -4,10 +4,8 @@ __pycache__/
 
 build/
 dist/
-prof/
 *.egg-info/
 .eggs/
-.tox/
 .coverage
 coverage.xml
 

@@ -19,5 +17,3 @@ htmlcov/
 
 # vscode
 .vscode
-
-.DS_Store
@@ -1,3 +1,4 @@
+sudo: true
 language: python
 cache: pip
 addons:
@@ -4,11 +4,8 @@ import cv2
 import numpy as np
 
 
-def adaptive_threshold(
-        imagename, process_background=False,
-        blocksize=15, c=-2):
+def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.
 
     Parameters
     ----------
     imagename : string

@@ -18,31 +15,24 @@ def adaptive_threshold(
     blocksize : int, optional (default: 15)
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.
 
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.
 
-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
 
     Returns
     -------
     img : object
         numpy.ndarray representing the original image.
     threshold : object
         numpy.ndarray representing the thresholded image.
 
     """
     img = cv2.imread(imagename)
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
 
     if process_background:
         threshold = cv2.adaptiveThreshold(
-            gray,
-            255,
-            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY, blocksize, c
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
         )
     else:
         threshold = cv2.adaptiveThreshold(
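For orientation while reviewing, a minimal sketch of how the helper above can be exercised on its own. The import path (camelot.image_processing) and the sample file name are assumptions; only the signature and the (img, threshold) return values come from the docstring in this diff.

from camelot.image_processing import adaptive_threshold  # assumed module path

# "page.png" is a hypothetical raster export of a PDF page.
img, threshold = adaptive_threshold(
    "page.png", process_background=False, blocksize=15, c=-2
)
# img is the original image, threshold the binarized version that the
# line-finding step below operates on.
print(img.shape, threshold.shape)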
@@ -57,12 +47,10 @@ def adaptive_threshold(
 
 
 def find_lines(
-    threshold, regions=None,
-    direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
 
     Parameters
     ----------
     threshold : object

@@ -76,14 +64,11 @@ def find_lines(
     line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
 
         The larger this value, smaller the detected lines. Making it
         too large will lead to text being detected as lines.
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.
 
-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
 
     Returns
     -------
     dmask : object

@@ -93,7 +78,6 @@ def find_lines(
         List of tuples representing vertical/horizontal lines with
         coordinates relative to a left-top origin in
         image coordinate space.
 
     """
     lines = []
 

@@ -104,15 +88,13 @@
         size = threshold.shape[1] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
     elif direction is None:
-        raise ValueError(
-            "Specify direction as either 'vertical' or 'horizontal'"
-        )
+        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")
 
     if regions is not None:
         region_mask = np.zeros(threshold.shape)
         for region in regions:
             x, y, w, h = region
-            region_mask[y:y + h, x:x + w] = 1
+            region_mask[y : y + h, x : x + w] = 1
         threshold = np.multiply(threshold, region_mask)
 
     threshold = cv2.erode(threshold, el)
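The hunk above also shows the arithmetic behind line_scale: the structuring element is as wide (or tall) as the page dimension divided by line_scale, so only pixel runs at least that long survive the erode/dilate pass. A standalone illustration with made-up numbers, independent of camelot:

import cv2
import numpy as np

# A blank 300x1700 "page" with a single long horizontal rule (illustrative only).
threshold = np.zeros((300, 1700), dtype=np.uint8)
threshold[150, 100:1500] = 255

line_scale = 15
size = threshold.shape[1] // line_scale  # 1700 // 15 == 113 px minimum line length
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))  # wide, 1-px-tall kernel

eroded = cv2.erode(threshold, el)   # shorter marks (e.g. text strokes) vanish here
restored = cv2.dilate(eroded, el)   # surviving lines are grown back to full length
print(size, int(restored.max()))    # 113 255 -> the long rule is still detected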
@@ -121,14 +103,12 @@
 
     try:
         _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )
     except ValueError:
         # for opencv backward compatibility
         contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
 
     for c in contours:
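The try/except kept intact above guards against the differing return arity of cv2.findContours: OpenCV 3.x returns (image, contours, hierarchy) while 2.x and 4.x return (contours, hierarchy). A small self-contained sketch of the same guard:

import cv2
import numpy as np

mask = np.zeros((100, 100), dtype=np.uint8)
cv2.rectangle(mask, (20, 20), (80, 80), 255, -1)  # one filled square -> one contour

try:
    # OpenCV 3.x: (image, contours, hierarchy)
    _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
    # OpenCV 2.x / 4.x: (contours, hierarchy)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

print(len(contours))  # 1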
@@ -145,21 +125,18 @@ def find_contours(vertical, horizontal):
 
 def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
 
     Parameters
     ----------
     vertical : object
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
 
     Returns
     -------
     cont : list
         List of tuples representing table boundaries. Each tuple is of
         the form (x, y, w, h) where (x, y) -> left-top, w -> width and
         h -> height in image coordinate space.
 
     """
     mask = vertical + horizontal
 

@@ -185,7 +162,6 @@ def find_contours(vertical, horizontal):
 
 def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
 
     Parameters
     ----------
     contours : list

@@ -196,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.
 
     Returns
     -------
     tables : dict

@@ -204,13 +179,12 @@
         in that boundary as their value.
         Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
         and (x2, y2) -> rt in image coordinate space.
 
     """
     joints = np.multiply(vertical, horizontal)
     tables = {}
     for c in contours:
         x, y, w, h = c
-        roi = joints[y:y + h, x:x + w]
+        roi = joints[y : y + h, x : x + w]
         try:
             __, jc, __ = cv2.findContours(
                 roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE
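Taken together, these four helpers form the image side of the Lattice-style pipeline: threshold the page, extract vertical and horizontal line masks, derive table boundaries from the combined masks, and key each boundary to the line intersections inside it. A sketch of how they chain, assuming the camelot.image_processing import path and a hypothetical page image; the signatures and return shapes are taken from the docstrings in this diff (find_lines is assumed to return the dilated mask followed by the line segments):

from camelot.image_processing import (  # assumed module path
    adaptive_threshold,
    find_contours,
    find_joints,
    find_lines,
)

# "page.png" is a hypothetical raster export of a PDF page.
img, threshold = adaptive_threshold("page.png", process_background=False)

# Line masks and segments for each direction.
v_mask, v_segments = find_lines(threshold, direction="vertical", line_scale=15)
h_mask, h_segments = find_lines(threshold, direction="horizontal", line_scale=15)

# Table boundaries from the combined masks, then the joints inside each boundary.
contours = find_contours(v_mask, h_mask)
tables = find_joints(contours, v_mask, h_mask)

for (x1, y1, x2, y2), joints in tables.items():
    print((x1, y1, x2, y2), len(joints))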
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long