Clean up notebooks, address review comments

* Improve explanations of network, hybrid, and lattice parsers
* Remove dead code from parser comparison notebook
* Clean up notebook variables to reduce size and make diffs cleaner
* Revert changes that were peripheral to the core changes

pull/153/head
parent 71805f9333
commit 42f8321c8c

@@ -1,10 +1,5 @@
 version = 1

-test_patterns = [
-  "tests/**",
-  "test_*.py"
-]
-
 exclude_patterns = [
   "camelot/ext/**"
 ]

@@ -4,10 +4,8 @@ __pycache__/
 build/
 dist/
 prof/
 *.egg-info/
 .eggs/
 .tox/
 .coverage
 coverage.xml

@@ -19,5 +17,3 @@ htmlcov/
 # vscode
 .vscode

 .DS_Store

@@ -1,3 +1,4 @@
 sudo: true
 language: python
 cache: pip
 addons:

@@ -4,11 +4,8 @@ import cv2
 import numpy as np


-def adaptive_threshold(
-        imagename, process_background=False,
-        blocksize=15, c=-2):
+def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     """Thresholds an image using OpenCV's adaptiveThreshold.

     Parameters
     ----------
     imagename : string

@@ -18,31 +15,24 @@ def adaptive_threshold(
     blocksize : int, optional (default: 15)
         Size of a pixel neighborhood that is used to calculate a
         threshold value for the pixel: 3, 5, 7, and so on.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
     c : int, optional (default: -2)
         Constant subtracted from the mean or weighted mean.
         Normally, it is positive but may be zero or negative as well.

-        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_. # noqa
+        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.

     Returns
     -------
     img : object
         numpy.ndarray representing the original image.
     threshold : object
         numpy.ndarray representing the thresholded image.

     """
     img = cv2.imread(imagename)
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

     if process_background:
         threshold = cv2.adaptiveThreshold(
-            gray,
-            255,
-            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY, blocksize, c
+            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, c
         )
     else:
         threshold = cv2.adaptiveThreshold(
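
For orientation, a minimal usage sketch of the function reformatted above. The module path camelot.image_processing and the file name page.png are assumptions, not part of this diff:

# Minimal usage sketch; assumes camelot is installed and "page.png" exists on disk.
from camelot.image_processing import adaptive_threshold

# Returns the original image and its binarized counterpart produced by
# cv2.adaptiveThreshold (Gaussian-weighted mean over a blocksize x blocksize
# neighborhood, offset by the constant c).
img, threshold = adaptive_threshold(
    "page.png", process_background=False, blocksize=15, c=-2
)
print(img.shape, threshold.shape)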

@@ -57,12 +47,10 @@ def adaptive_threshold(


 def find_lines(
-    threshold, regions=None,
-    direction="horizontal", line_scale=15, iterations=0
+    threshold, regions=None, direction="horizontal", line_scale=15, iterations=0
 ):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.

     Parameters
     ----------
     threshold : object

@@ -76,14 +64,11 @@ def find_lines(
     line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.

         The larger this value, smaller the detected lines. Making it
         too large will lead to text being detected as lines.
     iterations : int, optional (default: 0)
         Number of times for erosion/dilation is applied.

-        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_. # noqa
+        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.

     Returns
     -------
     dmask : object

@@ -93,7 +78,6 @@ def find_lines(
         List of tuples representing vertical/horizontal lines with
         coordinates relative to a left-top origin in
         image coordinate space.

     """
     lines = []

@@ -104,9 +88,7 @@ def find_lines(
         size = threshold.shape[1] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
     elif direction is None:
-        raise ValueError(
-            "Specify direction as either 'vertical' or 'horizontal'"
-        )
+        raise ValueError("Specify direction as either 'vertical' or 'horizontal'")

     if regions is not None:
         region_mask = np.zeros(threshold.shape)
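
For readers following the morphological approach described in the find_lines docstring, here is a simplified, self-contained sketch of the idea; the helper name detect_lines is hypothetical and this is not the library's exact implementation. Eroding and then dilating with a long, thin kernel keeps only line-like pixel runs:

import cv2
import numpy as np

def detect_lines(threshold, direction="horizontal", line_scale=15, iterations=0):
    # Sketch only: a long, thin rectangular kernel keeps pixel runs that are
    # at least (image dimension // line_scale) pixels long in the given direction.
    if direction == "vertical":
        size = threshold.shape[0] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    else:
        size = threshold.shape[1] // line_scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    mask = cv2.erode(threshold.astype(np.uint8), el)  # remove short runs
    mask = cv2.dilate(mask, el)                       # restore surviving lines
    if iterations:
        # Extra dilation thickens the detected lines, mirroring the
        # iterations parameter documented above.
        mask = cv2.dilate(mask, el, iterations=iterations)
    return mask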

@@ -121,14 +103,12 @@ def find_lines(

     try:
         _, contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )
     except ValueError:
         # for opencv backward compatibility
         contours, _ = cv2.findContours(
-            threshold.astype(np.uint8), cv2.RETR_EXTERNAL,
-            cv2.CHAIN_APPROX_SIMPLE
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
         )

     for c in contours:
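
The try/except above exists because cv2.findContours returns (image, contours, hierarchy) in OpenCV 3.x but (contours, hierarchy) in 2.x and 4.x. A small version-agnostic sketch of the same idea (the helper name is hypothetical):

import cv2

def external_contours(binary):
    # Handle both the 2-tuple (OpenCV 2.x/4.x) and 3-tuple (OpenCV 3.x)
    # return shapes of cv2.findContours without relying on an exception.
    result = cv2.findContours(
        binary.astype("uint8"), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    return result[0] if len(result) == 2 else result[1]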

@@ -145,21 +125,18 @@ def find_lines(

 def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.

     Parameters
     ----------
     vertical : object
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.

     Returns
     -------
     cont : list
         List of tuples representing table boundaries. Each tuple is of
         the form (x, y, w, h) where (x, y) -> left-top, w -> width and
         h -> height in image coordinate space.

     """
     mask = vertical + horizontal
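
A self-contained sketch of the boundary step the find_contours docstring describes, using synthetic masks in place of real find_lines output (the example data is made up):

import cv2
import numpy as np

# Synthetic masks standing in for find_lines output: a rough table frame.
vertical = np.zeros((200, 200), dtype=np.uint8)
horizontal = np.zeros((200, 200), dtype=np.uint8)
vertical[20:180, [20, 180]] = 255    # two vertical lines
horizontal[[20, 180], 20:180] = 255  # two horizontal lines

# Combine the masks, find external contours, and reduce each contour to its
# bounding rectangle (x, y, w, h), as the docstring describes.
mask = vertical + horizontal
result = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = result[0] if len(result) == 2 else result[1]
boundaries = [cv2.boundingRect(c) for c in contours]
print(boundaries)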

@@ -185,7 +162,6 @@ def find_contours(vertical, horizontal):

 def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.

     Parameters
     ----------
     contours : list

@@ -196,7 +172,6 @@ def find_joints(contours, vertical, horizontal):
         numpy.ndarray representing pixels where vertical lines lie.
     horizontal : object
         numpy.ndarray representing pixels where horizontal lines lie.

     Returns
     -------
     tables : dict

@@ -204,7 +179,6 @@ def find_joints(contours, vertical, horizontal):
         in that boundary as their value.
         Keys are of the form (x1, y1, x2, y2) where (x1, y1) -> lb
         and (x2, y2) -> rt in image coordinate space.

     """
     joints = np.multiply(vertical, horizontal)
     tables = {}
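
Finally, a toy illustration of the joint-detection idea in find_joints: multiplying the two masks keeps only pixels where a vertical and a horizontal line cross (the example arrays are made up):

import numpy as np

# One vertical line at column 3 and one horizontal line at row 5.
vertical = np.zeros((10, 10), dtype=np.uint8)
horizontal = np.zeros((10, 10), dtype=np.uint8)
vertical[:, 3] = 1
horizontal[5, :] = 1

# The element-wise product is nonzero only where both masks are set,
# i.e. at line intersections ("joints").
joints = np.multiply(vertical, horizontal)
ys, xs = np.nonzero(joints)
print(list(zip(xs.tolist(), ys.tolist())))  # [(3, 5)]
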
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long