Parameterize thresholding blocksize and constant
parent
8b07aa2702
commit
72233f25ce
|
|
@ -139,6 +139,17 @@ class Lattice:
|
||||||
List of ints specifying m-tolerance parameters.
|
List of ints specifying m-tolerance parameters.
|
||||||
(optional, default: [2])
|
(optional, default: [2])
|
||||||
|
|
||||||
|
blocksize : int
|
||||||
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
(optional, default: 15)
|
||||||
|
|
||||||
|
threshold_constant : float
|
||||||
|
Constant subtracted from the mean or weighted mean
|
||||||
|
(see the details below). Normally, it is positive but may be
|
||||||
|
zero or negative as well.
|
||||||
|
(optional, default: -2)
|
||||||
|
|
||||||
scale : int
|
scale : int
|
||||||
Used to divide the height/width of a pdf to get a structuring
|
Used to divide the height/width of a pdf to get a structuring
|
||||||
element for image processing.
|
element for image processing.
|
||||||
|
|
@ -177,15 +188,17 @@ class Lattice:
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
|
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
|
||||||
scale=15, invert=False, margins=(1.0, 0.5, 0.1),
|
blocksize=15, threshold_constant=-2, scale=15, invert=False,
|
||||||
split_text=False, flag_size=True, shift_text=['l', 't'],
|
margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
|
||||||
debug=None):
|
shift_text=['l', 't'], debug=None):
|
||||||
|
|
||||||
self.method = 'lattice'
|
self.method = 'lattice'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.fill = fill
|
self.fill = fill
|
||||||
self.headers = headers
|
self.headers = headers
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
|
self.blocksize = blocksize
|
||||||
|
self.threshold_constant = threshold_constant
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.invert = invert
|
self.invert = invert
|
||||||
self.char_margin, self.line_margin, self.word_margin = margins
|
self.char_margin, self.line_margin, self.word_margin = margins
|
||||||
|
|
@ -230,7 +243,8 @@ class Lattice:
|
||||||
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
||||||
stderr=subprocess.STDOUT)
|
stderr=subprocess.STDOUT)
|
||||||
|
|
||||||
img, threshold = adaptive_threshold(imagename, invert=self.invert)
|
img, threshold = adaptive_threshold(imagename, invert=self.invert,
|
||||||
|
blocksize=self.blocksize, c=self.threshold_constant)
|
||||||
pdf_x = width
|
pdf_x = width
|
||||||
pdf_y = height
|
pdf_y = height
|
||||||
img_x = img.shape[1]
|
img_x = img.shape[1]
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,17 @@ class OCR:
|
||||||
List of ints specifying m-tolerance parameters.
|
List of ints specifying m-tolerance parameters.
|
||||||
(optional, default: [2])
|
(optional, default: [2])
|
||||||
|
|
||||||
|
blocksize : int
|
||||||
|
Size of a pixel neighborhood that is used to calculate a
|
||||||
|
threshold value for the pixel: 3, 5, 7, and so on.
|
||||||
|
(optional, default: 15)
|
||||||
|
|
||||||
|
threshold_constant : float
|
||||||
|
Constant subtracted from the mean or weighted mean
|
||||||
|
(see the details below). Normally, it is positive but may be
|
||||||
|
zero or negative as well.
|
||||||
|
(optional, default: -2)
|
||||||
|
|
||||||
dpi : int
|
dpi : int
|
||||||
Dots per inch.
|
Dots per inch.
|
||||||
(optional, default: 300)
|
(optional, default: 300)
|
||||||
|
|
@ -46,12 +57,14 @@ class OCR:
|
||||||
of detected contours, lines, joints and the table generated.
|
of detected contours, lines, joints and the table generated.
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, mtol=[2], dpi=300, lang="eng", scale=15,
|
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
||||||
debug=None):
|
dpi=300, lang="eng", scale=15, debug=None):
|
||||||
|
|
||||||
self.method = 'ocr'
|
self.method = 'ocr'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.mtol = mtol
|
self.mtol = mtol
|
||||||
|
self.blocksize = blocksize
|
||||||
|
self.threshold_constant = threshold_constant
|
||||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
self.tool = pyocr.get_available_tools()[0] # fix this
|
||||||
self.dpi = dpi
|
self.dpi = dpi
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
|
|
@ -75,7 +88,8 @@ class OCR:
|
||||||
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
|
||||||
stderr=subprocess.STDOUT)
|
stderr=subprocess.STDOUT)
|
||||||
|
|
||||||
img, threshold = adaptive_threshold(imagename)
|
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
||||||
|
c=self.threshold_constant)
|
||||||
vmask, v_segments = find_lines(threshold, direction='vertical',
|
vmask, v_segments = find_lines(threshold, direction='vertical',
|
||||||
scale=self.scale)
|
scale=self.scale)
|
||||||
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
hmask, h_segments = find_lines(threshold, direction='horizontal',
|
||||||
|
|
|
||||||
|
|
@ -570,14 +570,17 @@ def get_score(error_weights):
|
||||||
score : float
|
score : float
|
||||||
"""
|
"""
|
||||||
SCORE_VAL = 100
|
SCORE_VAL = 100
|
||||||
score = 0
|
try:
|
||||||
if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
|
score = 0
|
||||||
raise ValueError("Please assign a valid weightage to each parameter"
|
if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
|
||||||
" such that their sum is equal to 100")
|
raise ValueError("Please assign a valid weightage to each parameter"
|
||||||
for ew in error_weights:
|
" such that their sum is equal to 100")
|
||||||
weight = ew[0] / len(ew[1])
|
for ew in error_weights:
|
||||||
for error_percentage in ew[1]:
|
weight = ew[0] / len(ew[1])
|
||||||
score += weight * (1 - error_percentage)
|
for error_percentage in ew[1]:
|
||||||
|
score += weight * (1 - error_percentage)
|
||||||
|
except ZeroDivisionError:
|
||||||
|
score = 0
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
"""
|
||||||
|
usage: python threshold.py file.png blocksize threshold_constant
|
||||||
|
|
||||||
|
shows thresholded image.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(func):
    """Decorate ``func`` so that every call reports its wall-clock duration.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    timed : callable
        Wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time, and returns ``func``'s result.
    """
    # Imported locally so the decorator does not rely on a module-level
    # import that this block did not previously need.
    import functools

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single parenthesised argument is valid both as a Python 2 print
        # statement and as a Python 3 print() call.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
|
||||||
|
|
||||||
|
|
||||||
|
@timeit
def main():
    """Threshold the image named on the command line and display the result.

    Reads three positional command-line arguments:

    * ``sys.argv[1]`` -- path of the image file to load.
    * ``sys.argv[2]`` -- blocksize, converted with ``int``; the size of the
      pixel neighborhood used for the threshold (3, 5, 7, and so on).
    * ``sys.argv[3]`` -- threshold constant, converted with ``float``.

    Raises
    ------
    IOError
        If the image file could not be read.
    """
    img = cv2.imread(sys.argv[1])
    if img is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with a clear message instead of inside cvtColor.
        raise IOError('Could not read image: %s' % sys.argv[1])
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blocksize = int(sys.argv[2])
    threshold_constant = float(sys.argv[3])
    # The grayscale image is inverted before thresholding.
    threshold = cv2.adaptiveThreshold(
        np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, blocksize, threshold_constant)
    # Show the thresholded result (previously the untouched input image was
    # displayed and `threshold` was never used). It is single-channel, so
    # render it with a grayscale colormap.
    plt.imshow(threshold, cmap='gray')
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # main() needs three positional arguments (file, blocksize, constant);
    # print the usage text whenever any of them is missing instead of
    # letting main() fail with an IndexError.
    if len(sys.argv) < 4:
        # Parenthesised single argument works on both Python 2 and Python 3.
        print(__doc__)
    else:
        main()
|
||||||
102
tools/camelot
102
tools/camelot
|
|
@ -71,6 +71,8 @@ options:
|
||||||
-H, --header <header> Specify header for each table.
|
-H, --header <header> Specify header for each table.
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||||
which are very close. [default: 2]
|
which are very close. [default: 2]
|
||||||
|
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
||||||
|
-c, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
smaller lines being detected. [default: 15]
|
smaller lines being detected. [default: 15]
|
||||||
-i, --invert Invert pdf image to make sure that lines are
|
-i, --invert Invert pdf image to make sure that lines are
|
||||||
|
|
@ -109,16 +111,18 @@ usage:
|
||||||
camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>
|
camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-t, --tarea <tarea> Specific table areas to analyze.
|
-t, --tarea <tarea> Specific table areas to analyze.
|
||||||
-m, --mtol <mtol> Tolerance to account for when merging lines
|
-m, --mtol <mtol> Tolerance to account for when merging lines
|
||||||
which are very close. [default: 2]
|
which are very close. [default: 2]
|
||||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
||||||
[default: 300]
|
-c, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
[default: 300]
|
||||||
smaller lines being detected. [default: 15]
|
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||||
-d, --debug <debug> Debug by visualizing pdf geometry.
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
(contour,line,joint,table) Example: -d table
|
smaller lines being detected. [default: 15]
|
||||||
|
-d, --debug <debug> Debug by visualizing pdf geometry.
|
||||||
|
(contour,line,joint,table) Example: -d table
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -374,20 +378,23 @@ if __name__ == '__main__':
|
||||||
float(args['--wmargin']))
|
float(args['--wmargin']))
|
||||||
if args['<method>'] == 'lattice':
|
if args['<method>'] == 'lattice':
|
||||||
try:
|
try:
|
||||||
tarea = args['--tarea'] if args['--tarea'] else None
|
kwargs = {
|
||||||
fill = args['--fill'] if args['--fill'] else None
|
'table_area': args['--tarea'] if args['--tarea'] else None,
|
||||||
header = args['--header'] if args['--header'] else None
|
'fill': args['--fill'] if args['--fill'] else None,
|
||||||
mtol = [int(m) for m in args['--mtol']]
|
'headers': args['--header'] if args['--header'] else None,
|
||||||
shift_text = list(args['--shift_text']) if args['--shift_text'] else ['l', 't']
|
'mtol': [int(m) for m in args['--mtol']],
|
||||||
manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header,
|
'blocksize': int(args['--blocksize']),
|
||||||
mtol=mtol, scale=int(args['--scale']),
|
'threshold_constant': float(args['--constant']),
|
||||||
invert=args['--invert'], margins=margins,
|
'scale': int(args['--scale']),
|
||||||
split_text=args['--split_text'], flag_size=args['--flag_size'],
|
'invert': args['--invert'],
|
||||||
shift_text=shift_text, debug=args['--debug']),
|
'margins': margins,
|
||||||
filename,
|
'split_text': args['--split_text'],
|
||||||
pagenos=p,
|
'flag_size': args['--flag_size'],
|
||||||
parallel=args['--parallel'],
|
'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'],
|
||||||
clean=True)
|
'debug': args['--debug']
|
||||||
|
}
|
||||||
|
manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
|
||||||
|
parallel=args['--parallel'])
|
||||||
data = manager.extract()
|
data = manager.extract()
|
||||||
|
|
||||||
processing_time = time.time() - start_time
|
processing_time = time.time() - start_time
|
||||||
|
|
@ -441,19 +448,19 @@ if __name__ == '__main__':
|
||||||
sys.exit()
|
sys.exit()
|
||||||
elif args['<method>'] == 'stream':
|
elif args['<method>'] == 'stream':
|
||||||
try:
|
try:
|
||||||
tarea = args['--tarea'] if args['--tarea'] else None
|
kwargs = {
|
||||||
columns = args['--columns'] if args['--columns'] else None
|
'table_area': args['--tarea'] if args['--tarea'] else None,
|
||||||
header = args['--header'] if args['--header'] else None
|
'columns': args['--columns'] if args['--columns'] else None,
|
||||||
ytol = [int(y) for y in args['--ytol']]
|
'headers': args['--header'] if args['--header'] else None,
|
||||||
mtol = [int(m) for m in args['--mtol']]
|
'ytol': [int(y) for y in args['--ytol']],
|
||||||
manager = Pdf(Stream(table_area=tarea, columns=columns,
|
'mtol': [int(m) for m in args['--mtol']],
|
||||||
headers=header, ytol=ytol, mtol=mtol,
|
'margins': margins,
|
||||||
margins=margins, split_text=args['--split_text'],
|
'split_text': args['--split_text'],
|
||||||
flag_size=args['--flag_size'], debug=args['--debug']),
|
'flag_size': args['--flag_size'],
|
||||||
filename,
|
'debug': args['--debug']
|
||||||
pagenos=p,
|
}
|
||||||
parallel=args['--parallel'],
|
manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
|
||||||
clean=True)
|
parallel=args['--parallel'])
|
||||||
data = manager.extract()
|
data = manager.extract()
|
||||||
|
|
||||||
processing_time = time.time() - start_time
|
processing_time = time.time() - start_time
|
||||||
|
|
@ -506,15 +513,18 @@ if __name__ == '__main__':
|
||||||
sys.exit()
|
sys.exit()
|
||||||
elif args['<method>'] == 'ocr':
|
elif args['<method>'] == 'ocr':
|
||||||
try:
|
try:
|
||||||
tarea = args['--tarea'] if args['--tarea'] else None
|
kwargs = {
|
||||||
mtol = [int(m) for m in args['--mtol']]
|
'table_area': args['--tarea'] if args['--tarea'] else None,
|
||||||
manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']),
|
'mtol': [int(m) for m in args['--mtol']],
|
||||||
lang=args['--lang'], scale=int(args['--scale']),
|
'blocksize': int(args['--blocksize']),
|
||||||
debug=args['--debug']),
|
'threshold_constant': float(args['--constant']),
|
||||||
filename,
|
'dpi': int(args['--dpi']),
|
||||||
pagenos=p,
|
'lang': args['--lang'],
|
||||||
parallel=args['--parallel'],
|
'scale': int(args['--scale']),
|
||||||
clean=True)
|
'debug': args['--debug']
|
||||||
|
}
|
||||||
|
manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
|
||||||
|
parallel=args['--parallel'])
|
||||||
data = manager.extract()
|
data = manager.extract()
|
||||||
|
|
||||||
processing_time = time.time() - start_time
|
processing_time = time.time() - start_time
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue