Parameterize thresholding blocksize and constant

pull/2/head
Vinayak Mehta 2017-04-10 21:15:54 +05:30
parent 8b07aa2702
commit 72233f25ce
5 changed files with 143 additions and 61 deletions

View File

@ -139,6 +139,17 @@ class Lattice:
List of ints specifying m-tolerance parameters. List of ints specifying m-tolerance parameters.
(optional, default: [2]) (optional, default: [2])
blocksize: int
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15)
threshold_constant : float
Constant subtracted from the mean or weighted mean of the
pixel neighborhood. Normally, it is positive but may be
zero or negative as well.
(optional, default: -2)
scale : int scale : int
Used to divide the height/width of a pdf to get a structuring Used to divide the height/width of a pdf to get a structuring
element for image processing. element for image processing.
@ -177,15 +188,17 @@ class Lattice:
(optional, default: None) (optional, default: None)
""" """
def __init__(self, table_area=None, fill=None, headers=None, mtol=[2], def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
scale=15, invert=False, margins=(1.0, 0.5, 0.1), blocksize=15, threshold_constant=-2, scale=15, invert=False,
split_text=False, flag_size=True, shift_text=['l', 't'], margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
debug=None): shift_text=['l', 't'], debug=None):
self.method = 'lattice' self.method = 'lattice'
self.table_area = table_area self.table_area = table_area
self.fill = fill self.fill = fill
self.headers = headers self.headers = headers
self.mtol = mtol self.mtol = mtol
self.blocksize = blocksize
self.threshold_constant = threshold_constant
self.scale = scale self.scale = scale
self.invert = invert self.invert = invert
self.char_margin, self.line_margin, self.word_margin = margins self.char_margin, self.line_margin, self.word_margin = margins
@ -230,7 +243,8 @@ class Lattice:
subprocess.call(gs_call, stdout=open(os.devnull, 'w'), subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
stderr=subprocess.STDOUT) stderr=subprocess.STDOUT)
img, threshold = adaptive_threshold(imagename, invert=self.invert) img, threshold = adaptive_threshold(imagename, invert=self.invert,
blocksize=self.blocksize, c=self.threshold_constant)
pdf_x = width pdf_x = width
pdf_y = height pdf_y = height
img_x = img.shape[1] img_x = img.shape[1]

View File

@ -27,6 +27,17 @@ class OCR:
List of ints specifying m-tolerance parameters. List of ints specifying m-tolerance parameters.
(optional, default: [2]) (optional, default: [2])
blocksize: int
Size of a pixel neighborhood that is used to calculate a
threshold value for the pixel: 3, 5, 7, and so on.
(optional, default: 15)
threshold_constant : float
Constant subtracted from the mean or weighted mean of the
pixel neighborhood. Normally, it is positive but may be
zero or negative as well.
(optional, default: -2)
dpi : int dpi : int
Dots per inch. Dots per inch.
(optional, default: 300) (optional, default: 300)
@ -46,12 +57,14 @@ class OCR:
of detected contours, lines, joints and the table generated. of detected contours, lines, joints and the table generated.
(optional, default: None) (optional, default: None)
""" """
def __init__(self, table_area=None, mtol=[2], dpi=300, lang="eng", scale=15, def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
debug=None): dpi=300, lang="eng", scale=15, debug=None):
self.method = 'ocr' self.method = 'ocr'
self.table_area = table_area self.table_area = table_area
self.mtol = mtol self.mtol = mtol
self.blocksize = blocksize
self.threshold_constant = threshold_constant
self.tool = pyocr.get_available_tools()[0] # fix this self.tool = pyocr.get_available_tools()[0] # fix this
self.dpi = dpi self.dpi = dpi
self.lang = lang self.lang = lang
@ -75,7 +88,8 @@ class OCR:
subprocess.call(gs_call, stdout=open(os.devnull, 'w'), subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
stderr=subprocess.STDOUT) stderr=subprocess.STDOUT)
img, threshold = adaptive_threshold(imagename) img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
c=self.threshold_constant)
vmask, v_segments = find_lines(threshold, direction='vertical', vmask, v_segments = find_lines(threshold, direction='vertical',
scale=self.scale) scale=self.scale)
hmask, h_segments = find_lines(threshold, direction='horizontal', hmask, h_segments = find_lines(threshold, direction='horizontal',

View File

@ -570,14 +570,17 @@ def get_score(error_weights):
score : float score : float
""" """
SCORE_VAL = 100 SCORE_VAL = 100
score = 0 try:
if sum([ew[0] for ew in error_weights]) != SCORE_VAL: score = 0
raise ValueError("Please assign a valid weightage to each parameter" if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
" such that their sum is equal to 100") raise ValueError("Please assign a valid weightage to each parameter"
for ew in error_weights: " such that their sum is equal to 100")
weight = ew[0] / len(ew[1]) for ew in error_weights:
for error_percentage in ew[1]: weight = ew[0] / len(ew[1])
score += weight * (1 - error_percentage) for error_percentage in ew[1]:
score += weight * (1 - error_percentage)
except ZeroDivisionError:
score = 0
return score return score

41
debug/threshold.py 100644
View File

@ -0,0 +1,41 @@
"""
usage: python threshold.py file.png blocksize threshold_constant
shows thresholded image.
"""
import sys
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
def timeit(func):
    """Decorate ``func`` so that every call prints its wall-clock run time.

    The returned wrapper forwards all positional and keyword arguments to
    ``func``, prints the function's name together with the elapsed time in
    seconds, and returns whatever ``func`` returned.
    """
    # Imported locally so the decorator does not rely on a module-level import.
    import functools

    @functools.wraps(func)  # keep the wrapped function's __name__/__doc__
    def timed(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        # A single parenthesised argument is valid both as a Python 2 print
        # statement and as a Python 3 print() call, with identical output.
        print('Function: %r took: %2.4f seconds' % (func.__name__, end - start))
        return result
    return timed
@timeit
def main():
    """Adaptive-threshold the image given on the command line and show it.

    Command-line arguments read from ``sys.argv``:
        1. path of the image file to load
        2. blocksize, converted with ``int``
        3. threshold constant, converted with ``float``

    Raises
    ------
    IOError
        If the image file cannot be read.
    """
    img = cv2.imread(sys.argv[1])
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise IOError("Could not read image file: %s" % sys.argv[1])
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blocksize = int(sys.argv[2])
    threshold_constant = float(sys.argv[3])
    # The grayscale image is inverted before thresholding.
    threshold = cv2.adaptiveThreshold(
        np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, blocksize, threshold_constant)
    # Display the thresholded result (not the unprocessed input image), so
    # the effect of blocksize and threshold_constant is actually visible.
    plt.imshow(threshold, cmap='gray')
    plt.show()
if __name__ == '__main__':
    # main() reads sys.argv[1], sys.argv[2] and sys.argv[3]; print the usage
    # text unless all three arguments were supplied, rather than crashing
    # with an IndexError on a partial command line.
    if len(sys.argv) < 4:
        print(__doc__)
    else:
        main()

View File

@ -71,6 +71,8 @@ options:
-H, --header <header> Specify header for each table. -H, --header <header> Specify header for each table.
-m, --mtol <mtol> Tolerance to account for when merging lines -m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2] which are very close. [default: 2]
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-c, --constant <constant> See adaptive threshold doc. [default: -2]
-s, --scale <scale> Scaling factor. Large scaling factor leads to -s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15] smaller lines being detected. [default: 15]
-i, --invert Invert pdf image to make sure that lines are -i, --invert Invert pdf image to make sure that lines are
@ -109,16 +111,18 @@ usage:
camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file> camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>
options: options:
-t, --tarea <tarea> Specific table areas to analyze. -t, --tarea <tarea> Specific table areas to analyze.
-m, --mtol <mtol> Tolerance to account for when merging lines -m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2] which are very close. [default: 2]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR. -b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
[default: 300] -c, --constant <constant> See adaptive threshold doc. [default: -2]
-l, --lang <lang> Specify language to be used for OCR. [default: eng] -D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
-s, --scale <scale> Scaling factor. Large scaling factor leads to [default: 300]
smaller lines being detected. [default: 15] -l, --lang <lang> Specify language to be used for OCR. [default: eng]
-d, --debug <debug> Debug by visualizing pdf geometry. -s, --scale <scale> Scaling factor. Large scaling factor leads to
(contour,line,joint,table) Example: -d table smaller lines being detected. [default: 15]
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
""" """
@ -374,20 +378,23 @@ if __name__ == '__main__':
float(args['--wmargin'])) float(args['--wmargin']))
if args['<method>'] == 'lattice': if args['<method>'] == 'lattice':
try: try:
tarea = args['--tarea'] if args['--tarea'] else None kwargs = {
fill = args['--fill'] if args['--fill'] else None 'table_area': args['--tarea'] if args['--tarea'] else None,
header = args['--header'] if args['--header'] else None 'fill': args['--fill'] if args['--fill'] else None,
mtol = [int(m) for m in args['--mtol']] 'headers': args['--header'] if args['--header'] else None,
shift_text = list(args['--shift_text']) if args['--shift_text'] else ['l', 't'] 'mtol': [int(m) for m in args['--mtol']],
manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header, 'blocksize': int(args['--blocksize']),
mtol=mtol, scale=int(args['--scale']), 'threshold_constant': float(args['--constant']),
invert=args['--invert'], margins=margins, 'scale': int(args['--scale']),
split_text=args['--split_text'], flag_size=args['--flag_size'], 'invert': args['--invert'],
shift_text=shift_text, debug=args['--debug']), 'margins': margins,
filename, 'split_text': args['--split_text'],
pagenos=p, 'flag_size': args['--flag_size'],
parallel=args['--parallel'], 'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'],
clean=True) 'debug': args['--debug']
}
manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract() data = manager.extract()
processing_time = time.time() - start_time processing_time = time.time() - start_time
@ -441,19 +448,19 @@ if __name__ == '__main__':
sys.exit() sys.exit()
elif args['<method>'] == 'stream': elif args['<method>'] == 'stream':
try: try:
tarea = args['--tarea'] if args['--tarea'] else None kwargs = {
columns = args['--columns'] if args['--columns'] else None 'table_area': args['--tarea'] if args['--tarea'] else None,
header = args['--header'] if args['--header'] else None 'columns': args['--columns'] if args['--columns'] else None,
ytol = [int(y) for y in args['--ytol']] 'headers': args['--header'] if args['--header'] else None,
mtol = [int(m) for m in args['--mtol']] 'ytol': [int(y) for y in args['--ytol']],
manager = Pdf(Stream(table_area=tarea, columns=columns, 'mtol': [int(m) for m in args['--mtol']],
headers=header, ytol=ytol, mtol=mtol, 'margins': margins,
margins=margins, split_text=args['--split_text'], 'split_text': args['--split_text'],
flag_size=args['--flag_size'], debug=args['--debug']), 'flag_size': args['--flag_size'],
filename, 'debug': args['--debug']
pagenos=p, }
parallel=args['--parallel'], manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
clean=True) parallel=args['--parallel'])
data = manager.extract() data = manager.extract()
processing_time = time.time() - start_time processing_time = time.time() - start_time
@ -506,15 +513,18 @@ if __name__ == '__main__':
sys.exit() sys.exit()
elif args['<method>'] == 'ocr': elif args['<method>'] == 'ocr':
try: try:
tarea = args['--tarea'] if args['--tarea'] else None kwargs = {
mtol = [int(m) for m in args['--mtol']] 'table_area': args['--tarea'] if args['--tarea'] else None,
manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']), 'mtol': [int(m) for m in args['--mtol']],
lang=args['--lang'], scale=int(args['--scale']), 'blocksize': int(args['--blocksize']),
debug=args['--debug']), 'threshold_constant': float(args['--constant']),
filename, 'dpi': int(args['--dpi']),
pagenos=p, 'lang': args['--lang'],
parallel=args['--parallel'], 'scale': int(args['--scale']),
clean=True) 'debug': args['--debug']
}
manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract() data = manager.extract()
processing_time = time.time() - start_time processing_time = time.time() - start_time