Parameterize thresholding blocksize and constant

2017-04-10 21:15:54 +05:30 · 2017-04-10 21:15:54 +05:30 · 72233f25ce
parent 8b07aa2702
commit 72233f25ce
5 changed files with 143 additions and 61 deletions
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@ -139,6 +139,17 @@ class Lattice:
        List of ints specifying m-tolerance parameters.
        (optional, default: [2])

+    blocksize : int
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+        (optional, default: 15)
+
+    threshold_constant : float
+        Constant subtracted from the mean or weighted mean
+        of the pixel neighborhood. Normally, it is positive but may be
+        zero or negative as well.
+        (optional, default: -2)
+
    scale : int
        Used to divide the height/width of a pdf to get a structuring
        element for image processing.
@ -177,15 +188,17 @@ class Lattice:
        (optional, default: None)
    """
    def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
-                 scale=15, invert=False, margins=(1.0, 0.5, 0.1),
-                 split_text=False, flag_size=True, shift_text=['l', 't'],
-                 debug=None):
+                 blocksize=15, threshold_constant=-2, scale=15, invert=False,
+                 margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
+                 shift_text=['l', 't'], debug=None):

        self.method = 'lattice'
        self.table_area = table_area
        self.fill = fill
        self.headers = headers
        self.mtol = mtol
+        self.blocksize = blocksize
+        self.threshold_constant = threshold_constant
        self.scale = scale
        self.invert = invert
        self.char_margin, self.line_margin, self.word_margin = margins
@ -230,7 +243,8 @@ class Lattice:
        subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
            stderr=subprocess.STDOUT)

-        img, threshold = adaptive_threshold(imagename, invert=self.invert)
+        img, threshold = adaptive_threshold(imagename, invert=self.invert,
+            blocksize=self.blocksize, c=self.threshold_constant)
        pdf_x = width
        pdf_y = height
        img_x = img.shape[1]
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@ -27,6 +27,17 @@ class OCR:
        List of ints specifying m-tolerance parameters.
        (optional, default: [2])

+    blocksize : int
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+        (optional, default: 15)
+
+    threshold_constant : float
+        Constant subtracted from the mean or weighted mean
+        of the pixel neighborhood. Normally, it is positive but may be
+        zero or negative as well.
+        (optional, default: -2)
+
    dpi : int
        Dots per inch.
        (optional, default: 300)
@ -46,12 +57,14 @@ class OCR:
        of detected contours, lines, joints and the table generated.
        (optional, default: None)
    """
-    def __init__(self, table_area=None, mtol=[2], dpi=300, lang="eng", scale=15,
-                 debug=None):
+    def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
+                 dpi=300, lang="eng", scale=15, debug=None):

        self.method = 'ocr'
        self.table_area = table_area
        self.mtol = mtol
+        self.blocksize = blocksize
+        self.threshold_constant = threshold_constant
        self.tool = pyocr.get_available_tools()[0] # fix this
        self.dpi = dpi
        self.lang = lang
@ -75,7 +88,8 @@ class OCR:
        subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
            stderr=subprocess.STDOUT)

-        img, threshold = adaptive_threshold(imagename)
+        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
+            c=self.threshold_constant)
        vmask, v_segments = find_lines(threshold, direction='vertical',
            scale=self.scale)
        hmask, h_segments = find_lines(threshold, direction='horizontal',
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -570,14 +570,17 @@ def get_score(error_weights):
    score : float
    """
    SCORE_VAL = 100
-    score = 0
-    if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
-        raise ValueError("Please assign a valid weightage to each parameter"
-                         " such that their sum is equal to 100")
-    for ew in error_weights:
-        weight = ew[0] / len(ew[1])
-        for error_percentage in ew[1]:
-            score += weight * (1 - error_percentage)
+    try:
+        score = 0
+        if sum([ew[0] for ew in error_weights]) != SCORE_VAL:
+            raise ValueError("Please assign a valid weightage to each parameter"
+                             " such that their sum is equal to 100")
+        for ew in error_weights:
+            weight = ew[0] / len(ew[1])
+            for error_percentage in ew[1]:
+                score += weight * (1 - error_percentage)
+    except ZeroDivisionError:
+        score = 0
    return score


--- a/debug/threshold.py
+++ b/debug/threshold.py
@ -0,0 +1,41 @@
+"""
+usage: python threshold.py file.png blocksize threshold_constant
+
+shows thresholded image.
+"""
+
+import sys
+import time
+
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def timeit(func):  # decorator: prints how long each call to func took
+    def timed(*args, **kw):
+        start = time.time()
+        result = func(*args, **kw)
+        end = time.time()
+        print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start)  # Python 2 print statement
+        return result  # hand the wrapped function's result back unchanged
+    return timed
+
+
+@timeit
+def main():
+    img = cv2.imread(sys.argv[1])  # argv[1]: path of the image to threshold
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    blocksize = int(sys.argv[2])  # argv[2]: neighborhood size given to adaptiveThreshold
+    threshold_constant = float(sys.argv[3])  # argv[3]: constant given to adaptiveThreshold
+    threshold = cv2.adaptiveThreshold(np.invert(gray), 255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, threshold_constant)
+    plt.imshow(threshold, cmap='gray')  # show the thresholded result (was: the unprocessed input)
+    plt.show()
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 1:
+        print __doc__
+    else:
+        main()
--- a/tools/camelot
+++ b/tools/camelot
@ -71,6 +71,8 @@ options:
 -H, --header <header>          Specify header for each table.
 -m, --mtol <mtol>              Tolerance to account for when merging lines
                                which are very close. [default: 2]
+ -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
+ -c, --constant <constant>      See adaptive threshold doc. [default: -2]
 -s, --scale <scale>            Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
 -i, --invert                   Invert pdf image to make sure that lines are
@ -109,16 +111,18 @@ usage:
 camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>

 options:
- -t, --tarea <tarea>  Specific table areas to analyze.
- -m, --mtol <mtol>    Tolerance to account for when merging lines
-                      which are very close. [default: 2]
- -D, --dpi <dpi>      Dots per inch, specify image quality to be used for OCR.
-                      [default: 300]
- -l, --lang <lang>    Specify language to be used for OCR. [default: eng]
- -s, --scale <scale>  Scaling factor. Large scaling factor leads to
-                      smaller lines being detected. [default: 15]
- -d, --debug <debug>  Debug by visualizing pdf geometry.
-                      (contour,line,joint,table) Example: -d table
+ -t, --tarea <tarea>          Specific table areas to analyze.
+ -m, --mtol <mtol>            Tolerance to account for when merging lines
+                              which are very close. [default: 2]
+ -b, --blocksize <blocksize>  See adaptive threshold doc. [default: 15]
+ -c, --constant <constant>    See adaptive threshold doc. [default: -2]
+ -D, --dpi <dpi>              Dots per inch, specify image quality to be used for OCR.
+                              [default: 300]
+ -l, --lang <lang>            Specify language to be used for OCR. [default: eng]
+ -s, --scale <scale>          Scaling factor. Large scaling factor leads to
+                              smaller lines being detected. [default: 15]
+ -d, --debug <debug>          Debug by visualizing pdf geometry.
+                              (contour,line,joint,table) Example: -d table
 """


@ -374,20 +378,23 @@ if __name__ == '__main__':
        float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
-            tarea = args['--tarea'] if args['--tarea'] else None
-            fill = args['--fill'] if args['--fill'] else None
-            header = args['--header'] if args['--header'] else None
-            mtol = [int(m) for m in args['--mtol']]
-            shift_text = list(args['--shift_text']) if args['--shift_text'] else ['l', 't']
-            manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header,
-                                  mtol=mtol, scale=int(args['--scale']),
-                                  invert=args['--invert'], margins=margins,
-                                  split_text=args['--split_text'], flag_size=args['--flag_size'],
-                                  shift_text=shift_text, debug=args['--debug']),
-                          filename,
-                          pagenos=p,
-                          parallel=args['--parallel'],
-                          clean=True)
+            kwargs = {
+                'table_area': args['--tarea'] if args['--tarea'] else None,
+                'fill': args['--fill'] if args['--fill'] else None,
+                'headers': args['--header'] if args['--header'] else None,
+                'mtol': [int(m) for m in args['--mtol']],
+                'blocksize': int(args['--blocksize']),
+                'threshold_constant': float(args['--constant']),
+                'scale': int(args['--scale']),
+                'invert': args['--invert'],
+                'margins': margins,
+                'split_text': args['--split_text'],
+                'flag_size': args['--flag_size'],
+                'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'],
+                'debug': args['--debug']
+            }
+            manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True,
+                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
@ -441,19 +448,19 @@ if __name__ == '__main__':
            sys.exit()
    elif args['<method>'] == 'stream':
        try:
-            tarea = args['--tarea'] if args['--tarea'] else None
-            columns = args['--columns'] if args['--columns'] else None
-            header = args['--header'] if args['--header'] else None
-            ytol = [int(y) for y in args['--ytol']]
-            mtol = [int(m) for m in args['--mtol']]
-            manager = Pdf(Stream(table_area=tarea, columns=columns,
-                                 headers=header, ytol=ytol, mtol=mtol,
-                                 margins=margins, split_text=args['--split_text'],
-                                 flag_size=args['--flag_size'], debug=args['--debug']),
-                          filename,
-                          pagenos=p,
-                          parallel=args['--parallel'],
-                          clean=True)
+            kwargs = {
+                'table_area': args['--tarea'] if args['--tarea'] else None,
+                'columns': args['--columns'] if args['--columns'] else None,
+                'headers': args['--header'] if args['--header'] else None,
+                'ytol': [int(y) for y in args['--ytol']],
+                'mtol': [int(m) for m in args['--mtol']],
+                'margins': margins,
+                'split_text': args['--split_text'],
+                'flag_size': args['--flag_size'],
+                'debug': args['--debug']
+            }
+            manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True,
+                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time
@ -506,15 +513,18 @@ if __name__ == '__main__':
            sys.exit()
    elif args['<method>'] == 'ocr':
        try:
-            tarea = args['--tarea'] if args['--tarea'] else None
-            mtol = [int(m) for m in args['--mtol']]
-            manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']),
-                                  lang=args['--lang'], scale=int(args['--scale']),
-                                  debug=args['--debug']),
-                          filename,
-                          pagenos=p,
-                          parallel=args['--parallel'],
-                          clean=True)
+            kwargs = {
+                'table_area': args['--tarea'] if args['--tarea'] else None,
+                'mtol': [int(m) for m in args['--mtol']],
+                'blocksize': int(args['--blocksize']),
+                'threshold_constant': float(args['--constant']),
+                'dpi': int(args['--dpi']),
+                'lang': args['--lang'],
+                'scale': int(args['--scale']),
+                'debug': args['--debug']
+            }
+            manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
+                          parallel=args['--parallel'])
            data = manager.extract()

            processing_time = time.time() - start_time