diff --git a/camelot/lattice.py b/camelot/lattice.py index 45d15ee..2e82222 100644 --- a/camelot/lattice.py +++ b/camelot/lattice.py @@ -139,6 +139,17 @@ class Lattice: List of ints specifying m-tolerance parameters. (optional, default: [2]) + blocksize : int + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + (optional, default: 15) + + threshold_constant : float + Constant subtracted from the mean or weighted mean of the + pixel neighborhood. Normally, it is positive but may be + zero or negative as well. + (optional, default: -2) + scale : int Used to divide the height/width of a pdf to get a structuring element for image processing. @@ -177,15 +188,17 @@ class Lattice: (optional, default: None) """ def __init__(self, table_area=None, fill=None, headers=None, mtol=[2], - scale=15, invert=False, margins=(1.0, 0.5, 0.1), - split_text=False, flag_size=True, shift_text=['l', 't'], - debug=None): + blocksize=15, threshold_constant=-2, scale=15, invert=False, + margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True, + shift_text=['l', 't'], debug=None): self.method = 'lattice' self.table_area = table_area self.fill = fill self.headers = headers self.mtol = mtol + self.blocksize = blocksize + self.threshold_constant = threshold_constant self.scale = scale self.invert = invert self.char_margin, self.line_margin, self.word_margin = margins @@ -230,7 +243,8 @@ class Lattice: subprocess.call(gs_call, stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT) - img, threshold = adaptive_threshold(imagename, invert=self.invert) + img, threshold = adaptive_threshold(imagename, invert=self.invert, + blocksize=self.blocksize, c=self.threshold_constant) pdf_x = width pdf_y = height img_x = img.shape[1] diff --git a/camelot/ocr.py b/camelot/ocr.py index 16c6631..6d56cdf 100644 --- a/camelot/ocr.py +++ b/camelot/ocr.py @@ -27,6 +27,17 @@ class OCR: List of ints specifying m-tolerance parameters. 
(optional, default: [2]) + blocksize : int + Size of a pixel neighborhood that is used to calculate a + threshold value for the pixel: 3, 5, 7, and so on. + (optional, default: 15) + + threshold_constant : float + Constant subtracted from the mean or weighted mean of the + pixel neighborhood. Normally, it is positive but may be + zero or negative as well. + (optional, default: -2) + dpi : int Dots per inch. (optional, default: 300) @@ -46,12 +57,14 @@ class OCR: of detected contours, lines, joints and the table generated. (optional, default: None) """ - def __init__(self, table_area=None, mtol=[2], dpi=300, lang="eng", scale=15, - debug=None): + def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2, + dpi=300, lang="eng", scale=15, debug=None): self.method = 'ocr' self.table_area = table_area self.mtol = mtol + self.blocksize = blocksize + self.threshold_constant = threshold_constant self.tool = pyocr.get_available_tools()[0] # fix this self.dpi = dpi self.lang = lang @@ -75,7 +88,8 @@ class OCR: subprocess.call(gs_call, stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT) - img, threshold = adaptive_threshold(imagename) + img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, + c=self.threshold_constant) vmask, v_segments = find_lines(threshold, direction='vertical', scale=self.scale) hmask, h_segments = find_lines(threshold, direction='horizontal', diff --git a/camelot/utils.py b/camelot/utils.py index a1610cb..d128740 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -570,14 +570,17 @@ def get_score(error_weights): score : float """ SCORE_VAL = 100 - score = 0 - if sum([ew[0] for ew in error_weights]) != SCORE_VAL: - raise ValueError("Please assign a valid weightage to each parameter" - " such that their sum is equal to 100") - for ew in error_weights: - weight = ew[0] / len(ew[1]) - for error_percentage in ew[1]: - score += weight * (1 - error_percentage) + try: + score = 0 + if sum([ew[0] for ew in 
error_weights]) != SCORE_VAL: + raise ValueError("Please assign a valid weightage to each parameter" + " such that their sum is equal to 100") + for ew in error_weights: + weight = ew[0] / len(ew[1]) + for error_percentage in ew[1]: + score += weight * (1 - error_percentage) + except ZeroDivisionError: + score = 0 return score diff --git a/debug/threshold.py b/debug/threshold.py new file mode 100644 index 0000000..ea716b2 --- /dev/null +++ b/debug/threshold.py @@ -0,0 +1,41 @@ +""" +usage: python threshold.py file.png blocksize threshold_constant + +shows thresholded image. +""" + +import sys +import time + +import cv2 +import numpy as np +import matplotlib.pyplot as plt + + +def timeit(func): + def timed(*args, **kw): + start = time.time() + result = func(*args, **kw) + end = time.time() + print 'Function: %r took: %2.4f seconds' % (func.__name__, end - start) + return result + return timed + + +@timeit +def main(): + img = cv2.imread(sys.argv[1]) + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + blocksize = int(sys.argv[2]) + threshold_constant = float(sys.argv[3]) + threshold = cv2.adaptiveThreshold(np.invert(gray), 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blocksize, threshold_constant) + plt.imshow(threshold) + plt.show() + + +if __name__ == '__main__': + if len(sys.argv) < 4: + print __doc__ + else: + main() \ No newline at end of file diff --git a/tools/camelot b/tools/camelot index d282e96..53e3878 100755 --- a/tools/camelot +++ b/tools/camelot @@ -71,6 +71,8 @@ options: -H, --header
Specify header for each table. -m, --mtol Tolerance to account for when merging lines which are very close. [default: 2] + -b, --blocksize See adaptive threshold doc. [default: 15] + -c, --constant See adaptive threshold doc. [default: -2] -s, --scale Scaling factor. Large scaling factor leads to smaller lines being detected. [default: 15] -i, --invert Invert pdf image to make sure that lines are @@ -109,16 +111,18 @@ usage: camelot ocr [-t ] [-m ] [options] [--] options: - -t, --tarea Specific table areas to analyze. - -m, --mtol Tolerance to account for when merging lines - which are very close. [default: 2] - -D, --dpi Dots per inch, specify image quality to be used for OCR. - [default: 300] - -l, --lang Specify language to be used for OCR. [default: eng] - -s, --scale Scaling factor. Large scaling factor leads to - smaller lines being detected. [default: 15] - -d, --debug Debug by visualizing pdf geometry. - (contour,line,joint,table) Example: -d table + -t, --tarea Specific table areas to analyze. + -m, --mtol Tolerance to account for when merging lines + which are very close. [default: 2] + -b, --blocksize See adaptive threshold doc. [default: 15] + -c, --constant See adaptive threshold doc. [default: -2] + -D, --dpi Dots per inch, specify image quality to be used for OCR. + [default: 300] + -l, --lang Specify language to be used for OCR. [default: eng] + -s, --scale Scaling factor. Large scaling factor leads to + smaller lines being detected. [default: 15] + -d, --debug Debug by visualizing pdf geometry. 
+ (contour,line,joint,table) Example: -d table """ @@ -374,20 +378,23 @@ if __name__ == '__main__': float(args['--wmargin'])) if args[''] == 'lattice': try: - tarea = args['--tarea'] if args['--tarea'] else None - fill = args['--fill'] if args['--fill'] else None - header = args['--header'] if args['--header'] else None - mtol = [int(m) for m in args['--mtol']] - shift_text = list(args['--shift_text']) if args['--shift_text'] else ['l', 't'] - manager = Pdf(Lattice(table_area=tarea, fill=fill, headers=header, - mtol=mtol, scale=int(args['--scale']), - invert=args['--invert'], margins=margins, - split_text=args['--split_text'], flag_size=args['--flag_size'], - shift_text=shift_text, debug=args['--debug']), - filename, - pagenos=p, - parallel=args['--parallel'], - clean=True) + kwargs = { + 'table_area': args['--tarea'] if args['--tarea'] else None, + 'fill': args['--fill'] if args['--fill'] else None, + 'headers': args['--header'] if args['--header'] else None, + 'mtol': [int(m) for m in args['--mtol']], + 'blocksize': int(args['--blocksize']), + 'threshold_constant': float(args['--constant']), + 'scale': int(args['--scale']), + 'invert': args['--invert'], + 'margins': margins, + 'split_text': args['--split_text'], + 'flag_size': args['--flag_size'], + 'shift_text': list(args['--shift_text']) if args['--shift_text'] else ['l', 't'], + 'debug': args['--debug'] + } + manager = Pdf(Lattice(**kwargs), filename, pagenos=p, clean=True, + parallel=args['--parallel']) data = manager.extract() processing_time = time.time() - start_time @@ -441,19 +448,19 @@ if __name__ == '__main__': sys.exit() elif args[''] == 'stream': try: - tarea = args['--tarea'] if args['--tarea'] else None - columns = args['--columns'] if args['--columns'] else None - header = args['--header'] if args['--header'] else None - ytol = [int(y) for y in args['--ytol']] - mtol = [int(m) for m in args['--mtol']] - manager = Pdf(Stream(table_area=tarea, columns=columns, - headers=header, ytol=ytol, mtol=mtol, 
- margins=margins, split_text=args['--split_text'], - flag_size=args['--flag_size'], debug=args['--debug']), - filename, - pagenos=p, - parallel=args['--parallel'], - clean=True) + kwargs = { + 'table_area': args['--tarea'] if args['--tarea'] else None, + 'columns': args['--columns'] if args['--columns'] else None, + 'headers': args['--header'] if args['--header'] else None, + 'ytol': [int(y) for y in args['--ytol']], + 'mtol': [int(m) for m in args['--mtol']], + 'margins': margins, + 'split_text': args['--split_text'], + 'flag_size': args['--flag_size'], + 'debug': args['--debug'] + } + manager = Pdf(Stream(**kwargs), filename, pagenos=p, clean=True, + parallel=args['--parallel']) data = manager.extract() processing_time = time.time() - start_time @@ -506,15 +513,18 @@ if __name__ == '__main__': sys.exit() elif args[''] == 'ocr': try: - tarea = args['--tarea'] if args['--tarea'] else None - mtol = [int(m) for m in args['--mtol']] - manager = Pdf(OCR(table_area=tarea, mtol=mtol, dpi=int(args['--dpi']), - lang=args['--lang'], scale=int(args['--scale']), - debug=args['--debug']), - filename, - pagenos=p, - parallel=args['--parallel'], - clean=True) + kwargs = { + 'table_area': args['--tarea'] if args['--tarea'] else None, + 'mtol': [int(m) for m in args['--mtol']], + 'blocksize': int(args['--blocksize']), + 'threshold_constant': float(args['--constant']), + 'dpi': int(args['--dpi']), + 'lang': args['--lang'], + 'scale': int(args['--scale']), + 'debug': args['--debug'] + } + manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True, + parallel=args['--parallel']) data = manager.extract() processing_time = time.time() - start_time