diff --git a/camelot/imgproc.py b/camelot/imgproc.py index b0a00ba..1621bea 100644 --- a/camelot/imgproc.py +++ b/camelot/imgproc.py @@ -4,6 +4,8 @@ from operator import itemgetter import cv2 import numpy as np +from .utils import merge_tuples + def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): """Thresholds an image using OpenCV's adaptiveThreshold. @@ -199,30 +201,72 @@ def find_table_joints(contours, vertical, horizontal): return tables -def find_cuts(threshold, line_threshold=100): - """find_cuts +def remove_lines(threshold, line_scale=15): + """Removes lines from a thresholded image. Parameters ---------- threshold : object numpy.ndarray representing the thresholded image. - line_threshold : int - Maximum intensity of projections on y-axis. - (optional, default: 100) + line_scale : int + Line scaling factor. + (optional, default: 15) + + Returns + ------- + threshold : object + numpy.ndarray representing the thresholded image + with horizontal and vertical lines removed. + """ + size = threshold.shape[0] // line_scale + vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) + horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1)) + dilate_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10)) + + vertical = cv2.erode(threshold, vertical_erode_el) + vertical = cv2.dilate(vertical, dilate_el) + + horizontal = cv2.erode(threshold, horizontal_erode_el) + horizontal = cv2.dilate(horizontal, dilate_el) + + threshold = np.bitwise_and(threshold, np.invert(vertical)) + threshold = np.bitwise_and(threshold, np.invert(horizontal)) + return threshold + + +def find_cuts(threshold, char_scale=200): + """Finds cuts made by text projections on y-axis. + + Parameters + ---------- + threshold : object + numpy.ndarray representing the thresholded image. + + char_scale : int + Char scaling factor. + (optional, default: 200) Returns ------- y_cuts : list List of cuts on y-axis. """ - y_proj = np.sum(threshold, axis=1) - y_proj_less = np.where(y_proj < line_threshold)[0] - ranges = [] - for k, g in groupby(enumerate(y_proj_less), lambda (i, x): i-x): - group = map(itemgetter(1), g) - ranges.append((group[0], group[-1])) - y_cuts = [] - for r in ranges: - y_cuts.append((r[0] + r[1]) / 2) + size = threshold.shape[0] // char_scale + char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size)) + + threshold = cv2.erode(threshold, char_el) + threshold = cv2.dilate(threshold, char_el) + + try: + __, contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + except ValueError: + contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + + contours = [cv2.boundingRect(c) for c in contours] + y_cuts = [(c[1], c[1] + c[3]) for c in contours] + y_cuts = list(merge_tuples(sorted(y_cuts))) + y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) / 2 for i in range(1, len(y_cuts))] return sorted(y_cuts, reverse=True) \ No newline at end of file diff --git a/camelot/ocr.py b/camelot/ocr.py index 7aa2948..48d1983 100644 --- a/camelot/ocr.py +++ b/camelot/ocr.py @@ -8,7 +8,7 @@ from PIL import Image from .table import Table from .imgproc import (adaptive_threshold, find_lines, find_table_contours, - find_table_joints, find_cuts) + find_table_joints, remove_lines, find_cuts) from .utils import merge_close_values, encode_list @@ -46,6 +46,10 @@ class OCRLattice: Dots per inch. (optional, default: 300) + layout : int + Tesseract page segmentation mode. + (optional, default: 7) + lang : string Language to be used for OCR. (optional, default: 'eng') @@ -66,7 +70,7 @@ class OCRLattice: (optional, default: None) """ def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2, - dpi=300, lang="eng", scale=15, iterations=0, debug=None): + dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None): self.method = 'ocrl' self.table_area = table_area @@ -75,6 +79,7 @@ class OCRLattice: self.threshold_constant = threshold_constant self.tool = pyocr.get_available_tools()[0] # fix this self.dpi = dpi + self.layout = layout self.lang = lang self.scale = scale self.iterations = iterations @@ -159,7 +164,7 @@ class OCRLattice: text = self.tool.image_to_string( Image.fromarray(table.cells[i][j].image), lang=self.lang, - builder=pyocr.builders.TextBuilder() + builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout) ) table.cells[i][j].add_text(text) ar = table.get_list() @@ -203,31 +208,41 @@ class OCRStream: zero or negative as well. (optional, default: -2) - line_threshold : int - Maximum intensity of projections on y-axis. - (optional, default: 100) - dpi : int Dots per inch. (optional, default: 300) + layout : int + Tesseract page segmentation mode. + (optional, default: 7) + lang : string Language to be used for OCR. (optional, default: 'eng') + + line_scale : int + Line scaling factor. + (optional, default: 15) + + char_scale : int + Char scaling factor. + (optional, default: 200) """ def __init__(self, table_area=None, columns=None, blocksize=15, - threshold_constant=-2, line_threshold=100, dpi=300, lang="eng", - debug=False): + threshold_constant=-2, dpi=300, layout=7, lang="eng", + line_scale=15, char_scale=200, debug=False): self.method = 'ocrs' self.table_area = table_area self.columns = columns self.blocksize = blocksize self.threshold_constant = threshold_constant - self.line_threshold = line_threshold self.tool = pyocr.get_available_tools()[0] # fix this self.dpi = dpi + self.layout = layout self.lang = lang + self.line_scale = line_scale + self.char_scale = char_scale self.debug = debug def get_tables(self, pdfname): @@ -251,6 +266,7 @@ class OCRStream: img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, c=self.threshold_constant) + threshold = remove_lines(threshold, line_scale=self.line_scale) height, width = threshold.shape if self.debug: self.debug_images = img @@ -287,7 +303,7 @@ class OCRStream: cols.insert(0, k[0]) cols.append(k[2]) cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)] - y_cuts = find_cuts(table_image, line_threshold=self.line_threshold) + y_cuts = find_cuts(table_image, char_scale=self.char_scale) rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)] table = Table(cols, rows) for i in range(len(table.cells)): @@ -301,7 +317,7 @@ class OCRStream: text = self.tool.image_to_string( cell_image, lang=self.lang, - builder=pyocr.builders.TextBuilder() + builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout) ) table.cells[i][j].add_text(text) ar = table.get_list() diff --git a/camelot/utils.py b/camelot/utils.py index e209070..3640b37 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -751,4 +751,26 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1, width = layout.bbox[2] height = layout.bbox[3] dim = (width, height) - return layout, dim \ No newline at end of file + return layout, dim + + +def merge_tuples(tuples): + """Merges a list of overlapping tuples. + + Parameters + ---------- + tuples : list + + Returns + ------- + merged : list + """ + merged = list(tuples[0]) + for s, e in tuples: + if s <= merged[1]: + merged[1] = max(merged[1], e) + else: + yield tuple(merged) + merged[0] = s + merged[1] = e + yield tuple(merged) \ No newline at end of file diff --git a/tools/camelot b/tools/camelot index 8e5b236..5eb62bb 100755 --- a/tools/camelot +++ b/tools/camelot @@ -121,6 +121,7 @@ options: -C, --constant See adaptive threshold doc. [default: -2] -D, --dpi Dots per inch, specify image quality to be used for OCR. [default: 300] + -g, --layout Tesseract page segmentation mode. [default: 7] -l, --lang Specify language to be used for OCR. [default: eng] -s, --scale Scaling factor. Large scaling factor leads to smaller lines being detected. [default: 15] @@ -141,11 +142,12 @@ options: Example: -c 10.1,20.2,30.3 -b, --blocksize See adaptive threshold doc. [default: 15] -C, --constant See adaptive threshold doc. [default: -2] - -N, --line-threshold Maximum intensity of projections on y-axis. - [default: 100] -D, --dpi Dots per inch, specify image quality to be used for OCR. [default: 300] + -g, --layout Tesseract page segmentation mode. [default: 7] -l, --lang Specify language to be used for OCR. [default: eng] + -G, --line-scale Line scaling factor. [default: 15] + -S, --char-scale Char scaling factor. [default: 200] -d, --debug Debug by visualizing image. """ @@ -555,6 +557,7 @@ if __name__ == '__main__': 'blocksize': int(args['--blocksize']), 'threshold_constant': float(args['--constant']), 'dpi': int(args['--dpi']), + 'layout': int(args['--layout']), 'lang': args['--lang'], 'scale': int(args['--scale']), 'iterations': int(args['--iterations']), @@ -620,9 +623,11 @@ if __name__ == '__main__': 'columns': args['--columns'] if args['--columns'] else None, 'blocksize': int(args['--blocksize']), 'threshold_constant': float(args['--constant']), - 'line_threshold': int(args['--line-threshold']), 'dpi': int(args['--dpi']), + 'layout': int(args['--layout']), 'lang': args['--lang'], + 'line_scale': int(args['--line-scale']), + 'char_scale': int(args['--char-scale']), 'debug': args['--debug'] } manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,