From 72c42c74db16e17c8cfcdda6959a54e0d94097f3 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Sat, 1 Sep 2018 16:23:54 +0530 Subject: [PATCH] Remove ocr --- camelot/__init__.py | 2 +- camelot/ocr.py | 331 -------------------------------------------- tools/camelot | 186 ------------------------- 3 files changed, 1 insertion(+), 518 deletions(-) delete mode 100644 camelot/ocr.py diff --git a/camelot/__init__.py b/camelot/__init__.py index 55aee51..b9b5f18 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -1,3 +1,3 @@ __version__ = '1.2.0' -__all__ = ['pdf', 'lattice', 'stream', 'ocr'] +__all__ = ['pdf', 'lattice', 'stream'] diff --git a/camelot/ocr.py b/camelot/ocr.py deleted file mode 100644 index 48d1983..0000000 --- a/camelot/ocr.py +++ /dev/null @@ -1,331 +0,0 @@ -import os -import copy -import logging -import subprocess - -import pyocr -from PIL import Image - -from .table import Table -from .imgproc import (adaptive_threshold, find_lines, find_table_contours, - find_table_joints, remove_lines, find_cuts) -from .utils import merge_close_values, encode_list - - -__all__ = ['OCRLattice', 'OCRStream'] -logger = logging.getLogger('app_logger') - - -class OCRLattice: - """Lattice, but for images. - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - mtol : list - List of ints specifying m-tolerance parameters. - (optional, default: [2]) - - blocksize : int - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - (optional, default: 15) - - threshold_constant : float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. - (optional, default: -2) - - dpi : int - Dots per inch. - (optional, default: 300) - - layout : int - Tesseract page segmentation mode. - (optional, default: 7) - - lang : string - Language to be used for OCR. - (optional, default: 'eng') - - scale : int - Used to divide the height/width of a pdf to get a structuring - element for image processing. - (optional, default: 15) - - iterations : int - Number of iterations for dilation. - (optional, default: 0) - - debug : string - {'contour', 'line', 'joint', 'table'} - Set to one of the above values to generate a matplotlib plot - of detected contours, lines, joints and the table generated. - (optional, default: None) - """ - def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2, - dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None): - - self.method = 'ocrl' - self.table_area = table_area - self.mtol = mtol - self.blocksize = blocksize - self.threshold_constant = threshold_constant - self.tool = pyocr.get_available_tools()[0] # fix this - self.dpi = dpi - self.layout = layout - self.lang = lang - self.scale = scale - self.iterations = iterations - self.debug = debug - - def get_tables(self, pdfname): - if self.tool is None: - return None - - bname, __ = os.path.splitext(pdfname) - imagename = ''.join([bname, '.png']) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - - gs_call = [ - "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), - pdfname - ] - if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): - gs_call.insert(0, "gs") - else: - gs_call.insert(0, "gsc") - subprocess.call(gs_call, stdout=open(os.devnull, 'w'), - stderr=subprocess.STDOUT) - - img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, - c=self.threshold_constant) - vmask, v_segments = find_lines(threshold, direction='vertical', - scale=self.scale, iterations=self.iterations) - hmask, h_segments = find_lines(threshold, direction='horizontal', - scale=self.scale, iterations=self.iterations) - - if self.table_area is not None: - areas = [] - for area in self.table_area: - x1, y1, x2, y2 = area.split(",") - x1 = int(float(x1)) - y1 = int(float(y1)) - x2 = int(float(x2)) - y2 = int(float(y2)) - areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1))) - table_bbox = find_table_joints(areas, vmask, hmask) - else: - contours = find_table_contours(vmask, hmask) - table_bbox = find_table_joints(contours, vmask, hmask) - - if self.debug: - self.debug_images = (img, table_bbox) - self.debug_segments = (v_segments, h_segments) - self.debug_tables = [] - - if len(self.mtol) == 1 and self.mtol[0] == 2: - mtolerance = copy.deepcopy(self.mtol) * len(table_bbox) - else: - mtolerance = copy.deepcopy(self.mtol) - - page = {} - tables = {} - table_no = 0 - for k in sorted(table_bbox.keys(), key=lambda x: x[1]): - table_data = {} - cols, rows = zip(*table_bbox[k]) - cols, rows = list(cols), list(rows) - cols.extend([k[0], k[2]]) - rows.extend([k[1], k[3]]) - cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no]) - rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no]) - cols = [(cols[i], cols[i + 1]) - for i in range(0, len(cols) - 1)] - rows = [(rows[i], rows[i + 1]) - for i in range(0, len(rows) - 1)] - table = Table(cols, rows) - if self.debug: - self.debug_tables.append(table) - table.image = img[k[3]:k[1],k[0]:k[2]] - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - x1 = int(table.cells[i][j].x1) - y1 = int(table.cells[i][j].y1) - x2 = int(table.cells[i][j].x2) - y2 = int(table.cells[i][j].y2) - table.cells[i][j].image = img[y1:y2,x1:x2] - text = self.tool.image_to_string( - Image.fromarray(table.cells[i][j].image), - lang=self.lang, - builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout) - ) - table.cells[i][j].add_text(text) - ar = table.get_list() - ar.reverse() - ar = encode_list(ar) - table_data['data'] = ar - tables['table-{0}'.format(table_no + 1)] = table_data - table_no += 1 - page[os.path.basename(bname)] = tables - - if self.debug: - return None - - return page - - -class OCRStream: - """Stream, but for images. - - Parameters - ---------- - table_area : list - List of strings of the form x1,y1,x2,y2 where - (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's - coordinate space, denoting table areas to analyze. - (optional, default: None) - - columns : list - List of strings where each string is comma-separated values of - x-coordinates in OpenCV's coordinate space. - (optional, default: None) - - blocksize : int - Size of a pixel neighborhood that is used to calculate a - threshold value for the pixel: 3, 5, 7, and so on. - (optional, default: 15) - - threshold_constant : float - Constant subtracted from the mean or weighted mean - (see the details below). Normally, it is positive but may be - zero or negative as well. - (optional, default: -2) - - dpi : int - Dots per inch. - (optional, default: 300) - - layout : int - Tesseract page segmentation mode. - (optional, default: 7) - - lang : string - Language to be used for OCR. - (optional, default: 'eng') - - line_scale : int - Line scaling factor. - (optional, default: 15) - - char_scale : int - Char scaling factor. - (optional, default: 200) - """ - def __init__(self, table_area=None, columns=None, blocksize=15, - threshold_constant=-2, dpi=300, layout=7, lang="eng", - line_scale=15, char_scale=200, debug=False): - - self.method = 'ocrs' - self.table_area = table_area - self.columns = columns - self.blocksize = blocksize - self.threshold_constant = threshold_constant - self.tool = pyocr.get_available_tools()[0] # fix this - self.dpi = dpi - self.layout = layout - self.lang = lang - self.line_scale = line_scale - self.char_scale = char_scale - self.debug = debug - - def get_tables(self, pdfname): - if self.tool is None: - return None - - bname, __ = os.path.splitext(pdfname) - imagename = ''.join([bname, '.png']) - logger.info('Processing {0}.'.format(os.path.basename(bname))) - - gs_call = [ - "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi), - pdfname - ] - if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower(): - gs_call.insert(0, "gs") - else: - gs_call.insert(0, "gsc") - subprocess.call(gs_call, stdout=open(os.devnull, 'w'), - stderr=subprocess.STDOUT) - - img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, - c=self.threshold_constant) - threshold = remove_lines(threshold, line_scale=self.line_scale) - height, width = threshold.shape - if self.debug: - self.debug_images = img - return None - - if self.table_area is not None: - if self.columns is not None: - if len(self.table_area) != len(self.columns): - raise ValueError("{0}: Length of table area and columns" - " should be equal.".format(os.path.basename(bname))) - - table_bbox = {} - for area in self.table_area: - x1, y1, x2, y2 = area.split(",") - x1 = int(float(x1)) - y1 = int(float(y1)) - x2 = int(float(x2)) - y2 = int(float(y2)) - table_bbox[(x1, y1, x2, y2)] = None - else: - table_bbox = {(0, 0, width, height): None} - - page = {} - tables = {} - table_no = 0 - for k in sorted(table_bbox.keys(), key=lambda x: x[1]): - if self.columns is None: - raise NotImplementedError - else: - table_data = {} - table_image = threshold[k[1]:k[3],k[0]:k[2]] - cols = self.columns[table_no].split(',') - cols = [float(c) for c in cols] - cols.insert(0, k[0]) - cols.append(k[2]) - cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)] - y_cuts = find_cuts(table_image, char_scale=self.char_scale) - rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)] - table = Table(cols, rows) - for i in range(len(table.cells)): - for j in range(len(table.cells[i])): - x1 = int(table.cells[i][j].x1) - y1 = int(table.cells[i][j].y1) - x2 = int(table.cells[i][j].x2) - y2 = int(table.cells[i][j].y2) - table.cells[i][j].image = table_image[y1:y2,x1:x2] - cell_image = Image.fromarray(table.cells[i][j].image) - text = self.tool.image_to_string( - cell_image, - lang=self.lang, - builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout) - ) - table.cells[i][j].add_text(text) - ar = table.get_list() - ar.reverse() - ar = encode_list(ar) - table_data['data'] = ar - tables['table-{0}'.format(table_no + 1)] = table_data - table_no += 1 - page[os.path.basename(bname)] = tables - - return page \ No newline at end of file diff --git a/tools/camelot b/tools/camelot index 0918fe1..61a687f 100755 --- a/tools/camelot +++ b/tools/camelot @@ -18,7 +18,6 @@ from PyPDF2 import PdfFileReader from camelot.pdf import Pdf from camelot.lattice import Lattice from camelot.stream import Stream -from camelot.ocr import OCRLattice, OCRStream from camelot import utils @@ -54,8 +53,6 @@ options: camelot methods: lattice Looks for lines between data. stream Looks for spaces between data. - ocrl Lattice, but for images. - ocrs Stream, but for images. See 'camelot -h' for more information on a specific method. """ @@ -107,51 +104,6 @@ options: """ -ocrl_doc = """ -Lattice, but for images. - -usage: - camelot ocrl [-t ...] [-m ...] [options] [--] - -options: - -t, --tarea Specific table areas to analyze. - -m, --mtol Tolerance to account for when merging lines - which are very close. [default: 2] - -b, --blocksize See adaptive threshold doc. [default: 15] - -C, --constant See adaptive threshold doc. [default: -2] - -D, --dpi Dots per inch, specify image quality to be used for OCR. - [default: 300] - -g, --layout Tesseract page segmentation mode. [default: 7] - -l, --lang Specify language to be used for OCR. [default: eng] - -s, --scale Scaling factor. Large scaling factor leads to - smaller lines being detected. [default: 15] - -I, --iterations Number of iterations for dilation. [default: 0] - -d, --debug Debug by visualizing pdf geometry. - (contour,line,joint,table) Example: -d table -""" - -ocrs_doc = """ -Stream, but for images. - -usage: - camelot ocrs [-t ...] [-c ...] [options] [--] - -options: - -t, --tarea Specific table areas to analyze. - -c, --columns Comma-separated list of column x-coordinates. - Example: -c 10.1,20.2,30.3 - -b, --blocksize See adaptive threshold doc. [default: 15] - -C, --constant See adaptive threshold doc. [default: -2] - -D, --dpi Dots per inch, specify image quality to be used for OCR. - [default: 300] - -g, --layout Tesseract page segmentation mode. [default: 7] - -l, --lang Specify language to be used for OCR. [default: eng] - -G, --line-scale Line scaling factor. [default: 15] - -S, --char-scale Char scaling factor. [default: 200] - -d, --debug Debug by visualizing image. -""" - - def plot_table_barchart(r, c, p, pno, tno): row_idx = [i + 1 for i, row in enumerate(r)] col_idx = [i + 1 for i, col in enumerate(c)] @@ -376,10 +328,6 @@ if __name__ == '__main__': args.update(docopt(lattice_doc, argv=argv)) elif args[''] == 'stream': args.update(docopt(stream_doc, argv=argv)) - elif args[''] == 'ocrl': - args.update(docopt(ocrl_doc, argv=argv)) - elif args[''] == 'ocrs': - args.update(docopt(ocrs_doc, argv=argv)) filename = args[''] filedir = os.path.dirname(args['']) @@ -551,140 +499,6 @@ if __name__ == '__main__': except Exception as e: logger.exception(e.message, exc_info=True) sys.exit() - elif args[''] == 'ocrl': - try: - kwargs = { - 'table_area': args['--tarea'] if args['--tarea'] else None, - 'mtol': [int(m) for m in args['--mtol']], - 'blocksize': int(args['--blocksize']), - 'threshold_constant': float(args['--constant']), - 'dpi': int(args['--dpi']), - 'layout': int(args['--layout']), - 'lang': args['--lang'], - 'scale': int(args['--scale']), - 'iterations': int(args['--iterations']), - 'debug': args['--debug'] - } - manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True, - parallel=args['--parallel']) - data = manager.extract() - - processing_time = time.time() - start_time - logger.info("Finished processing in " + str(processing_time) + " seconds") - - if args['--plot']: - if args['--output']: - pngname = os.path.join(args['--output'], os.path.basename(pngname)) - plot_type = args['--plot'].split(',') - if 'page' in plot_type: - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - plot_table_barchart(table['r_nempty_cells'], - table['c_nempty_cells'], - table['empty_p'], - page_number, - table_number) - - if 'all' in plot_type: - plot_all_barchart(data, pngname) - - if 'rc' in plot_type: - plot_rc_piechart(data, pngname) - - if args['--print-stats']: - print_stats(data, processing_time) - - if args['--save-stats']: - if args['--output']: - scorename = os.path.join(args['--output'], os.path.basename(scorename)) - with open(scorename, 'w') as score_file: - score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n') - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( - ''.join([page_number, '_', table_number]), - table['nrows'], - table['ncols'], - table['empty_p'], - table['line_p'], - table['text_p'], - table['score'])) - if args['--debug']: - manager.debug_plot() - except Exception as e: - logger.exception(e.message, exc_info=True) - sys.exit() - elif args[''] == 'ocrs': - try: - kwargs = { - 'table_area': args['--tarea'] if args['--tarea'] else None, - 'columns': args['--columns'] if args['--columns'] else None, - 'blocksize': int(args['--blocksize']), - 'threshold_constant': float(args['--constant']), - 'dpi': int(args['--dpi']), - 'layout': int(args['--layout']), - 'lang': args['--lang'], - 'line_scale': int(args['--line-scale']), - 'char_scale': int(args['--char-scale']), - 'debug': args['--debug'] - } - manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True, - parallel=args['--parallel']) - data = manager.extract() - - processing_time = time.time() - start_time - logger.info("Finished processing in " + str(processing_time) + " seconds") - - if args['--plot']: - if args['--output']: - pngname = os.path.join(args['--output'], os.path.basename(pngname)) - plot_type = args['--plot'].split(',') - if 'page' in plot_type: - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - plot_table_barchart(table['r_nempty_cells'], - table['c_nempty_cells'], - table['empty_p'], - page_number, - table_number) - - if 'all' in plot_type: - plot_all_barchart(data, pngname) - - if 'rc' in plot_type: - plot_rc_piechart(data, pngname) - - if args['--print-stats']: - print_stats(data, processing_time) - - if args['--save-stats']: - if args['--output']: - scorename = os.path.join(args['--output'], os.path.basename(scorename)) - with open(scorename, 'w') as score_file: - score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n') - for page_number in sorted(data.keys(), key=lambda x: int(x[5:])): - page = data[page_number] - for table_number in sorted(page.keys(), key=lambda x: int(x[6:])): - table = page[table_number] - score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format( - ''.join([page_number, '_', table_number]), - table['nrows'], - table['ncols'], - table['empty_p'], - table['line_p'], - table['text_p'], - table['score'])) - if args['--debug']: - manager.debug_plot() - except Exception as e: - logger.exception(e.message, exc_info=True) - sys.exit() if args.get('--debug') is not None and args['--debug']: print("See 'camelot -h' for various parameters you can tweak.")