Remove ocr

2018-09-01 16:23:54 +05:30 · 2018-09-01 16:23:54 +05:30 · 72c42c74db
parent 9753889ea2
commit 72c42c74db
3 changed files with 1 addition and 518 deletions
--- a/camelot/__init__.py
+++ b/camelot/__init__.py
@@ -1,3 +1,3 @@
 __version__ = '1.2.0'

-__all__ = ['pdf', 'lattice', 'stream', 'ocr']
+__all__ = ['pdf', 'lattice', 'stream']
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@@ -1,331 +0,0 @@
-import os
-import copy
-import logging
-import subprocess
-
-import pyocr
-from PIL import Image
-
-from .table import Table
-from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
-                      find_table_joints, remove_lines, find_cuts)
-from .utils import merge_close_values, encode_list
-
-
-__all__ = ['OCRLattice', 'OCRStream']
-logger = logging.getLogger('app_logger')
-
-
-class OCRLattice:
-    """Lattice, but for images.
-
-    Parameters
-    ----------
-    table_area : list
-        List of strings of the form x1,y1,x2,y2 where
-        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
-        coordinate space, denoting table areas to analyze.
-        (optional, default: None)
-
-    mtol : list
-        List of ints specifying m-tolerance parameters.
-        (optional, default: [2])
-
-    blocksize : int
-        Size of a pixel neighborhood that is used to calculate a
-        threshold value for the pixel: 3, 5, 7, and so on.
-        (optional, default: 15)
-
-    threshold_constant : float
-        Constant subtracted from the mean or weighted mean
-        (see the details below). Normally, it is positive but may be
-        zero or negative as well.
-        (optional, default: -2)
-
-    dpi : int
-        Dots per inch.
-        (optional, default: 300)
-
-    layout : int
-        Tesseract page segmentation mode.
-        (optional, default: 7)
-
-    lang : string
-        Language to be used for OCR.
-        (optional, default: 'eng')
-
-    scale : int
-        Used to divide the height/width of a pdf to get a structuring
-        element for image processing.
-        (optional, default: 15)
-
-    iterations : int
-        Number of iterations for dilation.
-        (optional, default: 0)
-
-    debug : string
-        {'contour', 'line', 'joint', 'table'}
-        Set to one of the above values to generate a matplotlib plot
-        of detected contours, lines, joints and the table generated.
-        (optional, default: None)
-    """
-    def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
-                 dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):
-
-        self.method = 'ocrl'
-        self.table_area = table_area
-        self.mtol = mtol
-        self.blocksize = blocksize
-        self.threshold_constant = threshold_constant
-        self.tool = pyocr.get_available_tools()[0] # fix this
-        self.dpi = dpi
-        self.layout = layout
-        self.lang = lang
-        self.scale = scale
-        self.iterations = iterations
-        self.debug = debug
-
-    def get_tables(self, pdfname):
-        if self.tool is None:
-            return None
-
-        bname, __ = os.path.splitext(pdfname)
-        imagename = ''.join([bname, '.png'])
-        logger.info('Processing {0}.'.format(os.path.basename(bname)))
-
-        gs_call = [
-            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
-            pdfname
-        ]
-        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
-            gs_call.insert(0, "gs")
-        else:
-            gs_call.insert(0, "gsc")
-        subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
-            stderr=subprocess.STDOUT)
-
-        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
-            c=self.threshold_constant)
-        vmask, v_segments = find_lines(threshold, direction='vertical',
-            scale=self.scale, iterations=self.iterations)
-        hmask, h_segments = find_lines(threshold, direction='horizontal',
-            scale=self.scale, iterations=self.iterations)
-
-        if self.table_area is not None:
-            areas = []
-            for area in self.table_area:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = int(float(x1))
-                y1 = int(float(y1))
-                x2 = int(float(x2))
-                y2 = int(float(y2))
-                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
-            table_bbox = find_table_joints(areas, vmask, hmask)
-        else:
-            contours = find_table_contours(vmask, hmask)
-            table_bbox = find_table_joints(contours, vmask, hmask)
-
-        if self.debug:
-            self.debug_images = (img, table_bbox)
-            self.debug_segments = (v_segments, h_segments)
-            self.debug_tables = []
-
-        if len(self.mtol) == 1 and self.mtol[0] == 2:
-            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
-        else:
-            mtolerance = copy.deepcopy(self.mtol)
-
-        page = {}
-        tables = {}
-        table_no = 0
-        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
-            table_data = {}
-            cols, rows = zip(*table_bbox[k])
-            cols, rows = list(cols), list(rows)
-            cols.extend([k[0], k[2]])
-            rows.extend([k[1], k[3]])
-            cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
-            rows = merge_close_values(sorted(rows, reverse=True), mtol=mtolerance[table_no])
-            cols = [(cols[i], cols[i + 1])
-                    for i in range(0, len(cols) - 1)]
-            rows = [(rows[i], rows[i + 1])
-                    for i in range(0, len(rows) - 1)]
-            table = Table(cols, rows)
-            if self.debug:
-                self.debug_tables.append(table)
-            table.image = img[k[3]:k[1],k[0]:k[2]]
-            for i in range(len(table.cells)):
-                for j in range(len(table.cells[i])):
-                    x1 = int(table.cells[i][j].x1)
-                    y1 = int(table.cells[i][j].y1)
-                    x2 = int(table.cells[i][j].x2)
-                    y2 = int(table.cells[i][j].y2)
-                    table.cells[i][j].image = img[y1:y2,x1:x2]
-                    text = self.tool.image_to_string(
-                        Image.fromarray(table.cells[i][j].image),
-                        lang=self.lang,
-                        builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
-                    )
-                    table.cells[i][j].add_text(text)
-            ar = table.get_list()
-            ar.reverse()
-            ar = encode_list(ar)
-            table_data['data'] = ar
-            tables['table-{0}'.format(table_no + 1)] = table_data
-            table_no += 1
-        page[os.path.basename(bname)] = tables
-
-        if self.debug:
-            return None
-
-        return page
-
-
-class OCRStream:
-    """Stream, but for images.
-
-    Parameters
-    ----------
-    table_area : list
-        List of strings of the form x1,y1,x2,y2 where
-        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
-        coordinate space, denoting table areas to analyze.
-        (optional, default: None)
-
-    columns : list
-        List of strings where each string is comma-separated values of
-        x-coordinates in OpenCV's coordinate space.
-        (optional, default: None)
-
-    blocksize : int
-        Size of a pixel neighborhood that is used to calculate a
-        threshold value for the pixel: 3, 5, 7, and so on.
-        (optional, default: 15)
-
-    threshold_constant : float
-        Constant subtracted from the mean or weighted mean
-        (see the details below). Normally, it is positive but may be
-        zero or negative as well.
-        (optional, default: -2)
-
-    dpi : int
-        Dots per inch.
-        (optional, default: 300)
-
-    layout : int
-        Tesseract page segmentation mode.
-        (optional, default: 7)
-
-    lang : string
-        Language to be used for OCR.
-        (optional, default: 'eng')
-
-    line_scale : int
-        Line scaling factor.
-        (optional, default: 15)
-
-    char_scale : int
-        Char scaling factor.
-        (optional, default: 200)
-    """
-    def __init__(self, table_area=None, columns=None, blocksize=15,
-                 threshold_constant=-2, dpi=300, layout=7, lang="eng",
-                 line_scale=15, char_scale=200, debug=False):
-
-        self.method = 'ocrs'
-        self.table_area = table_area
-        self.columns = columns
-        self.blocksize = blocksize
-        self.threshold_constant = threshold_constant
-        self.tool = pyocr.get_available_tools()[0] # fix this
-        self.dpi = dpi
-        self.layout = layout
-        self.lang = lang
-        self.line_scale = line_scale
-        self.char_scale = char_scale
-        self.debug = debug
-
-    def get_tables(self, pdfname):
-        if self.tool is None:
-            return None
-
-        bname, __ = os.path.splitext(pdfname)
-        imagename = ''.join([bname, '.png'])
-        logger.info('Processing {0}.'.format(os.path.basename(bname)))
-
-        gs_call = [
-            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
-            pdfname
-        ]
-        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
-            gs_call.insert(0, "gs")
-        else:
-            gs_call.insert(0, "gsc")
-        subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
-            stderr=subprocess.STDOUT)
-
-        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
-            c=self.threshold_constant)
-        threshold = remove_lines(threshold, line_scale=self.line_scale)
-        height, width = threshold.shape
-        if self.debug:
-            self.debug_images = img
-            return None
-
-        if self.table_area is not None:
-            if self.columns is not None:
-                if len(self.table_area) != len(self.columns):
-                    raise ValueError("{0}: Length of table area and columns"
-                                     " should be equal.".format(os.path.basename(bname)))
-
-            table_bbox = {}
-            for area in self.table_area:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = int(float(x1))
-                y1 = int(float(y1))
-                x2 = int(float(x2))
-                y2 = int(float(y2))
-                table_bbox[(x1, y1, x2, y2)] = None
-        else:
-            table_bbox = {(0, 0, width, height): None}
-
-        page = {}
-        tables = {}
-        table_no = 0
-        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
-            if self.columns is None:
-                raise NotImplementedError
-            else:
-                table_data = {}
-                table_image = threshold[k[1]:k[3],k[0]:k[2]]
-                cols = self.columns[table_no].split(',')
-                cols = [float(c) for c in cols]
-                cols.insert(0, k[0])
-                cols.append(k[2])
-                cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
-                y_cuts = find_cuts(table_image, char_scale=self.char_scale)
-                rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
-                table = Table(cols, rows)
-                for i in range(len(table.cells)):
-                    for j in range(len(table.cells[i])):
-                        x1 = int(table.cells[i][j].x1)
-                        y1 = int(table.cells[i][j].y1)
-                        x2 = int(table.cells[i][j].x2)
-                        y2 = int(table.cells[i][j].y2)
-                        table.cells[i][j].image = table_image[y1:y2,x1:x2]
-                        cell_image = Image.fromarray(table.cells[i][j].image)
-                        text = self.tool.image_to_string(
-                            cell_image,
-                            lang=self.lang,
-                            builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
-                        )
-                        table.cells[i][j].add_text(text)
-                ar = table.get_list()
-                ar.reverse()
-                ar = encode_list(ar)
-                table_data['data'] = ar
-                tables['table-{0}'.format(table_no + 1)] = table_data
-                table_no += 1
-        page[os.path.basename(bname)] = tables
-
-        return page
--- a/tools/camelot
+++ b/tools/camelot
@@ -18,7 +18,6 @@ from PyPDF2 import PdfFileReader
 from camelot.pdf import Pdf
 from camelot.lattice import Lattice
 from camelot.stream import Stream
-from camelot.ocr import OCRLattice, OCRStream
 from camelot import utils


@@ -54,8 +53,6 @@ options:
 camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.
- ocrl     Lattice, but for images.
- ocrs     Stream, but for images.

 See 'camelot <method> -h' for more information on a specific method.
 """
@@ -107,51 +104,6 @@ options:
 """


-ocrl_doc = """
-Lattice, but for images.
-
-usage:
- camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
-
-options:
- -t, --tarea <tarea>            Specific table areas to analyze.
- -m, --mtol <mtol>              Tolerance to account for when merging lines
-                                which are very close. [default: 2]
- -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
- -C, --constant <constant>      See adaptive threshold doc. [default: -2]
- -D, --dpi <dpi>                Dots per inch, specify image quality to be used for OCR.
-                                [default: 300]
- -g, --layout <layout>          Tesseract page segmentation mode. [default: 7]
- -l, --lang <lang>              Specify language to be used for OCR. [default: eng]
- -s, --scale <scale>            Scaling factor. Large scaling factor leads to
-                                smaller lines being detected. [default: 15]
- -I, --iterations <iterations>  Number of iterations for dilation. [default: 0]
- -d, --debug <debug>            Debug by visualizing pdf geometry.
-                                (contour,line,joint,table) Example: -d table
-"""
-
-ocrs_doc = """
-Stream, but for images.
-
-usage:
- camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
-
-options:
- -t, --tarea <tarea>                    Specific table areas to analyze.
- -c, --columns <columns>                Comma-separated list of column x-coordinates.
-                                        Example: -c 10.1,20.2,30.3
- -b, --blocksize <blocksize>            See adaptive threshold doc. [default: 15]
- -C, --constant <constant>              See adaptive threshold doc. [default: -2]
- -D, --dpi <dpi>                        Dots per inch, specify image quality to be used for OCR.
-                                        [default: 300]
- -g, --layout <layout>                  Tesseract page segmentation mode. [default: 7]
- -l, --lang <lang>                      Specify language to be used for OCR. [default: eng]
- -G, --line-scale <line_scale>          Line scaling factor. [default: 15]
- -S, --char-scale <char_scale>          Char scaling factor. [default: 200]
- -d, --debug                            Debug by visualizing image.
-"""
-
-
 def plot_table_barchart(r, c, p, pno, tno):
    row_idx = [i + 1 for i, row in enumerate(r)]
    col_idx = [i + 1 for i, col in enumerate(c)]
@@ -376,10 +328,6 @@ if __name__ == '__main__':
        args.update(docopt(lattice_doc, argv=argv))
    elif args['<method>'] == 'stream':
        args.update(docopt(stream_doc, argv=argv))
-    elif args['<method>'] == 'ocrl':
-        args.update(docopt(ocrl_doc, argv=argv))
-    elif args['<method>'] == 'ocrs':
-        args.update(docopt(ocrs_doc, argv=argv))

    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
@@ -551,140 +499,6 @@ if __name__ == '__main__':
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()
-    elif args['<method>'] == 'ocrl':
-        try:
-            kwargs = {
-                'table_area': args['--tarea'] if args['--tarea'] else None,
-                'mtol': [int(m) for m in args['--mtol']],
-                'blocksize': int(args['--blocksize']),
-                'threshold_constant': float(args['--constant']),
-                'dpi': int(args['--dpi']),
-                'layout': int(args['--layout']),
-                'lang': args['--lang'],
-                'scale': int(args['--scale']),
-                'iterations': int(args['--iterations']),
-                'debug': args['--debug']
-            }
-            manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
-                          parallel=args['--parallel'])
-            data = manager.extract()
-
-            processing_time = time.time() - start_time
-            logger.info("Finished processing in " + str(processing_time) + " seconds")
-
-            if args['--plot']:
-                if args['--output']:
-                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
-                plot_type = args['--plot'].split(',')
-                if 'page' in plot_type:
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            plot_table_barchart(table['r_nempty_cells'],
-                                table['c_nempty_cells'],
-                                table['empty_p'],
-                                page_number,
-                                table_number)
-
-                if 'all' in plot_type:
-                    plot_all_barchart(data, pngname)
-
-                if 'rc' in plot_type:
-                    plot_rc_piechart(data, pngname)
-
-            if args['--print-stats']:
-                print_stats(data, processing_time)
-
-            if args['--save-stats']:
-                if args['--output']:
-                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
-                with open(scorename, 'w') as score_file:
-                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
-                                ''.join([page_number, '_', table_number]),
-                                table['nrows'],
-                                table['ncols'],
-                                table['empty_p'],
-                                table['line_p'],
-                                table['text_p'],
-                                table['score']))
-            if args['--debug']:
-                manager.debug_plot()
-        except Exception as e:
-            logger.exception(e.message, exc_info=True)
-            sys.exit()
-    elif args['<method>'] == 'ocrs':
-        try:
-            kwargs = {
-                'table_area': args['--tarea'] if args['--tarea'] else None,
-                'columns': args['--columns'] if args['--columns'] else None,
-                'blocksize': int(args['--blocksize']),
-                'threshold_constant': float(args['--constant']),
-                'dpi': int(args['--dpi']),
-                'layout': int(args['--layout']),
-                'lang': args['--lang'],
-                'line_scale': int(args['--line-scale']),
-                'char_scale': int(args['--char-scale']),
-                'debug': args['--debug']
-            }
-            manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
-                          parallel=args['--parallel'])
-            data = manager.extract()
-
-            processing_time = time.time() - start_time
-            logger.info("Finished processing in " + str(processing_time) + " seconds")
-
-            if args['--plot']:
-                if args['--output']:
-                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
-                plot_type = args['--plot'].split(',')
-                if 'page' in plot_type:
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            plot_table_barchart(table['r_nempty_cells'],
-                                table['c_nempty_cells'],
-                                table['empty_p'],
-                                page_number,
-                                table_number)
-
-                if 'all' in plot_type:
-                    plot_all_barchart(data, pngname)
-
-                if 'rc' in plot_type:
-                    plot_rc_piechart(data, pngname)
-
-            if args['--print-stats']:
-                print_stats(data, processing_time)
-
-            if args['--save-stats']:
-                if args['--output']:
-                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
-                with open(scorename, 'w') as score_file:
-                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
-                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
-                        page = data[page_number]
-                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
-                            table = page[table_number]
-                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
-                                ''.join([page_number, '_', table_number]),
-                                table['nrows'],
-                                table['ncols'],
-                                table['empty_p'],
-                                table['line_p'],
-                                table['text_p'],
-                                table['score']))
-            if args['--debug']:
-                manager.debug_plot()
-        except Exception as e:
-            logger.exception(e.message, exc_info=True)
-            sys.exit()

    if args.get('--debug') is not None and args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")