import os
import copy
import subprocess

import pyocr
from PIL import Image

from .table import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
                      find_table_joints, find_cuts)
from .utils import merge_close_values, encode_list, remove_empty


def _first_ocr_tool():
    """Return the first OCR tool pyocr reports, or None when none is installed."""
    tools = pyocr.get_available_tools()
    return tools[0] if tools else None


def _ghostscript_executable():
    """Return the name of the Ghostscript binary to call ('gs' or 'gsc')."""
    try:
        version = subprocess.check_output(["gs", "-version"])
    except (OSError, subprocess.CalledProcessError):
        # `gs` is missing or failed to run; fall back to the alternative name.
        return "gsc"
    # check_output returns bytes on Python 3, so compare against a bytes
    # literal (which is a plain str on Python 2).
    return "gs" if b"ghostscript" in version.lower() else "gsc"


def _pdf_to_png(pdfname, dpi):
    """Render `pdfname` to a PNG next to it using Ghostscript.

    Returns
    -------
    (bname, imagename) : tuple of str
        `pdfname` without its extension, and the path of the written PNG.
    """
    bname, __ = os.path.splitext(pdfname)
    imagename = ''.join([bname, '.png'])
    gs_call = [
        _ghostscript_executable(),
        "-q", "-sDEVICE=png16m", "-o", imagename,
        "-r{0}".format(dpi), pdfname
    ]
    # Silence Ghostscript and make sure the devnull handle is closed again.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)
    return bname, imagename


def _ocr_cells(table, image, tool, lang):
    """Crop every cell of `table` out of `image` and add the OCR'd text to it."""
    for row in table.cells:
        for cell in row:
            x1, y1 = int(cell.x1), int(cell.y1)
            x2, y2 = int(cell.x2), int(cell.y2)
            cell.image = image[y1:y2, x1:x2]
            text = tool.image_to_string(
                Image.fromarray(cell.image),
                lang=lang,
                builder=pyocr.builders.TextBuilder()
            )
            cell.add_text(text)


def _table_rows(table):
    """Return the table content as a reversed, encoded list with empties removed."""
    ar = table.get_list()
    ar.reverse()
    ar = encode_list(ar)
    return remove_empty(ar)


class OCRLattice:
    """Lattice, but for images.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    mtol : list
        List of ints specifying m-tolerance parameters. A list with a
        single value is applied to every detected table; otherwise the
        list is indexed by table number.
        (optional, default: [2])

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        (optional, default: -2)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    scale : int
        Used to divide the height/width of a pdf to get a structuring
        element for image processing.
        (optional, default: 15)

    iterations : int
        Number of iterations for dilation.
        (optional, default: 0)

    debug : string
        {'contour', 'line', 'joint', 'table'}
        Set to one of the above values to generate a matplotlib plot
        of detected contours, lines, joints and the table generated.
        (optional, default: None)
    """
    def __init__(self, table_area=None, mtol=None, blocksize=15,
                 threshold_constant=-2, dpi=300, lang="eng", scale=15,
                 iterations=0, debug=None):
        self.method = 'ocrl'
        self.table_area = table_area
        # None stands in for the documented default so that no list object
        # is shared between instances.
        self.mtol = [2] if mtol is None else mtol
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        # None when no OCR tool is installed; get_tables() checks for that.
        self.tool = _first_ocr_tool()
        self.dpi = dpi
        self.lang = lang
        self.scale = scale
        self.iterations = iterations
        self.debug = debug

    def get_tables(self, pdfname):
        """Render `pdfname` to an image, detect ruled tables and OCR their cells.

        Parameters
        ----------
        pdfname : string
            Path to the pdf file. A PNG with the same base name is written
            next to it.

        Returns
        -------
        page : dict or None
            {basename: {'table-N': {'data': rows}}}. None is returned when
            no OCR tool is available or when `debug` is set; in debug mode
            the intermediate results are stored on `debug_images`,
            `debug_segments` and `debug_tables` instead.
        """
        if self.tool is None:
            return None

        bname, imagename = _pdf_to_png(pdfname, self.dpi)

        img, threshold = adaptive_threshold(
            imagename, blocksize=self.blocksize, c=self.threshold_constant)
        vmask, v_segments = find_lines(
            threshold, direction='vertical', scale=self.scale,
            iterations=self.iterations)
        hmask, h_segments = find_lines(
            threshold, direction='horizontal', scale=self.scale,
            iterations=self.iterations)

        if self.table_area is not None:
            areas = []
            for area in self.table_area:
                x1, y1, x2, y2 = [int(v) for v in area.split(",")]
                # Stored as (x, y, width, height).
                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            table_bbox = find_table_joints(areas, vmask, hmask)
        else:
            contours = find_table_contours(vmask, hmask)
            table_bbox = find_table_joints(contours, vmask, hmask)

        if self.debug:
            self.debug_images = (img, table_bbox)
            self.debug_segments = (v_segments, h_segments)
            self.debug_tables = []

        # A single tolerance applies to every table; a longer list is
        # indexed per table.
        if len(self.mtol) == 1:
            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
        else:
            mtolerance = copy.deepcopy(self.mtol)

        page = {}
        tables = {}
        ordered_boxes = sorted(table_bbox.keys(), key=lambda x: x[1])
        for table_no, k in enumerate(ordered_boxes):
            cols, rows = zip(*table_bbox[k])
            cols, rows = list(cols), list(rows)
            # Add the bounding box edges to the detected joint coordinates.
            cols.extend([k[0], k[2]])
            rows.extend([k[1], k[3]])
            cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
            rows = merge_close_values(
                sorted(rows, reverse=True), mtol=mtolerance[table_no])
            cols = [(cols[i], cols[i + 1]) for i in range(len(cols) - 1)]
            rows = [(rows[i], rows[i + 1]) for i in range(len(rows) - 1)]

            table = Table(cols, rows)
            if self.debug:
                self.debug_tables.append(table)
            table.image = img[k[3]:k[1], k[0]:k[2]]
            _ocr_cells(table, img, self.tool, self.lang)

            table_data = {'data': _table_rows(table)}
            tables['table-{0}'.format(table_no + 1)] = table_data
        page[os.path.basename(bname)] = tables

        if self.debug:
            return None

        return page


class OCRStream:
    """Stream, but for images.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    columns : list
        List of strings where each string is comma-separated values of
        x-coordinates in OpenCV's coordinate space.
        (optional, default: None)

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        (optional, default: -2)

    line_threshold : int
        Maximum intensity of projections on y-axis.
        (optional, default: 100)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    debug : bool
        When set, get_tables() only stores the rendered image on
        `debug_images` and returns None.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, blocksize=15,
                 threshold_constant=-2, line_threshold=100, dpi=300,
                 lang="eng", debug=False):
        self.method = 'ocrs'
        self.table_area = table_area
        self.columns = columns
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        self.line_threshold = line_threshold
        # None when no OCR tool is installed; get_tables() checks for that.
        self.tool = _first_ocr_tool()
        self.dpi = dpi
        self.lang = lang
        self.debug = debug

    def get_tables(self, pdfname):
        """Render `pdfname` to an image and OCR tables using given columns.

        Parameters
        ----------
        pdfname : string
            Path to the pdf file. A PNG with the same base name is written
            next to it.

        Returns
        -------
        page : dict or None
            {basename: {'table-N': {'data': rows}}}. None is returned when
            no OCR tool is available or when `debug` is set.

        Raises
        ------
        ValueError
            If both `table_area` and `columns` are given but their lengths
            differ.
        NotImplementedError
            If a table has to be processed and `columns` is None.
        """
        if self.tool is None:
            return None

        bname, imagename = _pdf_to_png(pdfname, self.dpi)

        img, threshold = adaptive_threshold(
            imagename, blocksize=self.blocksize, c=self.threshold_constant)
        height, width = threshold.shape
        if self.debug:
            self.debug_images = img
            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
                    raise ValueError("Length of table area and columns should be equal.")

            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = [int(v) for v in area.split(",")]
                table_bbox[(x1, y1, x2, y2)] = None
        else:
            # No area given: treat the whole page as one table.
            table_bbox = {(0, 0, width, height): None}

        page = {}
        tables = {}
        # NOTE(review): columns are looked up by position in this y-sorted
        # iteration, not by the order of `table_area` -- confirm callers
        # pass areas top-to-bottom.
        ordered_boxes = sorted(table_bbox.keys(), key=lambda x: x[1])
        for table_no, k in enumerate(ordered_boxes):
            if self.columns is None:
                raise NotImplementedError

            table_image = threshold[k[1]:k[3], k[0]:k[2]]
            separators = [float(c) for c in self.columns[table_no].split(',')]
            edges = [k[0]] + separators + [k[2]]
            # Column bounds are made relative to the cropped table image.
            cols = [(left - k[0], right - k[0])
                    for left, right in zip(edges, edges[1:])]
            y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
            rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(len(y_cuts) - 1)]

            table = Table(cols, rows)
            _ocr_cells(table, table_image, self.tool, self.lang)

            table_data = {'data': _table_rows(table)}
            tables['table-{0}'.format(table_no + 1)] = table_data
        page[os.path.basename(bname)] = tables

        return page