Remove ocr

pull/2/head
Vinayak Mehta 2018-09-01 16:23:54 +05:30
parent 9753889ea2
commit 72c42c74db
3 changed files with 1 additions and 518 deletions

View File

@ -1,3 +1,3 @@
__version__ = '1.2.0' __version__ = '1.2.0'
__all__ = ['pdf', 'lattice', 'stream', 'ocr'] __all__ = ['pdf', 'lattice', 'stream']

View File

@ -1,331 +0,0 @@
import os
import copy
import logging
import subprocess
import pyocr
from PIL import Image
from .table import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints, remove_lines, find_cuts)
from .utils import merge_close_values, encode_list
# Names exported by a star-import of this module.
__all__ = ['OCRLattice', 'OCRStream']
# Module-level logger bound to the logger named 'app_logger'.
logger = logging.getLogger('app_logger')
class OCRLattice:
    """Lattice, but for images.

    Rasterizes the input PDF to a PNG with Ghostscript, locates tables
    from the vertical/horizontal line masks of the thresholded image,
    builds a cell grid for every table and runs OCR on each cell.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    mtol : list
        List of ints specifying m-tolerance parameters.
        ``None`` is treated as ``[2]``.
        (optional, default: [2])

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        (optional, default: -2)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    layout : int
        Tesseract page segmentation mode.
        (optional, default: 7)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    scale : int
        Used to divide the height/width of a pdf to get a structuring
        element for image processing.
        (optional, default: 15)

    iterations : int
        Number of iterations for dilation.
        (optional, default: 0)

    debug : string
        {'contour', 'line', 'joint', 'table'}
        Set to one of the above values to generate a matplotlib plot
        of detected contours, lines, joints and the table generated.
        (optional, default: None)
    """
    def __init__(self, table_area=None, mtol=None, blocksize=15, threshold_constant=-2,
                 dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):
        self.method = 'ocrl'
        self.table_area = table_area
        # A fresh list per instance; a literal list default would be
        # shared between every instance created without `mtol`.
        self.mtol = [2] if mtol is None else mtol
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        # Use the first OCR backend pyocr reports; None when nothing is
        # installed so that get_tables() can bail out instead of this
        # constructor raising IndexError.
        available_tools = pyocr.get_available_tools()
        self.tool = available_tools[0] if available_tools else None
        self.dpi = dpi
        self.layout = layout
        self.lang = lang
        self.scale = scale
        self.iterations = iterations
        self.debug = debug

    def _render_png(self, pdfname, imagename):
        """Rasterize `pdfname` into `imagename` by calling Ghostscript."""
        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
            pdfname
        ]
        version = subprocess.check_output(["gs", "-version"])
        # check_output returns bytes on Python 3; normalize to text so the
        # substring test below works on both major versions.
        if isinstance(version, bytes):
            version = version.decode('utf-8', 'replace')
        if "ghostscript" in version.lower():
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the devnull handle once Ghostscript has finished.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)

    def _ocr_cell(self, cell, source):
        """Crop `cell` out of `source`, OCR the crop and store the text on the cell."""
        x1, y1 = int(cell.x1), int(cell.y1)
        x2, y2 = int(cell.x2), int(cell.y2)
        cell.image = source[y1:y2, x1:x2]
        text = self.tool.image_to_string(
            Image.fromarray(cell.image),
            lang=self.lang,
            builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
        )
        cell.add_text(text)

    def get_tables(self, pdfname):
        """Extract tables from a single-page PDF using OCR.

        Parameters
        ----------
        pdfname : string
            Path to the PDF. A PNG with the same base name is written
            next to it.

        Returns
        -------
        page : dict or None
            ``{basename: {'table-N': {'data': rows}}}``. None is returned
            when no OCR tool is available or when `debug` is set (in
            which case the debug_* attributes are populated instead).
        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])
        logger.info('Processing {0}.'.format(os.path.basename(bname)))
        self._render_png(pdfname, imagename)

        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
                                            c=self.threshold_constant)
        vmask, v_segments = find_lines(threshold, direction='vertical',
                                       scale=self.scale, iterations=self.iterations)
        hmask, h_segments = find_lines(threshold, direction='horizontal',
                                       scale=self.scale, iterations=self.iterations)

        if self.table_area is not None:
            areas = []
            for area in self.table_area:
                x1, y1, x2, y2 = [int(float(v)) for v in area.split(",")]
                # find_table_joints is handed (x, y, width, height) tuples.
                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
            table_bbox = find_table_joints(areas, vmask, hmask)
        else:
            contours = find_table_contours(vmask, hmask)
            table_bbox = find_table_joints(contours, vmask, hmask)

        if self.debug:
            self.debug_images = (img, table_bbox)
            self.debug_segments = (v_segments, h_segments)
            self.debug_tables = []

        # Only the default tolerance is broadcast to every table; any
        # other value is used as given, one entry per table.
        # NOTE(review): a custom mtol shorter than the number of detected
        # tables raises IndexError below - confirm this is intended.
        if len(self.mtol) == 1 and self.mtol[0] == 2:
            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
        else:
            mtolerance = copy.deepcopy(self.mtol)

        page = {}
        tables = {}
        # Tables are numbered in order of the second component of their key.
        ordered_keys = sorted(table_bbox.keys(), key=lambda x: x[1])
        for table_no, k in enumerate(ordered_keys):
            cols, rows = zip(*table_bbox[k])
            cols, rows = list(cols), list(rows)
            # Add the bounding-box edges to the joint coordinates.
            cols.extend([k[0], k[2]])
            rows.extend([k[1], k[3]])
            cols = merge_close_values(sorted(cols), mtol=mtolerance[table_no])
            rows = merge_close_values(sorted(rows, reverse=True),
                                      mtol=mtolerance[table_no])
            # Consecutive coordinates delimit one column / one row each.
            cols = list(zip(cols, cols[1:]))
            rows = list(zip(rows, rows[1:]))

            table = Table(cols, rows)
            if self.debug:
                self.debug_tables.append(table)
            table.image = img[k[3]:k[1], k[0]:k[2]]
            for cell_row in table.cells:
                for cell in cell_row:
                    self._ocr_cell(cell, img)

            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            tables['table-{0}'.format(table_no + 1)] = {'data': ar}
        page[os.path.basename(bname)] = tables

        if self.debug:
            return None
        return page
class OCRStream:
    """Stream, but for images.

    Rasterizes the input PDF to a PNG with Ghostscript, strips lines
    from the thresholded image, splits each table area into columns
    (user supplied) and rows (from find_cuts) and runs OCR on each cell.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

    columns : list
        List of strings where each string is comma-separated values of
        x-coordinates in OpenCV's coordinate space.
        (optional, default: None)

    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

    threshold_constant : float
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.
        (optional, default: -2)

    dpi : int
        Dots per inch.
        (optional, default: 300)

    layout : int
        Tesseract page segmentation mode.
        (optional, default: 7)

    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')

    line_scale : int
        Line scaling factor.
        (optional, default: 15)

    char_scale : int
        Char scaling factor.
        (optional, default: 200)

    debug : bool
        When truthy, get_tables() only stores the page image in
        `debug_images` and returns None.
        (optional, default: False)
    """
    def __init__(self, table_area=None, columns=None, blocksize=15,
                 threshold_constant=-2, dpi=300, layout=7, lang="eng",
                 line_scale=15, char_scale=200, debug=False):
        self.method = 'ocrs'
        self.table_area = table_area
        self.columns = columns
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        # Use the first OCR backend pyocr reports; None when nothing is
        # installed so that get_tables() can bail out instead of this
        # constructor raising IndexError.
        available_tools = pyocr.get_available_tools()
        self.tool = available_tools[0] if available_tools else None
        self.dpi = dpi
        self.layout = layout
        self.lang = lang
        self.line_scale = line_scale
        self.char_scale = char_scale
        self.debug = debug

    def _render_png(self, pdfname, imagename):
        """Rasterize `pdfname` into `imagename` by calling Ghostscript."""
        gs_call = [
            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
            pdfname
        ]
        version = subprocess.check_output(["gs", "-version"])
        # check_output returns bytes on Python 3; normalize to text so the
        # substring test below works on both major versions.
        if isinstance(version, bytes):
            version = version.decode('utf-8', 'replace')
        if "ghostscript" in version.lower():
            gs_call.insert(0, "gs")
        else:
            gs_call.insert(0, "gsc")
        # Close the devnull handle once Ghostscript has finished.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(gs_call, stdout=devnull, stderr=subprocess.STDOUT)

    def _ocr_cell(self, cell, source):
        """Crop `cell` out of `source`, OCR the crop and store the text on the cell."""
        x1, y1 = int(cell.x1), int(cell.y1)
        x2, y2 = int(cell.x2), int(cell.y2)
        cell.image = source[y1:y2, x1:x2]
        text = self.tool.image_to_string(
            Image.fromarray(cell.image),
            lang=self.lang,
            builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
        )
        cell.add_text(text)

    def get_tables(self, pdfname):
        """Extract tables from a single-page PDF using OCR.

        Parameters
        ----------
        pdfname : string
            Path to the PDF. A PNG with the same base name is written
            next to it.

        Returns
        -------
        page : dict or None
            ``{basename: {'table-N': {'data': rows}}}``. None is returned
            when no OCR tool is available or when `debug` is set.

        Raises
        ------
        ValueError
            If both `table_area` and `columns` are given but differ in
            length.
        NotImplementedError
            If a table has to be processed and `columns` is None.
        """
        if self.tool is None:
            return None

        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])
        logger.info('Processing {0}.'.format(os.path.basename(bname)))
        self._render_png(pdfname, imagename)

        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
                                            c=self.threshold_constant)
        threshold = remove_lines(threshold, line_scale=self.line_scale)
        height, width = threshold.shape

        if self.debug:
            self.debug_images = img
            return None

        if self.table_area is not None:
            if self.columns is not None and len(self.table_area) != len(self.columns):
                raise ValueError("{0}: Length of table area and columns"
                                 " should be equal.".format(os.path.basename(bname)))
            table_bbox = {}
            for area in self.table_area:
                x1, y1, x2, y2 = [int(float(v)) for v in area.split(",")]
                table_bbox[(x1, y1, x2, y2)] = None
        else:
            # No areas given: treat the whole page image as one table.
            table_bbox = {(0, 0, width, height): None}

        page = {}
        tables = {}
        # Tables are numbered in order of their y1 coordinate.
        ordered_keys = sorted(table_bbox.keys(), key=lambda x: x[1])
        for table_no, k in enumerate(ordered_keys):
            if self.columns is None:
                # Column detection is not available; separators must be supplied.
                raise NotImplementedError

            table_image = threshold[k[1]:k[3], k[0]:k[2]]
            # Column edges: the area's left/right bounds plus the user
            # separators, shifted so they are relative to the cropped image.
            edges = [k[0]]
            edges.extend(float(c) for c in self.columns[table_no].split(','))
            edges.append(k[2])
            cols = [(left - k[0], right - k[0])
                    for left, right in zip(edges, edges[1:])]

            y_cuts = find_cuts(table_image, char_scale=self.char_scale)
            rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(len(y_cuts) - 1)]

            table = Table(cols, rows)
            for cell_row in table.cells:
                for cell in cell_row:
                    self._ocr_cell(cell, table_image)

            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
            tables['table-{0}'.format(table_no + 1)] = {'data': ar}
        page[os.path.basename(bname)] = tables

        return page

View File

@ -18,7 +18,6 @@ from PyPDF2 import PdfFileReader
from camelot.pdf import Pdf from camelot.pdf import Pdf
from camelot.lattice import Lattice from camelot.lattice import Lattice
from camelot.stream import Stream from camelot.stream import Stream
from camelot.ocr import OCRLattice, OCRStream
from camelot import utils from camelot import utils
@ -54,8 +53,6 @@ options:
camelot methods: camelot methods:
lattice Looks for lines between data. lattice Looks for lines between data.
stream Looks for spaces between data. stream Looks for spaces between data.
ocrl Lattice, but for images.
ocrs Stream, but for images.
See 'camelot <method> -h' for more information on a specific method. See 'camelot <method> -h' for more information on a specific method.
""" """
@ -107,51 +104,6 @@ options:
""" """
# docopt usage text for the 'ocrl' method; it is handed to
# docopt(ocrl_doc, argv=argv) when <method> is 'ocrl'.
ocrl_doc = """
Lattice, but for images.
usage:
camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-m, --mtol <mtol> Tolerance to account for when merging lines
which are very close. [default: 2]
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-C, --constant <constant> See adaptive threshold doc. [default: -2]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300]
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
-s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15]
-I, --iterations <iterations> Number of iterations for dilation. [default: 0]
-d, --debug <debug> Debug by visualizing pdf geometry.
(contour,line,joint,table) Example: -d table
"""
# docopt usage text for the 'ocrs' method; it is handed to
# docopt(ocrs_doc, argv=argv) when <method> is 'ocrs'.
ocrs_doc = """
Stream, but for images.
usage:
camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
options:
-t, --tarea <tarea> Specific table areas to analyze.
-c, --columns <columns> Comma-separated list of column x-coordinates.
Example: -c 10.1,20.2,30.3
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-C, --constant <constant> See adaptive threshold doc. [default: -2]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300]
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
-G, --line-scale <line_scale> Line scaling factor. [default: 15]
-S, --char-scale <char_scale> Char scaling factor. [default: 200]
-d, --debug Debug by visualizing image.
"""
def plot_table_barchart(r, c, p, pno, tno): def plot_table_barchart(r, c, p, pno, tno):
row_idx = [i + 1 for i, row in enumerate(r)] row_idx = [i + 1 for i, row in enumerate(r)]
col_idx = [i + 1 for i, col in enumerate(c)] col_idx = [i + 1 for i, col in enumerate(c)]
@ -376,10 +328,6 @@ if __name__ == '__main__':
args.update(docopt(lattice_doc, argv=argv)) args.update(docopt(lattice_doc, argv=argv))
elif args['<method>'] == 'stream': elif args['<method>'] == 'stream':
args.update(docopt(stream_doc, argv=argv)) args.update(docopt(stream_doc, argv=argv))
elif args['<method>'] == 'ocrl':
args.update(docopt(ocrl_doc, argv=argv))
elif args['<method>'] == 'ocrs':
args.update(docopt(ocrs_doc, argv=argv))
filename = args['<file>'] filename = args['<file>']
filedir = os.path.dirname(args['<file>']) filedir = os.path.dirname(args['<file>'])
@ -551,140 +499,6 @@ if __name__ == '__main__':
except Exception as e: except Exception as e:
logger.exception(e.message, exc_info=True) logger.exception(e.message, exc_info=True)
sys.exit() sys.exit()
elif args['<method>'] == 'ocrl':
try:
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'mtol': [int(m) for m in args['--mtol']],
'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']),
'dpi': int(args['--dpi']),
'layout': int(args['--layout']),
'lang': args['--lang'],
'scale': int(args['--scale']),
'iterations': int(args['--iterations']),
'debug': args['--debug']
}
manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract()
processing_time = time.time() - start_time
logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--print-stats']:
print_stats(data, processing_time)
if args['--save-stats']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['line_p'],
table['text_p'],
table['score']))
if args['--debug']:
manager.debug_plot()
except Exception as e:
logger.exception(e.message, exc_info=True)
sys.exit()
elif args['<method>'] == 'ocrs':
try:
kwargs = {
'table_area': args['--tarea'] if args['--tarea'] else None,
'columns': args['--columns'] if args['--columns'] else None,
'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']),
'dpi': int(args['--dpi']),
'layout': int(args['--layout']),
'lang': args['--lang'],
'line_scale': int(args['--line-scale']),
'char_scale': int(args['--char-scale']),
'debug': args['--debug']
}
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
parallel=args['--parallel'])
data = manager.extract()
processing_time = time.time() - start_time
logger.info("Finished processing in " + str(processing_time) + " seconds")
if args['--plot']:
if args['--output']:
pngname = os.path.join(args['--output'], os.path.basename(pngname))
plot_type = args['--plot'].split(',')
if 'page' in plot_type:
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
plot_table_barchart(table['r_nempty_cells'],
table['c_nempty_cells'],
table['empty_p'],
page_number,
table_number)
if 'all' in plot_type:
plot_all_barchart(data, pngname)
if 'rc' in plot_type:
plot_rc_piechart(data, pngname)
if args['--print-stats']:
print_stats(data, processing_time)
if args['--save-stats']:
if args['--output']:
scorename = os.path.join(args['--output'], os.path.basename(scorename))
with open(scorename, 'w') as score_file:
score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
page = data[page_number]
for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
table = page[table_number]
score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
''.join([page_number, '_', table_number]),
table['nrows'],
table['ncols'],
table['empty_p'],
table['line_p'],
table['text_p'],
table['score']))
if args['--debug']:
manager.debug_plot()
except Exception as e:
logger.exception(e.message, exc_info=True)
sys.exit()
if args.get('--debug') is not None and args['--debug']: if args.get('--debug') is not None and args['--debug']:
print("See 'camelot <method> -h' for various parameters you can tweak.") print("See 'camelot <method> -h' for various parameters you can tweak.")