Add better y-cuts detection

pull/2/head
Vinayak Mehta 2017-04-25 18:44:53 +05:30
parent 76e1d32417
commit e252e476b9
4 changed files with 117 additions and 30 deletions

View File

@ -4,6 +4,8 @@ from operator import itemgetter
import cv2 import cv2
import numpy as np import numpy as np
from .utils import merge_tuples
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2): def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
"""Thresholds an image using OpenCV's adaptiveThreshold. """Thresholds an image using OpenCV's adaptiveThreshold.
@ -199,30 +201,72 @@ def find_table_joints(contours, vertical, horizontal):
return tables return tables
def find_cuts(threshold, line_threshold=100): def remove_lines(threshold, line_scale=15):
"""find_cuts """Removes lines from a thresholded image.
Parameters Parameters
---------- ----------
threshold : object threshold : object
numpy.ndarray representing the thresholded image. numpy.ndarray representing the thresholded image.
line_threshold : int line_scale : int
Maximum intensity of projections on y-axis. Line scaling factor.
(optional, default: 100) (optional, default: 15)
Returns
-------
threshold : object
numpy.ndarray representing the thresholded image
with horizontal and vertical lines removed.
"""
size = threshold.shape[0] // line_scale
vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
dilate_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))
vertical = cv2.erode(threshold, vertical_erode_el)
vertical = cv2.dilate(vertical, dilate_el)
horizontal = cv2.erode(threshold, horizontal_erode_el)
horizontal = cv2.dilate(horizontal, dilate_el)
threshold = np.bitwise_and(threshold, np.invert(vertical))
threshold = np.bitwise_and(threshold, np.invert(horizontal))
return threshold
def find_cuts(threshold, char_scale=200):
"""Finds cuts made by text projections on y-axis.
Parameters
----------
threshold : object
numpy.ndarray representing the thresholded image.
char_scale : int
Char scaling factor.
(optional, default: 200)
Returns Returns
------- -------
y_cuts : list y_cuts : list
List of cuts on y-axis. List of cuts on y-axis.
""" """
y_proj = np.sum(threshold, axis=1) size = threshold.shape[0] // char_scale
y_proj_less = np.where(y_proj < line_threshold)[0] char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
ranges = []
for k, g in groupby(enumerate(y_proj_less), lambda (i, x): i-x): threshold = cv2.erode(threshold, char_el)
group = map(itemgetter(1), g) threshold = cv2.dilate(threshold, char_el)
ranges.append((group[0], group[-1]))
y_cuts = [] try:
for r in ranges: __, contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
y_cuts.append((r[0] + r[1]) / 2) cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
contours = [cv2.boundingRect(c) for c in contours]
y_cuts = [(c[1], c[1] + c[3]) for c in contours]
y_cuts = list(merge_tuples(sorted(y_cuts)))
y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) / 2 for i in range(1, len(y_cuts))]
return sorted(y_cuts, reverse=True) return sorted(y_cuts, reverse=True)

View File

@ -8,7 +8,7 @@ from PIL import Image
from .table import Table from .table import Table
from .imgproc import (adaptive_threshold, find_lines, find_table_contours, from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
find_table_joints, find_cuts) find_table_joints, remove_lines, find_cuts)
from .utils import merge_close_values, encode_list from .utils import merge_close_values, encode_list
@ -46,6 +46,10 @@ class OCRLattice:
Dots per inch. Dots per inch.
(optional, default: 300) (optional, default: 300)
layout : int
Tesseract page segmentation mode.
(optional, default: 7)
lang : string lang : string
Language to be used for OCR. Language to be used for OCR.
(optional, default: 'eng') (optional, default: 'eng')
@ -66,7 +70,7 @@ class OCRLattice:
(optional, default: None) (optional, default: None)
""" """
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2, def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
dpi=300, lang="eng", scale=15, iterations=0, debug=None): dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):
self.method = 'ocrl' self.method = 'ocrl'
self.table_area = table_area self.table_area = table_area
@ -75,6 +79,7 @@ class OCRLattice:
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.tool = pyocr.get_available_tools()[0] # fix this self.tool = pyocr.get_available_tools()[0] # fix this
self.dpi = dpi self.dpi = dpi
self.layout = layout
self.lang = lang self.lang = lang
self.scale = scale self.scale = scale
self.iterations = iterations self.iterations = iterations
@ -159,7 +164,7 @@ class OCRLattice:
text = self.tool.image_to_string( text = self.tool.image_to_string(
Image.fromarray(table.cells[i][j].image), Image.fromarray(table.cells[i][j].image),
lang=self.lang, lang=self.lang,
builder=pyocr.builders.TextBuilder() builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
) )
table.cells[i][j].add_text(text) table.cells[i][j].add_text(text)
ar = table.get_list() ar = table.get_list()
@ -203,31 +208,41 @@ class OCRStream:
zero or negative as well. zero or negative as well.
(optional, default: -2) (optional, default: -2)
line_threshold : int
Maximum intensity of projections on y-axis.
(optional, default: 100)
dpi : int dpi : int
Dots per inch. Dots per inch.
(optional, default: 300) (optional, default: 300)
layout : int
Tesseract page segmentation mode.
(optional, default: 7)
lang : string lang : string
Language to be used for OCR. Language to be used for OCR.
(optional, default: 'eng') (optional, default: 'eng')
line_scale : int
Line scaling factor.
(optional, default: 15)
char_scale : int
Char scaling factor.
(optional, default: 200)
""" """
def __init__(self, table_area=None, columns=None, blocksize=15, def __init__(self, table_area=None, columns=None, blocksize=15,
threshold_constant=-2, line_threshold=100, dpi=300, lang="eng", threshold_constant=-2, dpi=300, layout=7, lang="eng",
debug=False): line_scale=15, char_scale=200, debug=False):
self.method = 'ocrs' self.method = 'ocrs'
self.table_area = table_area self.table_area = table_area
self.columns = columns self.columns = columns
self.blocksize = blocksize self.blocksize = blocksize
self.threshold_constant = threshold_constant self.threshold_constant = threshold_constant
self.line_threshold = line_threshold
self.tool = pyocr.get_available_tools()[0] # fix this self.tool = pyocr.get_available_tools()[0] # fix this
self.dpi = dpi self.dpi = dpi
self.layout = layout
self.lang = lang self.lang = lang
self.line_scale = line_scale
self.char_scale = char_scale
self.debug = debug self.debug = debug
def get_tables(self, pdfname): def get_tables(self, pdfname):
@ -251,6 +266,7 @@ class OCRStream:
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize, img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
c=self.threshold_constant) c=self.threshold_constant)
threshold = remove_lines(threshold, line_scale=self.line_scale)
height, width = threshold.shape height, width = threshold.shape
if self.debug: if self.debug:
self.debug_images = img self.debug_images = img
@ -287,7 +303,7 @@ class OCRStream:
cols.insert(0, k[0]) cols.insert(0, k[0])
cols.append(k[2]) cols.append(k[2])
cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)] cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
y_cuts = find_cuts(table_image, line_threshold=self.line_threshold) y_cuts = find_cuts(table_image, char_scale=self.char_scale)
rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)] rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
table = Table(cols, rows) table = Table(cols, rows)
for i in range(len(table.cells)): for i in range(len(table.cells)):
@ -301,7 +317,7 @@ class OCRStream:
text = self.tool.image_to_string( text = self.tool.image_to_string(
cell_image, cell_image,
lang=self.lang, lang=self.lang,
builder=pyocr.builders.TextBuilder() builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
) )
table.cells[i][j].add_text(text) table.cells[i][j].add_text(text)
ar = table.get_list() ar = table.get_list()

View File

@ -751,4 +751,26 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
width = layout.bbox[2] width = layout.bbox[2]
height = layout.bbox[3] height = layout.bbox[3]
dim = (width, height) dim = (width, height)
return layout, dim return layout, dim
def merge_tuples(tuples):
    """Merges overlapping (start, end) tuples.

    This is a generator: wrap the call in ``list()`` to get a list.

    Parameters
    ----------
    tuples : iterable
        (start, end) pairs sorted by start in ascending order.
        Any iterable is accepted, including an empty one.

    Yields
    ------
    merged : tuple
        (start, end) pairs in which every run of overlapping or
        touching input pairs has been collapsed into a single pair.
        Nothing is yielded for empty input.
    """
    iterator = iter(tuples)
    first = next(iterator, None)
    if first is None:
        # Nothing to merge; indexing tuples[0] here used to raise
        # IndexError when the caller passed an empty list.
        return
    start, end = first
    for s, e in iterator:
        if s <= end:
            # Overlaps (or touches) the current run: extend it. max() keeps
            # the run's end when the new pair is fully contained in it.
            end = max(end, e)
        else:
            # Gap found: emit the finished run and start a new one.
            yield (start, end)
            start, end = s, e
    yield (start, end)

View File

@ -121,6 +121,7 @@ options:
-C, --constant <constant> See adaptive threshold doc. [default: -2] -C, --constant <constant> See adaptive threshold doc. [default: -2]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR. -D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300] [default: 300]
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
-l, --lang <lang> Specify language to be used for OCR. [default: eng] -l, --lang <lang> Specify language to be used for OCR. [default: eng]
-s, --scale <scale> Scaling factor. Large scaling factor leads to -s, --scale <scale> Scaling factor. Large scaling factor leads to
smaller lines being detected. [default: 15] smaller lines being detected. [default: 15]
@ -141,11 +142,12 @@ options:
Example: -c 10.1,20.2,30.3 Example: -c 10.1,20.2,30.3
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15] -b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
-C, --constant <constant> See adaptive threshold doc. [default: -2] -C, --constant <constant> See adaptive threshold doc. [default: -2]
-N, --line-threshold <line_threshold> Maximum intensity of projections on y-axis.
[default: 100]
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR. -D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
[default: 300] [default: 300]
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
-l, --lang <lang> Specify language to be used for OCR. [default: eng] -l, --lang <lang> Specify language to be used for OCR. [default: eng]
-G, --line-scale <line_scale> Line scaling factor. [default: 15]
-S, --char-scale <char_scale> Char scaling factor. [default: 200]
-d, --debug Debug by visualizing image. -d, --debug Debug by visualizing image.
""" """
@ -555,6 +557,7 @@ if __name__ == '__main__':
'blocksize': int(args['--blocksize']), 'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']), 'threshold_constant': float(args['--constant']),
'dpi': int(args['--dpi']), 'dpi': int(args['--dpi']),
'layout': int(args['--layout']),
'lang': args['--lang'], 'lang': args['--lang'],
'scale': int(args['--scale']), 'scale': int(args['--scale']),
'iterations': int(args['--iterations']), 'iterations': int(args['--iterations']),
@ -620,9 +623,11 @@ if __name__ == '__main__':
'columns': args['--columns'] if args['--columns'] else None, 'columns': args['--columns'] if args['--columns'] else None,
'blocksize': int(args['--blocksize']), 'blocksize': int(args['--blocksize']),
'threshold_constant': float(args['--constant']), 'threshold_constant': float(args['--constant']),
'line_threshold': int(args['--line-threshold']),
'dpi': int(args['--dpi']), 'dpi': int(args['--dpi']),
'layout': int(args['--layout']),
'lang': args['--lang'], 'lang': args['--lang'],
'line_scale': int(args['--line-scale']),
'char_scale': int(args['--char-scale']),
'debug': args['--debug'] 'debug': args['--debug']
} }
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True, manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,