Add better y-cuts detection
parent
76e1d32417
commit
e252e476b9
|
|
@ -4,6 +4,8 @@ from operator import itemgetter
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
from .utils import merge_tuples
|
||||||
|
|
||||||
|
|
||||||
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
||||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||||
|
|
@ -199,30 +201,72 @@ def find_table_joints(contours, vertical, horizontal):
|
||||||
return tables
|
return tables
|
||||||
|
|
||||||
|
|
||||||
def find_cuts(threshold, line_threshold=100):
|
def remove_lines(threshold, line_scale=15):
    """Removes lines from a thresholded image.

    Long vertical and horizontal runs are isolated by eroding the image
    with a thin rectangular element, thickened again with a 10x10
    dilation, and then masked out of the image.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

    line_scale : int
        Line scaling factor. The length of the erosion element is the
        image height divided by this factor.
        (optional, default: 15)

    Returns
    -------
    threshold : object
        numpy.ndarray representing the thresholded image
        with horizontal and vertical lines removed.
    """
    # Both element lengths are derived from the image height (shape[0]).
    length = threshold.shape[0] // line_scale
    thicken_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))

    cleaned = threshold
    # (1, length) picks out vertical lines, (length, 1) horizontal ones.
    for ksize in ((1, length), (length, 1)):
        erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, ksize)
        # Lines are always detected on the untouched input image.
        lines = cv2.dilate(cv2.erode(threshold, erode_el), thicken_el)
        cleaned = np.bitwise_and(cleaned, np.invert(lines))
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def find_cuts(threshold, char_scale=200):
    """Finds cuts made by text projections on y-axis.

    The image is opened (eroded, then dilated) with a 1-pixel-wide
    vertical element, the external contours of what remains are reduced
    to their vertical extents, overlapping extents are merged, and a cut
    is placed halfway across every gap between consecutive extents.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

    char_scale : int
        Char scaling factor. The height of the structuring element is
        the image height divided by this factor.
        (optional, default: 200)

    Returns
    -------
    y_cuts : list
        List of cuts on y-axis, sorted in descending order. Empty when
        no contours are found or only one merged extent remains.
    """
    size = threshold.shape[0] // char_scale
    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))

    threshold = cv2.erode(threshold, char_el)
    threshold = cv2.dilate(threshold, char_el)

    # findContours returns (image, contours, hierarchy) in OpenCV 3 and
    # (contours, hierarchy) otherwise; contours is always second from the
    # end. Indexing avoids calling findContours a second time after a
    # failed unpack (older releases are documented to modify the input
    # image, so the retry could see altered data -- TODO confirm for the
    # OpenCV versions this project supports).
    contours = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
    if not contours:
        # Nothing survived the opening, so there is nothing to cut between.
        return []

    # boundingRect gives (x, y, w, h); keep only the vertical extent.
    boxes = [cv2.boundingRect(c) for c in contours]
    y_cuts = [(box[1], box[1] + box[3]) for box in boxes]
    y_cuts = list(merge_tuples(sorted(y_cuts)))
    # Midpoint between the end of one extent and the start of the next.
    y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) / 2 for i in range(1, len(y_cuts))]
    return sorted(y_cuts, reverse=True)
|
||||||
|
|
@ -8,7 +8,7 @@ from PIL import Image
|
||||||
|
|
||||||
from .table import Table
|
from .table import Table
|
||||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||||
find_table_joints, find_cuts)
|
find_table_joints, remove_lines, find_cuts)
|
||||||
from .utils import merge_close_values, encode_list
|
from .utils import merge_close_values, encode_list
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -46,6 +46,10 @@ class OCRLattice:
|
||||||
Dots per inch.
|
Dots per inch.
|
||||||
(optional, default: 300)
|
(optional, default: 300)
|
||||||
|
|
||||||
|
layout : int
|
||||||
|
Tesseract page segmentation mode.
|
||||||
|
(optional, default: 7)
|
||||||
|
|
||||||
lang : string
|
lang : string
|
||||||
Language to be used for OCR.
|
Language to be used for OCR.
|
||||||
(optional, default: 'eng')
|
(optional, default: 'eng')
|
||||||
|
|
@ -66,7 +70,7 @@ class OCRLattice:
|
||||||
(optional, default: None)
|
(optional, default: None)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
||||||
dpi=300, lang="eng", scale=15, iterations=0, debug=None):
|
dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):
|
||||||
|
|
||||||
self.method = 'ocrl'
|
self.method = 'ocrl'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
|
|
@ -75,6 +79,7 @@ class OCRLattice:
|
||||||
self.threshold_constant = threshold_constant
|
self.threshold_constant = threshold_constant
|
||||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
self.tool = pyocr.get_available_tools()[0] # fix this
|
||||||
self.dpi = dpi
|
self.dpi = dpi
|
||||||
|
self.layout = layout
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.iterations = iterations
|
self.iterations = iterations
|
||||||
|
|
@ -159,7 +164,7 @@ class OCRLattice:
|
||||||
text = self.tool.image_to_string(
|
text = self.tool.image_to_string(
|
||||||
Image.fromarray(table.cells[i][j].image),
|
Image.fromarray(table.cells[i][j].image),
|
||||||
lang=self.lang,
|
lang=self.lang,
|
||||||
builder=pyocr.builders.TextBuilder()
|
builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
|
||||||
)
|
)
|
||||||
table.cells[i][j].add_text(text)
|
table.cells[i][j].add_text(text)
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
|
|
@ -203,31 +208,41 @@ class OCRStream:
|
||||||
zero or negative as well.
|
zero or negative as well.
|
||||||
(optional, default: -2)
|
(optional, default: -2)
|
||||||
|
|
||||||
line_threshold : int
|
|
||||||
Maximum intensity of projections on y-axis.
|
|
||||||
(optional, default: 100)
|
|
||||||
|
|
||||||
dpi : int
|
dpi : int
|
||||||
Dots per inch.
|
Dots per inch.
|
||||||
(optional, default: 300)
|
(optional, default: 300)
|
||||||
|
|
||||||
|
layout : int
|
||||||
|
Tesseract page segmentation mode.
|
||||||
|
(optional, default: 7)
|
||||||
|
|
||||||
lang : string
|
lang : string
|
||||||
Language to be used for OCR.
|
Language to be used for OCR.
|
||||||
(optional, default: 'eng')
|
(optional, default: 'eng')
|
||||||
|
|
||||||
|
line_scale : int
|
||||||
|
Line scaling factor.
|
||||||
|
(optional, default: 15)
|
||||||
|
|
||||||
|
char_scale : int
|
||||||
|
Char scaling factor.
|
||||||
|
(optional, default: 200)
|
||||||
"""
|
"""
|
||||||
def __init__(self, table_area=None, columns=None, blocksize=15,
|
def __init__(self, table_area=None, columns=None, blocksize=15,
|
||||||
threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
|
threshold_constant=-2, dpi=300, layout=7, lang="eng",
|
||||||
debug=False):
|
line_scale=15, char_scale=200, debug=False):
|
||||||
|
|
||||||
self.method = 'ocrs'
|
self.method = 'ocrs'
|
||||||
self.table_area = table_area
|
self.table_area = table_area
|
||||||
self.columns = columns
|
self.columns = columns
|
||||||
self.blocksize = blocksize
|
self.blocksize = blocksize
|
||||||
self.threshold_constant = threshold_constant
|
self.threshold_constant = threshold_constant
|
||||||
self.line_threshold = line_threshold
|
|
||||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
self.tool = pyocr.get_available_tools()[0] # fix this
|
||||||
self.dpi = dpi
|
self.dpi = dpi
|
||||||
|
self.layout = layout
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
|
self.line_scale = line_scale
|
||||||
|
self.char_scale = char_scale
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
|
||||||
def get_tables(self, pdfname):
|
def get_tables(self, pdfname):
|
||||||
|
|
@ -251,6 +266,7 @@ class OCRStream:
|
||||||
|
|
||||||
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
||||||
c=self.threshold_constant)
|
c=self.threshold_constant)
|
||||||
|
threshold = remove_lines(threshold, line_scale=self.line_scale)
|
||||||
height, width = threshold.shape
|
height, width = threshold.shape
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.debug_images = img
|
self.debug_images = img
|
||||||
|
|
@ -287,7 +303,7 @@ class OCRStream:
|
||||||
cols.insert(0, k[0])
|
cols.insert(0, k[0])
|
||||||
cols.append(k[2])
|
cols.append(k[2])
|
||||||
cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
|
cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
|
||||||
y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
|
y_cuts = find_cuts(table_image, char_scale=self.char_scale)
|
||||||
rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
|
rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
|
||||||
table = Table(cols, rows)
|
table = Table(cols, rows)
|
||||||
for i in range(len(table.cells)):
|
for i in range(len(table.cells)):
|
||||||
|
|
@ -301,7 +317,7 @@ class OCRStream:
|
||||||
text = self.tool.image_to_string(
|
text = self.tool.image_to_string(
|
||||||
cell_image,
|
cell_image,
|
||||||
lang=self.lang,
|
lang=self.lang,
|
||||||
builder=pyocr.builders.TextBuilder()
|
builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
|
||||||
)
|
)
|
||||||
table.cells[i][j].add_text(text)
|
table.cells[i][j].add_text(text)
|
||||||
ar = table.get_list()
|
ar = table.get_list()
|
||||||
|
|
|
||||||
|
|
@ -752,3 +752,25 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
||||||
height = layout.bbox[3]
|
height = layout.bbox[3]
|
||||||
dim = (width, height)
|
dim = (width, height)
|
||||||
return layout, dim
|
return layout, dim
|
||||||
|
|
||||||
|
|
||||||
|
def merge_tuples(tuples):
    """Merges a list of overlapping tuples.

    Parameters
    ----------
    tuples : list
        Iterable of (start, end) tuples. The single forward pass only
        merges correctly when the tuples are ordered by start.

    Returns
    -------
    merged : list
        Generator yielding the merged (start, end) tuples in order.
        Yields nothing when the input is empty.
    """
    pending = iter(tuples)
    try:
        start, end = next(pending)
    except StopIteration:
        # Empty input: nothing to merge, and no IndexError for the caller.
        return
    for s, e in pending:
        if s <= end:
            # Overlaps (or touches) the current run: extend it.
            end = max(end, e)
        else:
            yield (start, end)
            start, end = s, e
    yield (start, end)
|
||||||
|
|
@ -121,6 +121,7 @@ options:
|
||||||
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||||
[default: 300]
|
[default: 300]
|
||||||
|
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
|
||||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||||
smaller lines being detected. [default: 15]
|
smaller lines being detected. [default: 15]
|
||||||
|
|
@ -141,11 +142,12 @@ options:
|
||||||
Example: -c 10.1,20.2,30.3
|
Example: -c 10.1,20.2,30.3
|
||||||
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
||||||
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||||
-N, --line-threshold <line_threshold> Maximum intensity of projections on y-axis.
|
|
||||||
[default: 100]
|
|
||||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||||
[default: 300]
|
[default: 300]
|
||||||
|
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
|
||||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||||
|
-G, --line-scale <line_scale> Line scaling factor. [default: 15]
|
||||||
|
-S, --char-scale <char_scale> Char scaling factor. [default: 200]
|
||||||
-d, --debug Debug by visualizing image.
|
-d, --debug Debug by visualizing image.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -555,6 +557,7 @@ if __name__ == '__main__':
|
||||||
'blocksize': int(args['--blocksize']),
|
'blocksize': int(args['--blocksize']),
|
||||||
'threshold_constant': float(args['--constant']),
|
'threshold_constant': float(args['--constant']),
|
||||||
'dpi': int(args['--dpi']),
|
'dpi': int(args['--dpi']),
|
||||||
|
'layout': int(args['--layout']),
|
||||||
'lang': args['--lang'],
|
'lang': args['--lang'],
|
||||||
'scale': int(args['--scale']),
|
'scale': int(args['--scale']),
|
||||||
'iterations': int(args['--iterations']),
|
'iterations': int(args['--iterations']),
|
||||||
|
|
@ -620,9 +623,11 @@ if __name__ == '__main__':
|
||||||
'columns': args['--columns'] if args['--columns'] else None,
|
'columns': args['--columns'] if args['--columns'] else None,
|
||||||
'blocksize': int(args['--blocksize']),
|
'blocksize': int(args['--blocksize']),
|
||||||
'threshold_constant': float(args['--constant']),
|
'threshold_constant': float(args['--constant']),
|
||||||
'line_threshold': int(args['--line-threshold']),
|
|
||||||
'dpi': int(args['--dpi']),
|
'dpi': int(args['--dpi']),
|
||||||
|
'layout': int(args['--layout']),
|
||||||
'lang': args['--lang'],
|
'lang': args['--lang'],
|
||||||
|
'line_scale': int(args['--line-scale']),
|
||||||
|
'char_scale': int(args['--char-scale']),
|
||||||
'debug': args['--debug']
|
'debug': args['--debug']
|
||||||
}
|
}
|
||||||
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
|
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue