Add better y-cuts detection
parent
76e1d32417
commit
e252e476b9
|
|
@ -4,6 +4,8 @@ from operator import itemgetter
|
|||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from .utils import merge_tuples
|
||||
|
||||
|
||||
def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
|
||||
"""Thresholds an image using OpenCV's adaptiveThreshold.
|
||||
|
|
@ -199,30 +201,72 @@ def find_table_joints(contours, vertical, horizontal):
|
|||
return tables
|
||||
|
||||
|
||||
def find_cuts(threshold, line_threshold=100):
|
||||
"""find_cuts
|
||||
def remove_lines(threshold, line_scale=15):
    """Erases long vertical and horizontal strokes from a thresholded image.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

    line_scale : int
        Line scaling factor. The erosion element used to detect
        lines is ``threshold.shape[0] // line_scale`` pixels long.
        (optional, default: 15)

    Returns
    -------
    threshold : object
        numpy.ndarray representing the thresholded image
        with horizontal and vertical lines removed.
    """
    # NOTE(review): both directions derive their length from the image
    # height (shape[0]); confirm that is intended for horizontal lines.
    length = threshold.shape[0] // line_scale
    grow_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))

    # Detect both line masks on the untouched input before erasing anything,
    # vertical first and horizontal second.
    line_masks = []
    for ksize in ((1, length), (length, 1)):
        erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, ksize)
        # Erosion keeps only strokes at least as long as the element; the
        # 10x10 dilation then thickens the surviving lines into a mask.
        line_masks.append(cv2.dilate(cv2.erode(threshold, erode_el), grow_el))

    # Clear every pixel covered by a line mask.
    for mask in line_masks:
        threshold = np.bitwise_and(threshold, np.invert(mask))
    return threshold
|
||||
|
||||
|
||||
def find_cuts(threshold, char_scale=200):
    """Finds cuts made by text projections on y-axis.

    The image is opened with a thin vertical element, the external
    contours that remain are reduced to their vertical extents,
    overlapping extents are merged, and a cut is placed halfway
    between each pair of neighbouring merged extents.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

    char_scale : int
        Char scaling factor. The opening element is
        ``threshold.shape[0] // char_scale`` pixels tall.
        (optional, default: 200)

    Returns
    -------
    y_cuts : list
        List of cuts on y-axis, sorted in descending order.
        Empty when fewer than two separate bands are found.
    """
    # NOTE(review): size is 0 when the image has fewer than char_scale
    # rows; confirm how cv2.getStructuringElement treats a zero height.
    size = threshold.shape[0] // char_scale
    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))

    # Erode followed by dilate with the same element (morphological opening).
    threshold = cv2.erode(threshold, char_el)
    threshold = cv2.dilate(threshold, char_el)

    # findContours returns (contours, hierarchy) or (image, contours,
    # hierarchy) depending on the OpenCV version; contours is always the
    # second-to-last item. A single call also avoids re-running the search
    # on an input that some OpenCV releases document as modified in place.
    found = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
                             cv2.CHAIN_APPROX_SIMPLE)
    contours = found[-2]

    # boundingRect gives (x, y, w, h); keep only the vertical span.
    spans = sorted((y, y + h) for __, y, __, h in map(cv2.boundingRect, contours))
    if not spans:
        # Nothing survived the opening, so there is nothing to cut between.
        return []

    bands = list(merge_tuples(spans))
    # Midpoint between the end of one band and the start of the next.
    # `/` is kept from the original: integer result on Python 2, float on 3.
    y_cuts = [(lower[0] + upper[1]) / 2 for upper, lower in zip(bands, bands[1:])]
    return sorted(y_cuts, reverse=True)
|
||||
|
|
@ -8,7 +8,7 @@ from PIL import Image
|
|||
|
||||
from .table import Table
|
||||
from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
|
||||
find_table_joints, find_cuts)
|
||||
find_table_joints, remove_lines, find_cuts)
|
||||
from .utils import merge_close_values, encode_list
|
||||
|
||||
|
||||
|
|
@ -46,6 +46,10 @@ class OCRLattice:
|
|||
Dots per inch.
|
||||
(optional, default: 300)
|
||||
|
||||
layout : int
|
||||
Tesseract page segmentation mode.
|
||||
(optional, default: 7)
|
||||
|
||||
lang : string
|
||||
Language to be used for OCR.
|
||||
(optional, default: 'eng')
|
||||
|
|
@ -66,7 +70,7 @@ class OCRLattice:
|
|||
(optional, default: None)
|
||||
"""
|
||||
def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
|
||||
dpi=300, lang="eng", scale=15, iterations=0, debug=None):
|
||||
dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):
|
||||
|
||||
self.method = 'ocrl'
|
||||
self.table_area = table_area
|
||||
|
|
@ -75,6 +79,7 @@ class OCRLattice:
|
|||
self.threshold_constant = threshold_constant
|
||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
||||
self.dpi = dpi
|
||||
self.layout = layout
|
||||
self.lang = lang
|
||||
self.scale = scale
|
||||
self.iterations = iterations
|
||||
|
|
@ -159,7 +164,7 @@ class OCRLattice:
|
|||
text = self.tool.image_to_string(
|
||||
Image.fromarray(table.cells[i][j].image),
|
||||
lang=self.lang,
|
||||
builder=pyocr.builders.TextBuilder()
|
||||
builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
|
||||
)
|
||||
table.cells[i][j].add_text(text)
|
||||
ar = table.get_list()
|
||||
|
|
@ -203,31 +208,41 @@ class OCRStream:
|
|||
zero or negative as well.
|
||||
(optional, default: -2)
|
||||
|
||||
line_threshold : int
|
||||
Maximum intensity of projections on y-axis.
|
||||
(optional, default: 100)
|
||||
|
||||
dpi : int
|
||||
Dots per inch.
|
||||
(optional, default: 300)
|
||||
|
||||
layout : int
|
||||
Tesseract page segmentation mode.
|
||||
(optional, default: 7)
|
||||
|
||||
lang : string
|
||||
Language to be used for OCR.
|
||||
(optional, default: 'eng')
|
||||
|
||||
line_scale : int
|
||||
Line scaling factor.
|
||||
(optional, default: 15)
|
||||
|
||||
char_scale : int
|
||||
Char scaling factor.
|
||||
(optional, default: 200)
|
||||
"""
|
||||
def __init__(self, table_area=None, columns=None, blocksize=15,
|
||||
threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
|
||||
debug=False):
|
||||
threshold_constant=-2, dpi=300, layout=7, lang="eng",
|
||||
line_scale=15, char_scale=200, debug=False):
|
||||
|
||||
self.method = 'ocrs'
|
||||
self.table_area = table_area
|
||||
self.columns = columns
|
||||
self.blocksize = blocksize
|
||||
self.threshold_constant = threshold_constant
|
||||
self.line_threshold = line_threshold
|
||||
self.tool = pyocr.get_available_tools()[0] # fix this
|
||||
self.dpi = dpi
|
||||
self.layout = layout
|
||||
self.lang = lang
|
||||
self.line_scale = line_scale
|
||||
self.char_scale = char_scale
|
||||
self.debug = debug
|
||||
|
||||
def get_tables(self, pdfname):
|
||||
|
|
@ -251,6 +266,7 @@ class OCRStream:
|
|||
|
||||
img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
|
||||
c=self.threshold_constant)
|
||||
threshold = remove_lines(threshold, line_scale=self.line_scale)
|
||||
height, width = threshold.shape
|
||||
if self.debug:
|
||||
self.debug_images = img
|
||||
|
|
@ -287,7 +303,7 @@ class OCRStream:
|
|||
cols.insert(0, k[0])
|
||||
cols.append(k[2])
|
||||
cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
|
||||
y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
|
||||
y_cuts = find_cuts(table_image, char_scale=self.char_scale)
|
||||
rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
|
||||
table = Table(cols, rows)
|
||||
for i in range(len(table.cells)):
|
||||
|
|
@ -301,7 +317,7 @@ class OCRStream:
|
|||
text = self.tool.image_to_string(
|
||||
cell_image,
|
||||
lang=self.lang,
|
||||
builder=pyocr.builders.TextBuilder()
|
||||
builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
|
||||
)
|
||||
table.cells[i][j].add_text(text)
|
||||
ar = table.get_list()
|
||||
|
|
|
|||
|
|
@ -751,4 +751,26 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
|
|||
width = layout.bbox[2]
|
||||
height = layout.bbox[3]
|
||||
dim = (width, height)
|
||||
return layout, dim
|
||||
return layout, dim
|
||||
|
||||
|
||||
def merge_tuples(tuples):
    """Merges a list of overlapping tuples.

    Consecutive ``(start, end)`` pairs are folded together whenever a
    pair starts at or before the end of the running merged pair, so
    the input is expected to be ordered by start value.

    Parameters
    ----------
    tuples : list
        Iterable of ``(start, end)`` pairs. May be empty.

    Returns
    -------
    merged : list
        Generator yielding the merged ``(start, end)`` tuples in
        input order. Yields nothing for empty input.
    """
    pairs = iter(tuples)
    try:
        start, end = next(pairs)
    except StopIteration:
        # Empty input: nothing to merge (previously an IndexError).
        return

    for s, e in pairs:
        if s <= end:
            # Overlapping or touching: extend the running pair.
            end = max(end, e)
        else:
            # Gap found: emit the finished pair and start a new one.
            yield (start, end)
            start, end = s, e
    yield (start, end)
|
||||
|
|
@ -121,6 +121,7 @@ options:
|
|||
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||
[default: 300]
|
||||
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
|
||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||
-s, --scale <scale> Scaling factor. Large scaling factor leads to
|
||||
smaller lines being detected. [default: 15]
|
||||
|
|
@ -141,11 +142,12 @@ options:
|
|||
Example: -c 10.1,20.2,30.3
|
||||
-b, --blocksize <blocksize> See adaptive threshold doc. [default: 15]
|
||||
-C, --constant <constant> See adaptive threshold doc. [default: -2]
|
||||
-N, --line-threshold <line_threshold> Maximum intensity of projections on y-axis.
|
||||
[default: 100]
|
||||
-D, --dpi <dpi> Dots per inch, specify image quality to be used for OCR.
|
||||
[default: 300]
|
||||
-g, --layout <layout> Tesseract page segmentation mode. [default: 7]
|
||||
-l, --lang <lang> Specify language to be used for OCR. [default: eng]
|
||||
-G, --line-scale <line_scale> Line scaling factor. [default: 15]
|
||||
-S, --char-scale <char_scale> Char scaling factor. [default: 200]
|
||||
-d, --debug Debug by visualizing image.
|
||||
"""
|
||||
|
||||
|
|
@ -555,6 +557,7 @@ if __name__ == '__main__':
|
|||
'blocksize': int(args['--blocksize']),
|
||||
'threshold_constant': float(args['--constant']),
|
||||
'dpi': int(args['--dpi']),
|
||||
'layout': int(args['--layout']),
|
||||
'lang': args['--lang'],
|
||||
'scale': int(args['--scale']),
|
||||
'iterations': int(args['--iterations']),
|
||||
|
|
@ -620,9 +623,11 @@ if __name__ == '__main__':
|
|||
'columns': args['--columns'] if args['--columns'] else None,
|
||||
'blocksize': int(args['--blocksize']),
|
||||
'threshold_constant': float(args['--constant']),
|
||||
'line_threshold': int(args['--line-threshold']),
|
||||
'dpi': int(args['--dpi']),
|
||||
'layout': int(args['--layout']),
|
||||
'lang': args['--lang'],
|
||||
'line_scale': int(args['--line-scale']),
|
||||
'char_scale': int(args['--char-scale']),
|
||||
'debug': args['--debug']
|
||||
}
|
||||
manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
|
||||
|
|
|
|||
Loading…
Reference in New Issue