Add better y-cuts detection

2017-04-25 18:44:53 +05:30 · 2017-04-25 18:44:53 +05:30 · e252e476b9
parent 76e1d32417
commit e252e476b9
4 changed files with 117 additions and 30 deletions
--- a/camelot/imgproc.py
+++ b/camelot/imgproc.py
@ -4,6 +4,8 @@ from operator import itemgetter
 import cv2
 import numpy as np

+from .utils import merge_tuples
+

 def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
    """Thresholds an image using OpenCV's adaptiveThreshold.
@ -199,30 +201,72 @@ def find_table_joints(contours, vertical, horizontal):
    return tables


-def find_cuts(threshold, line_threshold=100):
-    """find_cuts
+def remove_lines(threshold, line_scale=15):
+    """Removes lines from a thresholded image.

    Parameters
    ----------
    threshold : object
        numpy.ndarray representing the thresholded image.

-    line_threshold : int
-        Maximum intensity of projections on y-axis.
-        (optional, default: 100)
+    line_scale : int
+        Line scaling factor.
+        (optional, default: 15)
+
+    Returns
+    -------
+    threshold : object
+        numpy.ndarray representing the thresholded image
+        with horizontal and vertical lines removed.
+    """
+    size = threshold.shape[0] // line_scale  # kernel length from image height; 0 if height < line_scale (NOTE(review): confirm OpenCV tolerates that)
+    vertical_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))  # ksize is (width, height): 1 wide, size tall
+    horizontal_erode_el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))  # size wide, 1 tall; length also derived from image height
+    dilate_el = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))  # fixed 10x10 element used to thicken what survives erosion
+
+    vertical = cv2.erode(threshold, vertical_erode_el)  # mask of long vertical strokes
+    vertical = cv2.dilate(vertical, dilate_el)
+
+    horizontal = cv2.erode(threshold, horizontal_erode_el)  # mask of long horizontal strokes
+    horizontal = cv2.dilate(horizontal, dilate_el)
+
+    threshold = np.bitwise_and(threshold, np.invert(vertical))  # clear pixels covered by the vertical mask
+    threshold = np.bitwise_and(threshold, np.invert(horizontal))  # clear pixels covered by the horizontal mask
+    return threshold
+
+
+def find_cuts(threshold, char_scale=200):
+    """Finds cuts made by text projections on y-axis.
+
+    Parameters
+    ----------
+    threshold : object
+        numpy.ndarray representing the thresholded image.
+
+    char_scale : int
+        Char scaling factor.
+        (optional, default: 200)

    Returns
    -------
    y_cuts : list
        List of cuts on y-axis.
    """
-    y_proj = np.sum(threshold, axis=1)
-    y_proj_less = np.where(y_proj < line_threshold)[0]
-    ranges = []
-    for k, g in groupby(enumerate(y_proj_less), lambda (i, x): i-x):
-        group = map(itemgetter(1), g)
-        ranges.append((group[0], group[-1]))
-    y_cuts = []
-    for r in ranges:
-        y_cuts.append((r[0] + r[1]) / 2)
+    size = threshold.shape[0] // char_scale  # kernel height from image height; 0 if height < char_scale (NOTE(review): confirm OpenCV tolerates that)
+    char_el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))  # ksize is (width, height): 1 wide, size tall
+
+    threshold = cv2.erode(threshold, char_el)  # erode then dilate with the same element (an opening)
+    threshold = cv2.dilate(threshold, char_el)
+
+    try:  # unpack three return values first; fall back to two on ValueError
+        __, contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE)
+    except ValueError:
+        contours, __ = cv2.findContours(threshold, cv2.RETR_EXTERNAL,
+            cv2.CHAIN_APPROX_SIMPLE)
+
+    contours = [cv2.boundingRect(c) for c in contours]  # each entry becomes (x, y, w, h)
+    y_cuts = [(c[1], c[1] + c[3]) for c in contours]  # vertical extent (y, y + h) of every contour
+    y_cuts = list(merge_tuples(sorted(y_cuts)))  # fuse overlapping extents; merge_tuples indexes tuples[0], so no contours -> IndexError
+    y_cuts = [(y_cuts[i][0] + y_cuts[i - 1][1]) / 2 for i in range(1, len(y_cuts))]  # midpoint of the gap between consecutive extents; NOTE(review): "/" yields floats on Python 3
    return sorted(y_cuts, reverse=True)
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@ -8,7 +8,7 @@ from PIL import Image

 from .table import Table
 from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
-                      find_table_joints, find_cuts)
+                      find_table_joints, remove_lines, find_cuts)
 from .utils import merge_close_values, encode_list


@ -46,6 +46,10 @@ class OCRLattice:
        Dots per inch.
        (optional, default: 300)

+    layout : int
+        Tesseract page segmentation mode.
+        (optional, default: 7)
+
    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')
@ -66,7 +70,7 @@ class OCRLattice:
        (optional, default: None)
    """
    def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
-                 dpi=300, lang="eng", scale=15, iterations=0, debug=None):
+                 dpi=300, layout=7, lang="eng", scale=15, iterations=0, debug=None):

        self.method = 'ocrl'
        self.table_area = table_area
@ -75,6 +79,7 @@ class OCRLattice:
        self.threshold_constant = threshold_constant
        self.tool = pyocr.get_available_tools()[0] # fix this
        self.dpi = dpi
+        self.layout = layout
        self.lang = lang
        self.scale = scale
        self.iterations = iterations
@ -159,7 +164,7 @@ class OCRLattice:
                    text = self.tool.image_to_string(
                        Image.fromarray(table.cells[i][j].image),
                        lang=self.lang,
-                        builder=pyocr.builders.TextBuilder()
+                        builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
                    )
                    table.cells[i][j].add_text(text)
            ar = table.get_list()
@ -203,31 +208,41 @@ class OCRStream:
        zero or negative as well.
        (optional, default: -2)

-    line_threshold : int
-        Maximum intensity of projections on y-axis.
-        (optional, default: 100)
-
    dpi : int
        Dots per inch.
        (optional, default: 300)

+    layout : int
+        Tesseract page segmentation mode.
+        (optional, default: 7)
+
    lang : string
        Language to be used for OCR.
        (optional, default: 'eng')
+
+    line_scale : int
+        Line scaling factor.
+        (optional, default: 15)
+
+    char_scale : int
+        Char scaling factor.
+        (optional, default: 200)
    """
    def __init__(self, table_area=None, columns=None, blocksize=15,
-                 threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
-                 debug=False):
+                 threshold_constant=-2, dpi=300, layout=7, lang="eng",
+                 line_scale=15, char_scale=200, debug=False):

        self.method = 'ocrs'
        self.table_area = table_area
        self.columns = columns
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
-        self.line_threshold = line_threshold
        self.tool = pyocr.get_available_tools()[0] # fix this
        self.dpi = dpi
+        self.layout = layout  # handed to pyocr TextBuilder(tesseract_layout=...) for each cell
        self.lang = lang
+        self.line_scale = line_scale  # forwarded to remove_lines() before table detection
+        self.char_scale = char_scale  # forwarded to find_cuts() when computing row cuts
        self.debug = debug

    def get_tables(self, pdfname):
@ -251,6 +266,7 @@ class OCRStream:

        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
            c=self.threshold_constant)
+        threshold = remove_lines(threshold, line_scale=self.line_scale)
        height, width = threshold.shape
        if self.debug:
            self.debug_images = img
@ -287,7 +303,7 @@ class OCRStream:
                cols.insert(0, k[0])
                cols.append(k[2])
                cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
-                y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
+                y_cuts = find_cuts(table_image, char_scale=self.char_scale)
                rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
                table = Table(cols, rows)
                for i in range(len(table.cells)):
@ -301,7 +317,7 @@ class OCRStream:
                        text = self.tool.image_to_string(
                            cell_image,
                            lang=self.lang,
-                            builder=pyocr.builders.TextBuilder()
+                            builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
                        )
                        table.cells[i][j].add_text(text)
                ar = table.get_list()
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -751,4 +751,26 @@ def get_page_layout(pname, char_margin=1.0, line_margin=0.5, word_margin=0.1,
            width = layout.bbox[2]
            height = layout.bbox[3]
            dim = (width, height)
-        return layout, dim
+        return layout, dim
+
+
+def merge_tuples(tuples):
+    """Merges a sorted list of overlapping (start, end) tuples.
+
+    Consecutive tuples whose start is less than or equal to the
+    running end are collapsed into a single tuple spanning both.
+    The input is not sorted here; callers must pass tuples that
+    are already ordered by start.
+
+    Parameters
+    ----------
+    tuples : list
+        List of (start, end) tuples sorted by start.
+
+    Returns
+    -------
+    merged : generator
+        Yields the merged (start, end) tuples in order.
+        Yields nothing when tuples is empty.
+    """
+    # Guard: indexing tuples[0] on an empty list raised IndexError,
+    # e.g. when no contours were found in a blank image.
+    if not tuples:
+        return
+    start, end = tuples[0]
+    for s, e in tuples[1:]:
+        if s <= end:
+            # overlapping or touching: extend the running interval
+            end = max(end, e)
+        else:
+            yield (start, end)
+            start, end = s, e
+    yield (start, end)
--- a/tools/camelot
+++ b/tools/camelot
@ -121,6 +121,7 @@ options:
 -C, --constant <constant>      See adaptive threshold doc. [default: -2]
 -D, --dpi <dpi>                Dots per inch, specify image quality to be used for OCR.
                                [default: 300]
+ -g, --layout <layout>          Tesseract page segmentation mode. [default: 7]
 -l, --lang <lang>              Specify language to be used for OCR. [default: eng]
 -s, --scale <scale>            Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
@ -141,11 +142,12 @@ options:
                                        Example: -c 10.1,20.2,30.3
 -b, --blocksize <blocksize>            See adaptive threshold doc. [default: 15]
 -C, --constant <constant>              See adaptive threshold doc. [default: -2]
- -N, --line-threshold <line_threshold>  Maximum intensity of projections on y-axis.
-                                        [default: 100]
 -D, --dpi <dpi>                        Dots per inch, specify image quality to be used for OCR.
                                        [default: 300]
+ -g, --layout <layout>                  Tesseract page segmentation mode. [default: 7]
 -l, --lang <lang>                      Specify language to be used for OCR. [default: eng]
+ -G, --line-scale <line_scale>          Line scaling factor. [default: 15]
+ -S, --char-scale <char_scale>          Char scaling factor. [default: 200]
 -d, --debug                            Debug by visualizing image.
 """

@ -555,6 +557,7 @@ if __name__ == '__main__':
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'dpi': int(args['--dpi']),
+                'layout': int(args['--layout']),
                'lang': args['--lang'],
                'scale': int(args['--scale']),
                'iterations': int(args['--iterations']),
@ -620,9 +623,11 @@ if __name__ == '__main__':
                'columns': args['--columns'] if args['--columns'] else None,
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
-                'line_threshold': int(args['--line-threshold']),
                'dpi': int(args['--dpi']),
+                'layout': int(args['--layout']),
                'lang': args['--lang'],
+                'line_scale': int(args['--line-scale']),
+                'char_scale': int(args['--char-scale']),
                'debug': args['--debug']
            }
            manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,