[ENH] Add OCR and better joint detection

* Add iterations for dilation
* Add OCRLattice and OCRStream
* Add debug
2017-04-18 18:25:47 +05:30 · 2017-04-18 18:25:47 +05:30 · 4da754ddcb
parent dd909e2b53
commit 4da754ddcb
8 changed files with 411 additions and 156 deletions
--- a/camelot/imgproc.py
+++ b/camelot/imgproc.py
@ -1,3 +1,6 @@
+from itertools import groupby
+from operator import itemgetter
+
 import cv2
 import numpy as np

@ -44,7 +47,7 @@ def adaptive_threshold(imagename, invert=False, blocksize=15, c=-2):
    return img, threshold


-def find_lines(threshold, direction='horizontal', scale=15):
+def find_lines(threshold, direction='horizontal', scale=15, iterations=2):
    """Finds horizontal and vertical lines by applying morphological
    transformations on an image.

@ -62,6 +65,10 @@ def find_lines(threshold, direction='horizontal', scale=15):
        for morph transform.
        (optional, default: 15)

+    iterations : int
+        Number of iterations for dilation.
+        (optional, default: 2)
+
    Returns
    -------
    dmask : object
@ -85,10 +92,9 @@ def find_lines(threshold, direction='horizontal', scale=15):
        raise ValueError("Specify direction as either 'vertical' or"
                         " 'horizontal'")

-    threshold = cv2.erode(threshold, el, (-1, -1))
-    threshold = cv2.dilate(threshold, el, (-1, -1))
-
-    dmask = threshold  # findContours modifies source image
+    threshold = cv2.erode(threshold, el)
+    threshold = cv2.dilate(threshold, el)
+    dmask = cv2.dilate(threshold, el, iterations=iterations)

    try:
        _, contours, _ = cv2.findContours(
@ -190,4 +196,33 @@ def find_table_joints(contours, vertical, horizontal):
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords

-    return tables
+    return tables
+
+
+def find_cuts(threshold, line_threshold=100):
+    """Finds y-axis cuts at the midpoints of row runs with low projection.
+
+    Parameters
+    ----------
+    threshold : object
+        numpy.ndarray representing the thresholded image.
+
+    line_threshold : int
+        Maximum intensity of projections on y-axis.
+        (optional, default: 100)
+
+    Returns
+    -------
+    y_cuts : list
+        List of cuts on y-axis.
+    """
+    y_proj = np.sum(threshold, axis=1)
+    y_proj_less = np.where(y_proj < line_threshold)[0]
+    ranges = []
+    for k, g in groupby(enumerate(y_proj_less), lambda (i, x): i-x):
+        group = map(itemgetter(1), g)
+        ranges.append((group[0], group[-1]))
+    y_cuts = []
+    for r in ranges:
+        y_cuts.append((r[0] + r[1]) / 2)
+    return sorted(y_cuts, reverse=True)
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@ -12,7 +12,7 @@ from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
 from .table import Table
 from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_in_bbox,
                    merge_close_values, get_table_index, get_score, count_empty,
-                    encode_list, get_text_objects, get_page_layout)
+                    encode_list, get_text_objects, get_page_layout, remove_empty)


 __all__ = ['Lattice']
@ -131,20 +131,20 @@ class Lattice:
        direction.
        (optional, default: None)

-    headers : list
-        List of strings where each string is a csv header for a table.
-        (optional, default: None)
-
    mtol : list
        List of ints specifying m-tolerance parameters.
        (optional, default: [2])

-    blocksize: int
+    jtol : list
+        List of ints specifying j-tolerance parameters.
+        (optional, default: [2])
+
+    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

-    threshold_constant: float
+    threshold_constant : float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
@ -155,6 +155,10 @@ class Lattice:
        element for image processing.
        (optional, default: 15)

+    iterations : int
+        Number of iterations for dilation.
+        (optional, default: 2)
+
    invert : bool
        Whether or not to invert the image. Useful when pdfs have
        tables with lines in background.
@ -187,19 +191,20 @@ class Lattice:
        of detected contours, lines, joints and the table generated.
        (optional, default: None)
    """
-    def __init__(self, table_area=None, fill=None, headers=None, mtol=[2],
-                 blocksize=15, threshold_constant=-2, scale=15, invert=False,
-                 margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
-                 shift_text=['l', 't'], debug=None):
+    def __init__(self, table_area=None, fill=None, mtol=[2], jtol=[2],
+                 blocksize=15, threshold_constant=-2, scale=15, iterations=2,
+                 invert=False, margins=(1.0, 0.5, 0.1), split_text=False,
+                 flag_size=True, shift_text=['l', 't'], debug=None):

        self.method = 'lattice'
        self.table_area = table_area
        self.fill = fill
-        self.headers = headers
        self.mtol = mtol
+        self.jtol = jtol
        self.blocksize = blocksize
        self.threshold_constant = threshold_constant
        self.scale = scale
+        self.iterations = iterations
        self.invert = invert
        self.char_margin, self.line_margin, self.word_margin = margins
        self.split_text = split_text
@ -257,17 +262,14 @@ class Lattice:
        factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)

        vmask, v_segments = find_lines(threshold, direction='vertical',
-            scale=self.scale)
+            scale=self.scale, iterations=self.iterations)
        hmask, h_segments = find_lines(threshold, direction='horizontal',
-            scale=self.scale)
+            scale=self.scale, iterations=self.iterations)

        if self.table_area is not None:
            if self.fill is not None:
                if len(self.table_area) != len(self.fill):
-                    raise ValueError("Length of fill should be equal to table_area.")
-            if self.headers is not None:
-                if len(self.table_area) != len(self.headers):
-                    raise ValueError("Length of headers should be equal to table_area.")
+                    raise ValueError("Length of table area and fill should be equal.")

            areas = []
            for area in self.table_area:
@ -288,6 +290,11 @@ class Lattice:
        else:
            mtolerance = copy.deepcopy(self.mtol)

+        if len(self.jtol) == 1 and self.jtol[0] == 2:
+            jtolerance = copy.deepcopy(self.jtol) * len(table_bbox)
+        else:
+            jtolerance = copy.deepcopy(self.jtol)
+
        if self.debug:
            self.debug_images = (img, table_bbox)

@ -326,18 +333,9 @@ class Lattice:
            rows = [(rows[i], rows[i + 1])
                    for i in range(0, len(rows) - 1)]

-            if self.headers is not None and self.headers[table_no] != [""]:
-                self.headers[table_no] = self.headers[table_no].split(',')
-                if len(self.headers[table_no]) != len(cols):
-                    logger.warning("Length of header ({0}) specified for table is not"
-                                   " equal to the number of columns ({1}) detected.".format(
-                                   len(self.headers[table_no]), len(cols)))
-                while len(self.headers[table_no]) != len(cols):
-                    self.headers[table_no].append('')
-
            table = Table(cols, rows)
            # set table edges to True using ver+hor lines
-            table = table.set_edges(v_s, h_s)
+            table = table.set_edges(v_s, h_s, jtol=jtolerance[table_no])
            nouse = table.nocont_ / (len(v_s) + len(h_s))
            table_data['line_p'] = 100 * (1 - nouse)
            # set spanning cells to True
@ -351,27 +349,27 @@ class Lattice:
            assignment_errors = []
            table_data['split_text'] = []
            table_data['superscript'] = []
-            for direction in t_bbox:
+            for direction in ['vertical', 'horizontal']:
                for t in t_bbox[direction]:
                    indices, error = get_table_index(
                        table, t, direction, split_text=self.split_text,
                        flag_size=self.flag_size)
-                    assignment_errors.append(error)
-                    indices = _reduce_index(table, indices, shift_text=self.shift_text,)
-                    if len(indices) > 1:
-                        table_data['split_text'].append(indices)
-                    for r_idx, c_idx, text in indices:
-                        if all(s in text for s in ['<s>', '</s>']):
-                            table_data['superscript'].append((r_idx, c_idx, text))
-                        table.cells[r_idx][c_idx].add_text(text)
+                    if indices[:2] != (-1, -1):
+                        assignment_errors.append(error)
+                        indices = _reduce_index(table, indices, shift_text=self.shift_text)
+                        if len(indices) > 1:
+                            table_data['split_text'].append(indices)
+                        for r_idx, c_idx, text in indices:
+                            if all(s in text for s in ['<s>', '</s>']):
+                                table_data['superscript'].append((r_idx, c_idx, text))
+                            table.cells[r_idx][c_idx].add_text(text)
            score = get_score([[100, assignment_errors]])
            table_data['score'] = score

            if self.fill is not None:
                table = _fill_spanning(table, fill=self.fill[table_no])
            ar = table.get_list()
-            if self.headers is not None and self.headers[table_no] != ['']:
-                ar.insert(0, self.headers[table_no])
+            ar = remove_empty(ar)
            ar = encode_list(ar)
            table_data['data'] = ar
            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
--- a/camelot/ocr.py
+++ b/camelot/ocr.py
@ -7,19 +7,18 @@ from PIL import Image

 from .table import Table
 from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
-                      find_table_joints)
-from .utils import merge_close_values, encode_list
+                      find_table_joints, find_cuts)
+from .utils import merge_close_values, encode_list, remove_empty


-class OCR:
-    """Uses optical character recognition to get text out of image based pdfs.
-    Currently works only on pdfs with lines.
+class OCRLattice:
+    """Lattice, but for images.

    Parameters
    ----------
    table_area : list
        List of strings of the form x1,y1,x2,y2 where
-        (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDFMiner's
+        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
        coordinate space, denoting table areas to analyze.
        (optional, default: None)

@ -27,12 +26,12 @@ class OCR:
        List of ints specifying m-tolerance parameters.
        (optional, default: [2])

-    blocksize: int
+    blocksize : int
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.
        (optional, default: 15)

-    threshold_constant: float
+    threshold_constant : float
        Constant subtracted from the mean or weighted mean
        (see the details below). Normally, it is positive but may be
        zero or negative as well.
@ -51,6 +50,10 @@ class OCR:
        element for image processing.
        (optional, default: 15)

+    iterations : int
+        Number of iterations for dilation.
+        (optional, default: 2)
+
    debug : string
        {'contour', 'line', 'joint', 'table'}
        Set to one of the above values to generate a matplotlib plot
@ -58,9 +61,9 @@ class OCR:
        (optional, default: None)
    """
    def __init__(self, table_area=None, mtol=[2], blocksize=15, threshold_constant=-2,
-                 dpi=300, lang="eng", scale=15, debug=None):
+                 dpi=300, lang="eng", scale=15, iterations=2, debug=None):

-        self.method = 'ocr'
+        self.method = 'ocrl'
        self.table_area = table_area
        self.mtol = mtol
        self.blocksize = blocksize
@ -69,11 +72,13 @@ class OCR:
        self.dpi = dpi
        self.lang = lang
        self.scale = scale
+        self.iterations = iterations
        self.debug = debug

    def get_tables(self, pdfname):
        if self.tool is None:
            return None
+
        bname, __ = os.path.splitext(pdfname)
        imagename = ''.join([bname, '.png'])

@ -91,9 +96,9 @@ class OCR:
        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
            c=self.threshold_constant)
        vmask, v_segments = find_lines(threshold, direction='vertical',
-            scale=self.scale)
+            scale=self.scale, iterations=self.iterations)
        hmask, h_segments = find_lines(threshold, direction='horizontal',
-            scale=self.scale)
+            scale=self.scale, iterations=self.iterations)

        if self.table_area is not None:
            areas = []
@ -154,6 +159,7 @@ class OCR:
            ar = table.get_list()
            ar.reverse()
            ar = encode_list(ar)
+            ar = remove_empty(ar)
            table_data['data'] = ar
            tables['table-{0}'.format(table_no + 1)] = table_data
            table_no += 1
@ -162,4 +168,142 @@ class OCR:
        if self.debug:
            return None

+        return page
+
+
+class OCRStream:
+    """Stream, but for images.
+
+    Parameters
+    ----------
+    table_area : list
+        List of strings of the form x1,y1,x2,y2 where
+        (x1, y1) -> left-top and (x2, y2) -> right-bottom in OpenCV's
+        coordinate space, denoting table areas to analyze.
+        (optional, default: None)
+
+    columns : list
+        List of strings where each string is comma-separated values of
+        x-coordinates in OpenCV's coordinate space.
+        (optional, default: None)
+
+    blocksize : int
+        Size of a pixel neighborhood that is used to calculate a
+        threshold value for the pixel: 3, 5, 7, and so on.
+        (optional, default: 15)
+
+    threshold_constant : float
+        Constant subtracted from the mean or weighted mean
+        (see the details below). Normally, it is positive but may be
+        zero or negative as well.
+        (optional, default: -2)
+
+    line_threshold : int
+        Maximum intensity of projections on y-axis.
+        (optional, default: 100)
+
+    dpi : int
+        Dots per inch.
+        (optional, default: 300)
+
+    lang : string
+        Language to be used for OCR.
+        (optional, default: 'eng')
+    """
+    def __init__(self, table_area=None, columns=None, blocksize=15,
+                 threshold_constant=-2, line_threshold=100, dpi=300, lang="eng",
+                 debug=False):
+
+        self.method = 'ocrs'
+        self.table_area = table_area
+        self.columns = columns
+        self.blocksize = blocksize
+        self.threshold_constant = threshold_constant
+        self.line_threshold = line_threshold
+        self.tool = pyocr.get_available_tools()[0] # FIXME: [0] raises IndexError when no OCR tool is available, so the None check in get_tables cannot trigger
+        self.dpi = dpi
+        self.lang = lang
+        self.debug = debug
+
+    def get_tables(self, pdfname):
+        if self.tool is None:
+            return None
+
+        bname, __ = os.path.splitext(pdfname)
+        imagename = ''.join([bname, '.png'])
+
+        gs_call = [
+            "-q", "-sDEVICE=png16m", "-o", imagename, "-r{0}".format(self.dpi),
+            pdfname
+        ]
+        if "ghostscript" in subprocess.check_output(["gs", "-version"]).lower():
+            gs_call.insert(0, "gs")
+        else:
+            gs_call.insert(0, "gsc")
+        subprocess.call(gs_call, stdout=open(os.devnull, 'w'),
+            stderr=subprocess.STDOUT)
+
+        img, threshold = adaptive_threshold(imagename, blocksize=self.blocksize,
+            c=self.threshold_constant)
+        height, width = threshold.shape
+        if self.debug:
+            self.debug_images = img
+            return None
+
+        if self.table_area is not None:
+            if self.columns is not None:
+                if len(self.table_area) != len(self.columns):
+                    raise ValueError("Length of table area and columns should be equal.")
+
+            table_bbox = {}
+            for area in self.table_area:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = int(x1)
+                y1 = int(y1)
+                x2 = int(x2)
+                y2 = int(y2)
+                table_bbox[(x1, y1, x2, y2)] = None
+        else:
+            table_bbox = {(0, 0, width, height): None}
+
+        page = {}
+        tables = {}
+        table_no = 0
+        for k in sorted(table_bbox.keys(), key=lambda x: x[1]):
+            if self.columns is None:
+                raise NotImplementedError
+            else:
+                table_data = {}
+                table_image = threshold[k[1]:k[3],k[0]:k[2]]
+                cols = self.columns[table_no].split(',')
+                cols = [float(c) for c in cols]
+                cols.insert(0, k[0])
+                cols.append(k[2])
+                cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
+                y_cuts = find_cuts(table_image, line_threshold=self.line_threshold)
+                rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
+                table = Table(cols, rows)
+                for i in range(len(table.cells)):
+                    for j in range(len(table.cells[i])):
+                        x1 = int(table.cells[i][j].x1)
+                        y1 = int(table.cells[i][j].y1)
+                        x2 = int(table.cells[i][j].x2)
+                        y2 = int(table.cells[i][j].y2)
+                        table.cells[i][j].image = table_image[y1:y2,x1:x2]
+                        cell_image = Image.fromarray(table.cells[i][j].image)
+                        text = self.tool.image_to_string(
+                            cell_image,
+                            lang=self.lang,
+                            builder=pyocr.builders.TextBuilder()
+                        )
+                        table.cells[i][j].add_text(text)
+                ar = table.get_list()
+                ar.reverse()
+                ar = encode_list(ar)
+                ar = remove_empty(ar)
+                table_data['data'] = ar
+                tables['table-{0}'.format(table_no + 1)] = table_data
+                table_no += 1
+        page[os.path.basename(bname)] = tables
+
        return page
--- a/camelot/pdf.py
+++ b/camelot/pdf.py
@ -141,11 +141,14 @@ class Pdf:
                if self.extractor.method == 'stream':
                    self.debug = self.extractor.debug
                    self.debug_text = []
-                elif self.extractor.method in ['lattice', 'ocr']:
+                elif self.extractor.method in ['lattice', 'ocrl']:
                    self.debug = self.extractor.debug
                    self.debug_images = []
                    self.debug_segments = []
                    self.debug_tables = []
+                elif self.extractor.method == 'ocrs':
+                    self.debug = self.extractor.debug
+                    self.debug_images = []
            for p in pages:
                table = self.extractor.get_tables(p)
                if table is not None:
@ -157,6 +160,8 @@ class Pdf:
                        self.debug_images.append(self.extractor.debug_images)
                        self.debug_segments.append(self.extractor.debug_segments)
                        self.debug_tables.append(self.extractor.debug_tables)
+                    elif self.extractor.method == 'ocrs':
+                        self.debug_images.append(self.extractor.debug_images)
        if self.clean:
            self.remove_tempdir()
        return tables
@ -175,7 +180,7 @@ class Pdf:
        import matplotlib.patches as patches

        if self.debug is True:
-            try:
+            if hasattr(self, 'debug_text'):
                for text in self.debug_text:
                    fig = plt.figure()
                    ax = fig.add_subplot(111, aspect='equal')
@ -193,8 +198,10 @@ class Pdf:
                    ax.set_xlim(min(xs) - 10, max(xs) + 10)
                    ax.set_ylim(min(ys) - 10, max(ys) + 10)
                    plt.show()
-            except AttributeError:
-                raise ValueError("This option only be used with Stream.")
+            elif hasattr(self, 'debug_images'):
+                for img in self.debug_images:
+                    plt.imshow(img)
+                    plt.show()
        elif self.debug == 'contour':
            try:
                for img, table_bbox in self.debug_images:
--- a/camelot/stream.py
+++ b/camelot/stream.py
@ -236,10 +236,6 @@ class Stream:
        x-coordinates in PDFMiner's coordinate space.
        (optional, default: None)

-    headers : list
-        List of strings where each string is a csv header for a table.
-        (optional, default: None)
-
    ytol : list
        List of ints specifying the y-tolerance parameters.
        (optional, default: [2])
@ -268,14 +264,13 @@ class Stream:
        LTTextLineHorizontals in order to select table_area, columns.
        (optional, default: False)
    """
-    def __init__(self, table_area=None, columns=None, headers=None,
-                 ytol=[2], mtol=[0], margins=(1.0, 0.5, 0.1),
-                 split_text=False, flag_size=True, debug=False):
+    def __init__(self, table_area=None, columns=None, ytol=[2], mtol=[0],
+                 margins=(1.0, 0.5, 0.1), split_text=False, flag_size=True,
+                 debug=False):

        self.method = 'stream'
        self.table_area = table_area
        self.columns = columns
-        self.headers = headers
        self.ytol = ytol
        self.mtol = mtol
        self.char_margin, self.line_margin, self.word_margin = margins
@ -312,14 +307,12 @@ class Stream:
            self.debug_text = []
            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlh])
            self.debug_text.extend([(t.x0, t.y0, t.x1, t.y1) for t in lttextlv])
+            return None

        if self.table_area is not None:
            if self.columns is not None:
                if len(self.table_area) != len(self.columns):
-                    raise ValueError("Length of columns should be equal to table_area.")
-            if self.headers is not None:
-                if len(self.table_area) != len(self.headers):
-                    raise ValueError("Length of headers should be equal to table_area.")
+                    raise ValueError("Length of table area and columns should be equal.")

            table_bbox = {}
            for area in self.table_area:
@ -336,6 +329,7 @@ class Stream:
            ytolerance = copy.deepcopy(self.ytol) * len(table_bbox)
        else:
            ytolerance = copy.deepcopy(self.ytol)
+
        if len(self.mtol) == 1 and self.mtol[0] == 0:
            mtolerance = copy.deepcopy(self.mtol) * len(table_bbox)
        else:
@ -374,7 +368,7 @@ class Stream:
                guess = True
                ncols = max(set(elements), key=elements.count)
                len_non_mode = len(filter(lambda x: x != ncols, elements))
-                if ncols == 1 and not self.debug:
+                if ncols == 1:
                    # no tables detected
                    logger.warning("{}: Only one column was detected, the pdf"
                                   " may have no tables.".format(
@ -396,15 +390,6 @@ class Stream:
                cols = _add_columns(cols, inner_text, ytolerance[table_no])
                cols = _join_columns(cols, text_x_min, text_x_max)

-            if self.headers is not None and self.headers[table_no] != [""]:
-                self.headers[table_no] = self.headers[table_no].split(',')
-                if len(self.headers[table_no]) != len(cols):
-                    logger.warning("Length of header ({0}) specified for table is not"
-                                   " equal to the number of columns ({1}) detected.".format(
-                                   len(self.headers[table_no]), len(cols)))
-                while len(self.headers[table_no]) != len(cols):
-                    self.headers[table_no].append('')
-
            table = Table(cols, rows)
            table = table.set_all_edges()
            assignment_errors = []
@ -429,8 +414,6 @@ class Stream:

            table_data['score'] = score
            ar = table.get_list()
-            if self.headers is not None and self.headers[table_no] != ['']:
-                ar.insert(0, self.headers[table_no])
            ar = encode_list(ar)
            table_data['data'] = ar
            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
--- a/camelot/table.py
+++ b/camelot/table.py
@ -188,38 +188,32 @@ class Table:
                bound = self.cells[r][c].get_bounded_edges()
                if bound == 4:
                    continue
-
                elif bound == 3:
                    if not self.cells[r][c].left:
                        if (self.cells[r][c].right and
                                self.cells[r][c].top and
                                self.cells[r][c].bottom):
                            self.cells[r][c].spanning_h = True
-
                    elif not self.cells[r][c].right:
                        if (self.cells[r][c].left and
                                self.cells[r][c].top and
                                self.cells[r][c].bottom):
                            self.cells[r][c].spanning_h = True
-
                    elif not self.cells[r][c].top:
                        if (self.cells[r][c].left and
                                self.cells[r][c].right and
                                self.cells[r][c].bottom):
                            self.cells[r][c].spanning_v = True
-
                    elif not self.cells[r][c].bottom:
                        if (self.cells[r][c].left and
                                self.cells[r][c].right and
                                self.cells[r][c].top):
                            self.cells[r][c].spanning_v = True
-
                elif bound == 2:
                    if self.cells[r][c].left and self.cells[r][c].right:
                        if (not self.cells[r][c].top and
                                not self.cells[r][c].bottom):
                            self.cells[r][c].spanning_v = True
-
                    elif self.cells[r][c].top and self.cells[r][c].bottom:
                        if (not self.cells[r][c].left and
                                not self.cells[r][c].right):
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -426,40 +426,43 @@ def split_textline(table, textline, direction, flag_size=True):
    idx = 0
    cut_text = []
    bbox = textline.bbox
-    if direction == 'horizontal' and not textline.is_empty():
-        x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
-        r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
-        r = r_idx[0]
-        x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
-        if not x_cuts:
-            x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
-        for obj in textline._objs:
-            row = table.rows[r]
-            for cut in x_cuts:
-                if isinstance(obj, LTChar):
-                    if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
-                            (obj.x0 + obj.x1) / 2 <= cut[1]):
+    try:
+        if direction == 'horizontal' and not textline.is_empty():
+            x_overlap = [i for i, x in enumerate(table.cols) if x[0] <= bbox[2] and bbox[0] <= x[1]]
+            r_idx = [j for j, r in enumerate(table.rows) if r[1] <= (bbox[1] + bbox[3]) / 2 <= r[0]]
+            r = r_idx[0]
+            x_cuts = [(c, table.cells[r][c].x2) for c in x_overlap if table.cells[r][c].right]
+            if not x_cuts:
+                x_cuts = [(x_overlap[0], table.cells[r][-1].x2)]
+            for obj in textline._objs:
+                row = table.rows[r]
+                for cut in x_cuts:
+                    if isinstance(obj, LTChar):
+                        if (row[1] <= (obj.y0 + obj.y1) / 2 <= row[0] and
+                                (obj.x0 + obj.x1) / 2 <= cut[1]):
+                            cut_text.append((r, cut[0], obj))
+                            break
+                    elif isinstance(obj, LTAnno):
                        cut_text.append((r, cut[0], obj))
-                        break
-                elif isinstance(obj, LTAnno):
-                    cut_text.append((r, cut[0], obj))
-    elif direction == 'vertical' and not textline.is_empty():
-        y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
-        c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
-        c = c_idx[0]
-        y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
-        if not y_cuts:
-            y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
-        for obj in textline._objs:
-            col = table.cols[c]
-            for cut in y_cuts:
-                if isinstance(obj, LTChar):
-                    if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
-                            (obj.y0 + obj.y1) / 2 >= cut[1]):
+        elif direction == 'vertical' and not textline.is_empty():
+            y_overlap = [j for j, y in enumerate(table.rows) if y[1] <= bbox[3] and bbox[1] <= y[0]]
+            c_idx = [i for i, c in enumerate(table.cols) if c[0] <= (bbox[0] + bbox[2]) / 2 <= c[1]]
+            c = c_idx[0]
+            y_cuts = [(r, table.cells[r][c].y1) for r in y_overlap if table.cells[r][c].bottom]
+            if not y_cuts:
+                y_cuts = [(y_overlap[0], table.cells[-1][c].y1)]
+            for obj in textline._objs:
+                col = table.cols[c]
+                for cut in y_cuts:
+                    if isinstance(obj, LTChar):
+                        if (col[0] <= (obj.x0 + obj.x1) / 2 <= col[1] and
+                                (obj.y0 + obj.y1) / 2 >= cut[1]):
+                            cut_text.append((cut[0], c, obj))
+                            break
+                    elif isinstance(obj, LTAnno):
                        cut_text.append((cut[0], c, obj))
-                        break
-                elif isinstance(obj, LTAnno):
-                    cut_text.append((cut[0], c, obj))
+    except IndexError:
+        return [(-1, -1, textline.get_text())]
    grouped_chars = []
    for key, chars in groupby(cut_text, itemgetter(0, 1)):
        if flag_size:
--- a/tools/camelot
+++ b/tools/camelot
@ -18,7 +18,7 @@ from PyPDF2 import PdfFileReader
 from camelot.pdf import Pdf
 from camelot.lattice import Lattice
 from camelot.stream import Stream
-from camelot.ocr import OCR
+from camelot.ocr import OCRLattice, OCRStream
 from camelot import utils


@ -54,7 +54,8 @@ options:
 camelot methods:
 lattice  Looks for lines between data.
 stream   Looks for spaces between data.
- ocr      Looks for lines in image based pdfs.
+ ocrl     Lattice, but for images.
+ ocrs     Stream, but for images.

 See 'camelot <method> -h' for more information on a specific method.
 """
@ -63,20 +64,22 @@ lattice_doc = """
 Lattice method looks for lines between text to form a table.

 usage:
- camelot lattice [-t <tarea>...] [-F <fill>...] [-H <header>...]
- [-m <mtol>...] [options] [--] <file>
+ camelot lattice [-t <tarea>...] [-F <fill>...] [-m <mtol>...]
+ [-j <jtol>...] [options] [--] <file>

 options:
 -t, --tarea <tarea>            Specific table areas to analyze.
 -F, --fill <fill>              Fill data in horizontal and/or vertical spanning
                                cells. Example: -F h, -F v, -F hv
- -H, --header <header>          Specify header for each table.
 -m, --mtol <mtol>              Tolerance to account for when merging lines
                                which are very close. [default: 2]
+ -j, --jtol <jtol>              Tolerance to account for when matching line endings
+                                with intersections. [default: 2]
 -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
- -c, --constant <constant>      See adaptive threshold doc. [default: -2]
+ -C, --constant <constant>      See adaptive threshold doc. [default: -2]
 -s, --scale <scale>            Scaling factor. Large scaling factor leads to
                                smaller lines being detected. [default: 15]
+ -I, --iterations <iterations>  Number of iterations for dilation. [default: 2]
 -i, --invert                   Invert pdf image to make sure that lines are
                                in foreground.
 -T, --shift_text <shift_text>  Specify where the text in a spanning cell
@ -89,41 +92,61 @@ stream_doc = """
 Stream method looks for whitespaces between text to form a table.

 usage:
- camelot stream [-t <tarea>...] [-c <columns>...] [-H <header>...]
- [-y <ytol>...] [-m <mtol>...] [options] [--] <file>
+ camelot stream [-t <tarea>...] [-c <columns>...] [-m <mtol>...]
+ [-y <ytol>...] [options] [--] <file>

 options:
 -t, --tarea <tarea>      Specific table areas to analyze.
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
- -H, --header <header>    Specify header for each table.
- -y, --ytol <ytol>        Tolerance to account for when grouping rows
-                          together. [default: 2]
 -m, --mtol <mtol>        Tolerance to account for when merging columns
                          together. [default: 0]
+ -y, --ytol <ytol>        Tolerance to account for when grouping rows
+                          together. [default: 2]
 -d, --debug              Debug by visualizing textboxes.
 """


-ocr_doc = """
-OCR method looks for lines in image based pdfs.
+# docopt help text for the 'ocrl' method (takes over from the removed 'ocr'
+# text); the __main__ block passes it to docopt() when <method> is 'ocrl'.
+ocrl_doc = """
+Lattice, but for images.

 usage:
- camelot ocr [-t <tarea>] [-m <mtol>] [options] [--] <file>
+ camelot ocrl [-t <tarea>...] [-m <mtol>...] [options] [--] <file>

 options:
- -t, --tarea <tarea>          Specific table areas to analyze.
- -m, --mtol <mtol>            Tolerance to account for when merging lines
-                              which are very close. [default: 2]
- -b, --blocksize <blocksize>  See adaptive threshold doc. [default: 15]
- -c, --constant <constant>    See adaptive threshold doc. [default: -2]
- -D, --dpi <dpi>              Dots per inch, specify image quality to be used for OCR.
-                              [default: 300]
- -l, --lang <lang>            Specify language to be used for OCR. [default: eng]
- -s, --scale <scale>          Scaling factor. Large scaling factor leads to
-                              smaller lines being detected. [default: 15]
- -d, --debug <debug>          Debug by visualizing pdf geometry.
-                              (contour,line,joint,table) Example: -d table
+ -t, --tarea <tarea>            Specific table areas to analyze.
+ -m, --mtol <mtol>              Tolerance to account for when merging lines
+                                which are very close. [default: 2]
+ -b, --blocksize <blocksize>    See adaptive threshold doc. [default: 15]
+ -C, --constant <constant>      See adaptive threshold doc. [default: -2]
+ -D, --dpi <dpi>                Dots per inch, specify image quality to be used for OCR.
+                                [default: 300]
+ -l, --lang <lang>              Specify language to be used for OCR. [default: eng]
+ -s, --scale <scale>            Scaling factor. Large scaling factor leads to
+                                smaller lines being detected. [default: 15]
+ -I, --iterations <iterations>  Number of iterations for dilation. [default: 2]
+ -d, --debug <debug>            Debug by visualizing pdf geometry.
+                                (contour,line,joint,table) Example: -d table
+"""
+
+# docopt help text for the 'ocrs' method; the __main__ block passes it to
+# docopt() when <method> is 'ocrs'.
+ocrs_doc = """
+Stream, but for images.
+
+usage:
+ camelot ocrs [-t <tarea>...] [-c <columns>...] [options] [--] <file>
+
+options:
+ -t, --tarea <tarea>                    Specific table areas to analyze.
+ -c, --columns <columns>                Comma-separated list of column x-coordinates.
+                                        Example: -c 10.1,20.2,30.3
+ -b, --blocksize <blocksize>            See adaptive threshold doc. [default: 15]
+ -C, --constant <constant>              See adaptive threshold doc. [default: -2]
+ -N, --line-threshold <line_threshold>  Maximum intensity of projections on y-axis.
+                                        [default: 100]
+ -D, --dpi <dpi>                        Dots per inch, specify image quality to be used for OCR.
+                                        [default: 300]
+ -l, --lang <lang>                      Specify language to be used for OCR. [default: eng]
+ -d, --debug                            Debug by visualizing image.
 """


@ -351,8 +374,10 @@ if __name__ == '__main__':
        args.update(docopt(lattice_doc, argv=argv))
    elif args['<method>'] == 'stream':
        args.update(docopt(stream_doc, argv=argv))
-    elif args['<method>'] == 'ocr':
-        args.update(docopt(ocr_doc, argv=argv))
+    elif args['<method>'] == 'ocrl':
+        args.update(docopt(ocrl_doc, argv=argv))
+    elif args['<method>'] == 'ocrs':
+        args.update(docopt(ocrs_doc, argv=argv))

    filename = args['<file>']
    filedir = os.path.dirname(args['<file>'])
@ -392,11 +417,12 @@ if __name__ == '__main__':
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'fill': args['--fill'] if args['--fill'] else None,
-                'headers': args['--header'] if args['--header'] else None,
                'mtol': [int(m) for m in args['--mtol']],
+                'jtol': [int(j) for j in args['--jtol']],
                'blocksize': int(args['--blocksize']),
                'threshold_constant': float(args['--constant']),
                'scale': int(args['--scale']),
+                'iterations': int(args['--iterations']),
                'invert': args['--invert'],
                'margins': margins,
                'split_text': args['--split_text'],
@ -462,7 +488,6 @@ if __name__ == '__main__':
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
                'columns': args['--columns'] if args['--columns'] else None,
-                'headers': args['--header'] if args['--header'] else None,
                'ytol': [int(y) for y in args['--ytol']],
                'mtol': [int(m) for m in args['--mtol']],
                'margins': margins,
@ -522,7 +547,7 @@ if __name__ == '__main__':
        except Exception as e:
            logger.exception(e.message, exc_info=True)
            sys.exit()
-    elif args['<method>'] == 'ocr':
+    elif args['<method>'] == 'ocrl':
        try:
            kwargs = {
                'table_area': args['--tarea'] if args['--tarea'] else None,
@ -532,9 +557,75 @@ if __name__ == '__main__':
                'dpi': int(args['--dpi']),
                'lang': args['--lang'],
                'scale': int(args['--scale']),
+                'iterations': int(args['--iterations']),
                'debug': args['--debug']
            }
-            manager = Pdf(OCR(**kwargs), filename, pagenos=p, clean=True,
+            manager = Pdf(OCRLattice(**kwargs), filename, pagenos=p, clean=True,
+                          parallel=args['--parallel'])
+            data = manager.extract()
+
+            processing_time = time.time() - start_time
+            logger.info("Finished processing in " + str(processing_time) + " seconds")
+
+            if args['--plot']:
+                if args['--output']:
+                    pngname = os.path.join(args['--output'], os.path.basename(pngname))
+                plot_type = args['--plot'].split(',')
+                if 'page' in plot_type:
+                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
+                        page = data[page_number]
+                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
+                            table = page[table_number]
+                            plot_table_barchart(table['r_nempty_cells'],
+                                table['c_nempty_cells'],
+                                table['empty_p'],
+                                page_number,
+                                table_number)
+
+                if 'all' in plot_type:
+                    plot_all_barchart(data, pngname)
+
+                if 'rc' in plot_type:
+                    plot_rc_piechart(data, pngname)
+
+            if args['--print-stats']:
+                print_stats(data, processing_time)
+
+            if args['--save-stats']:
+                if args['--output']:
+                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
+                with open(scorename, 'w') as score_file:
+                    score_file.write('table,nrows,ncols,empty_p,line_p,text_p,score\n')
+                    for page_number in sorted(data.keys(), key=lambda x: int(x[5:])):
+                        page = data[page_number]
+                        for table_number in sorted(page.keys(), key=lambda x: int(x[6:])):
+                            table = page[table_number]
+                            score_file.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
+                                ''.join([page_number, '_', table_number]),
+                                table['nrows'],
+                                table['ncols'],
+                                table['empty_p'],
+                                table['line_p'],
+                                table['text_p'],
+                                table['score']))
+            if args['--debug']:
+                manager.debug_plot()
+        except Exception as e:
+            logger.exception(e.message, exc_info=True)
+            sys.exit()
+    elif args['<method>'] == 'ocrs':
+        try:
+            kwargs = {
+                'table_area': args['--tarea'] if args['--tarea'] else None,
+                'columns': args['--columns'] if args['--columns'] else None,
+                'blocksize': int(args['--blocksize']),
+                'threshold_constant': float(args['--constant']),
+                'line_threshold': int(args['--line-threshold']),
+                'dpi': int(args['--dpi']),
+                'lang': args['--lang'],
+                'debug': args['--debug']
+            }
+            manager = Pdf(OCRStream(**kwargs), filename, pagenos=p, clean=True,
                          parallel=args['--parallel'])
            data = manager.extract()

@ -588,7 +679,7 @@ if __name__ == '__main__':
            logger.exception(e.message, exc_info=True)
            sys.exit()

-    if args['--debug']:
+    if args.get('--debug') is not None and args['--debug']:
        print("See 'camelot <method> -h' for various parameters you can tweak.")
    else:
        output = filedir if args['--output'] is None else args['--output']