Add table_area

[MRG] Add table_area
pull/2/head
Vinayak Mehta 2016-09-05 18:51:59 +05:30 committed by GitHub
parent 0bb6ce0bf9
commit d86630e70b
6 changed files with 343 additions and 296 deletions

camelot/imgproc.py 100644 (new file, 98 lines)

@@ -0,0 +1,98 @@
import cv2
import numpy as np


def adaptive_threshold(imagename, invert=False):
    img = cv2.imread(imagename)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if invert:
        threshold = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
            15, -0.2)
    else:
        threshold = cv2.adaptiveThreshold(
            np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            15, -0.2)
    return img, threshold


def find_lines(threshold, direction=None, scale=15):
    lines = []
    if direction == 'vertical':
        size = threshold.shape[0] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
    elif direction == 'horizontal':
        size = threshold.shape[1] // scale
        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
    elif direction is None:
        raise ValueError("Specify direction as either 'vertical' or"
                         " 'horizontal'")
    threshold = cv2.erode(threshold, el, (-1, -1))
    threshold = cv2.dilate(threshold, el, (-1, -1))
    dmask = threshold  # findContours modifies source image
    try:
        # OpenCV >= 3 returns (image, contours, hierarchy)
        _, contours, _ = cv2.findContours(
            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # OpenCV 2.x returns (contours, hierarchy)
        contours, _ = cv2.findContours(
            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        x1, x2 = x, x + w
        y1, y2 = y, y + h
        if direction == 'vertical':
            lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
        elif direction == 'horizontal':
            lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
    return dmask, lines


def find_table_contours(vertical, horizontal):
    mask = vertical + horizontal
    try:
        __, contours, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        contours, __ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # keep the ten largest contours by area
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
    cont = []
    for c in contours:
        c_poly = cv2.approxPolyDP(c, 3, True)
        x, y, w, h = cv2.boundingRect(c_poly)
        cont.append((x, y, w, h))
    return cont


def find_table_joints(contours, vertical, horizontal):
    joints = np.bitwise_and(vertical, horizontal)
    tables = {}
    for c in contours:
        x, y, w, h = c
        roi = joints[y : y + h, x : x + w]
        try:
            __, jc, __ = cv2.findContours(
                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        except ValueError:
            jc, __ = cv2.findContours(
                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(jc) <= 4:  # remove contours with less than 4 joints
            continue
        joint_coords = []
        for j in jc:
            jx, jy, jw, jh = cv2.boundingRect(j)
            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
            joint_coords.append((c1, c2))
        tables[(x, y + h, x + w, y)] = joint_coords
    return tables
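Taken together, these helpers break the old _morph_transform up into composable steps. A minimal sketch of how they chain, mirroring the calls made from the new Lattice.get_tables below (the image path is a placeholder and scale=15 is just the default):

from camelot.imgproc import (adaptive_threshold, find_lines,
                             find_table_contours, find_table_joints)

# "page-1.png" stands in for a PDF page rendered to an image.
img, threshold = adaptive_threshold("page-1.png", invert=False)
vmask, v_segments = find_lines(threshold, direction='vertical', scale=15)
hmask, h_segments = find_lines(threshold, direction='horizontal', scale=15)
contours = find_table_contours(vmask, hmask)
table_bbox = find_table_joints(contours, vmask, hmask)  # bbox -> joint coords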

camelot/lattice.py

@@ -4,15 +4,15 @@ import types
 import copy_reg
 import logging

-import cv2
-import numpy as np
 from wand.image import Image
+from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
+                      find_table_joints)
 from .table import Table
-from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values,
-                    get_row_index, get_column_index, get_score, reduce_index,
-                    outline, fill_spanning, count_empty, encode_list, pdf_to_text)
+from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox,
+                    detect_vertical, merge_close_values, get_row_index,
+                    get_column_index, get_score, reduce_index, outline,
+                    fill_spanning, count_empty, encode_list, pdf_to_text)

 __all__ = ['Lattice']
@@ -26,128 +26,6 @@ def _reduce_method(m):

 copy_reg.pickle(types.MethodType, _reduce_method)

-
-def _morph_transform(imagename, scale=15, invert=False):
-    """Morphological Transformation
-
-    Applies a series of morphological operations on the image
-    to find table contours and line segments.
-    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
-
-    Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2
-    taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-
-    Parameters
-    ----------
-    imagename : Path to image.
-
-    scale : int
-        Scaling factor. Large scaling factor leads to smaller lines
-        being detected. (optional, default: 15)
-
-    invert : bool
-        Invert pdf image to make sure that lines are in foreground.
-        (optional, default: False)
-
-    Returns
-    -------
-    img : ndarray
-
-    tables : dict
-        Dictionary with table bounding box as key and list of
-        joints found in the table as value.
-
-    v_segments : list
-        List of vertical line segments found in the image.
-
-    h_segments : list
-        List of horizontal line segments found in the image.
-    """
-    img = cv2.imread(imagename)
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-    if invert:
-        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
-            15, -0.2)
-    else:
-        threshold = cv2.adaptiveThreshold(
-            np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY,
-            15, -0.2)
-    vertical = threshold
-    horizontal = threshold
-    verticalsize = vertical.shape[0] // scale
-    horizontalsize = horizontal.shape[1] // scale
-    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
-    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
-    vertical = cv2.erode(vertical, ver, (-1, -1))
-    vertical = cv2.dilate(vertical, ver, (-1, -1))
-    horizontal = cv2.erode(horizontal, hor, (-1, -1))
-    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
-    mask = vertical + horizontal
-    joints = np.bitwise_and(vertical, horizontal)
-    try:
-        __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
-    tables = {}
-    for c in contours:
-        c_poly = cv2.approxPolyDP(c, 3, True)
-        x, y, w, h = cv2.boundingRect(c_poly)
-        roi = joints[y : y + h, x : x + w]
-        try:
-            __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        except ValueError:
-            jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        if len(jc) <= 4:  # remove contours with less than 4 joints
-            continue
-        joint_coords = []
-        for j in jc:
-            jx, jy, jw, jh = cv2.boundingRect(j)
-            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
-            joint_coords.append((c1, c2))
-        tables[(x, y + h, x + w, y)] = joint_coords
-    v_segments, h_segments = [], []
-    try:
-        _, vcontours, _ = cv2.findContours(
-            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        vcontours, _ = cv2.findContours(
-            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for vc in vcontours:
-        x, y, w, h = cv2.boundingRect(vc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-    try:
-        _, hcontours, _ = cv2.findContours(
-            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        hcontours, _ = cv2.findContours(
-            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for hc in hcontours:
-        x, y, w, h = cv2.boundingRect(hc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
-    return img, tables, v_segments, h_segments
-
-
 class Lattice:
     """Lattice algorithm
@@ -188,17 +66,17 @@ class Lattice:
         Dictionary with page number as key and list of tables on that
         page as value.
     """
-    def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
-                 invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
+    def __init__(self, table_area=None, fill=None, jtol=[2], mtol=[2], scale=15,
+                 invert=False, margins=(2.0, 0.5, 0.1), debug=None):
         self.method = 'lattice'
+        self.table_area = table_area
         self.fill = fill
-        self.scale = scale
         self.jtol = jtol
         self.mtol = mtol
+        self.scale = scale
         self.invert = invert
-        self.char_margin, self.line_margin, self.word_margin = pdf_margin
+        self.char_margin, self.line_margin, self.word_margin = margins
         self.debug = debug

     def get_tables(self, pdfname):
@@ -217,48 +95,79 @@ class Lattice:
             logging.warning("{0}: PDF has no text. It may be an image.".format(
                 os.path.basename(bname)))
             return None

         imagename = ''.join([bname, '.png'])
         with Image(filename=pdfname, depth=8, resolution=300) as png:
             png.save(filename=imagename)

+        img, threshold = adaptive_threshold(imagename, invert=self.invert)
         pdf_x = width
         pdf_y = height
-        img, table_bbox, v_segments, h_segments = _morph_transform(
-            imagename, scale=self.scale, invert=self.invert)
         img_x = img.shape[1]
         img_y = img.shape[0]
-        scaling_factor_x = pdf_x / float(img_x)
-        scaling_factor_y = pdf_y / float(img_y)
+        sc_x_image = img_x / float(pdf_x)
+        sc_y_image = img_y / float(pdf_y)
+        sc_x_pdf = pdf_x / float(img_x)
+        sc_y_pdf = pdf_y / float(img_y)
+        factors_image = (sc_x_image, sc_y_image, pdf_y)
+        factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
+        vmask, v_segments = find_lines(threshold, direction='vertical',
+                                       scale=self.scale)
+        hmask, h_segments = find_lines(threshold, direction='horizontal',
+                                       scale=self.scale)
+
+        if self.table_area:
+            if self.fill:
+                if len(self.table_area) != len(self.fill):
+                    raise ValueError("message")
+            if len(self.jtol) == 1 and self.jtol[0] == 2:
+                self.jtol = self.jtol * len(self.table_area)
+            if len(self.mtol) == 1 and self.mtol[0] == 2:
+                self.mtol = self.mtol * len(self.table_area)
+            areas = []
+            for area in self.table_area:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = int(x1)
+                y1 = int(y1)
+                x2 = int(x2)
+                y2 = int(y2)
+                x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
+                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            table_bbox = find_table_joints(areas, vmask, hmask)
+        else:
+            contours = find_table_contours(vmask, hmask)
+            table_bbox = find_table_joints(contours, vmask, hmask)

         if self.debug:
             self.debug_images = (img, table_bbox)

-        factors = (scaling_factor_x, scaling_factor_y, img_y)
-        table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
-                                                       h_segments, factors)
+        table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
+                                                          h_segments, factors_pdf)

         if self.debug:
             self.debug_segments = (v_segments, h_segments)
             self.debug_tables = []

-        pdf_page = {}
-        page_tables = {}
-        table_no = 1
+        page = {}
+        tables = {}
+        table_no = 0
         # sort tables based on y-coord
         for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
-            # select edges which lie within table_bbox
-            table_info = {}
+            # select elements which lie within table_bbox
+            table_data = {}
             v_s, h_s = segments_bbox(k, v_segments, h_segments)
             t_bbox = text_bbox(k, text)
-            table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
+            table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
             table_rotation = detect_vertical(t_bbox)
             cols, rows = zip(*table_bbox[k])
             cols, rows = list(cols), list(rows)
             cols.extend([k[0], k[2]])
             rows.extend([k[1], k[3]])
             # sort horizontal and vertical segments
-            cols = merge_close_values(sorted(cols), mtol=self.mtol)
+            cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
             rows = merge_close_values(
-                sorted(rows, reverse=True), mtol=self.mtol)
+                sorted(rows, reverse=True), mtol=self.mtol[table_no])
             # make grid using x and y coord of shortlisted rows and cols
             cols = [(cols[i], cols[i + 1])
                     for i in range(0, len(cols) - 1)]
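The two factor tuples above run in opposite directions: factors_image maps user-supplied PDF coordinates into image pixels before joint detection, while factors_pdf maps the detected geometry back into PDF space. A sketch of the per-area conversion performed above, with made-up page and image sizes:

from camelot.utils import scale_to_image

# Hypothetical sizes: a 612x792 pt page rendered to a 2550x3300 px image.
pdf_x, pdf_y, img_x, img_y = 612, 792, 2550, 3300
factors_image = (img_x / float(pdf_x), img_y / float(pdf_y), pdf_y)

area = "70,220,540,80"                          # "x1,y1,x2,y2" in PDF coordinates
x1, y1, x2, y2 = [int(c) for c in area.split(",")]
x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
areas = [(x1, y1, abs(x2 - x1), abs(y2 - y1))]  # (x, y, w, h) boxes for find_table_joints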
@@ -266,9 +175,9 @@ class Lattice:
             rows = [(rows[i], rows[i + 1])
                     for i in range(0, len(rows) - 1)]
             table = Table(cols, rows)
             # set table edges to True using ver+hor lines
-            table = table.set_edges(v_s, h_s, jtol=self.jtol)
+            table = table.set_edges(v_s, h_s, jtol=self.jtol[table_no])
             nouse = table.nocont_ / (len(v_s) + len(h_s))
-            table_info['line_p'] = 100 * (1 - nouse)
+            table_data['line_p'] = 100 * (1 - nouse)
             # set spanning cells to True
             table = table.set_spanning()
             # set table border edges to True
@@ -314,10 +223,10 @@
                                 for t in t_bbox]))
                 score = get_score([[50, rerror], [50, cerror]])

-            table_info['score'] = score
+            table_data['score'] = score

-            if self.fill is not None:
-                table = fill_spanning(table, fill=self.fill)
+            if self.fill:
+                table = fill_spanning(table, fill=self.fill[table_no])

             ar = table.get_list()
             if table_rotation == 'left':
                 ar = zip(*ar[::-1])
@@ -325,18 +234,18 @@
                 ar = zip(*ar[::1])
                 ar.reverse()
             ar = encode_list(ar)
-            table_info['data'] = ar
+            table_data['data'] = ar
             empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
-            table_info['empty_p'] = empty_p
-            table_info['r_nempty_cells'] = r_nempty_cells
-            table_info['c_nempty_cells'] = c_nempty_cells
-            table_info['nrows'] = len(ar)
-            table_info['ncols'] = len(ar[0])
-            page_tables['table_{0}'.format(table_no)] = table_info
+            table_data['empty_p'] = empty_p
+            table_data['r_nempty_cells'] = r_nempty_cells
+            table_data['c_nempty_cells'] = c_nempty_cells
+            table_data['nrows'] = len(ar)
+            table_data['ncols'] = len(ar[0])
+            tables['table-{0}'.format(table_no + 1)] = table_data
             table_no += 1
-        pdf_page[os.path.basename(bname)] = page_tables
+        page[os.path.basename(bname)] = tables

         if self.debug:
             return None

-        return pdf_page
+        return page
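In short, the new keyword arguments are list-valued so that each table area can carry its own fill and tolerance settings. A hypothetical call (the coordinates and the single-page file name are made up; the CLI below builds these lists from repeated -t/-j/-m flags):

from camelot.lattice import Lattice

# One table area, given as "x1,y1,x2,y2" in PDF coordinates.
parser = Lattice(table_area=["70,220,540,80"], fill=["v"], jtol=[2], mtol=[2])
tables = parser.get_tables("page-1.pdf")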

camelot/stream.py

@@ -7,7 +7,8 @@ import logging
 import numpy as np

 from .table import Table
-from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
+from .utils import (get_row_index, get_score, count_empty, encode_list,
+                    pdf_to_text, text_bbox)

 __all__ = ['Stream']
@@ -133,6 +134,17 @@ def _get_column_index(t, columns):
     return c_idx, error


+def _join_rows(rows_grouped, text_y_max, text_y_min):
+    row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
+                if len(r) > 0 else 0 for r in rows_grouped]
+    rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
+    rows.insert(0, text_y_max)
+    rows.append(text_y_min)
+    rows = [(rows[i], rows[i + 1])
+            for i in range(0, len(rows) - 1)]
+    return rows
+
+
 def _add_columns(cols, text, ytolerance):
     if text:
         text = _group_rows(text, ytol=ytolerance)
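The new _join_rows helper converts grouped text rows into row boundaries: each boundary is the midpoint between consecutive row centres, capped by the table's top and bottom. A small illustration with mock text lines (only y0 and y1 are read; real input comes from pdfminer):

from collections import namedtuple

from camelot.stream import _join_rows

TL = namedtuple("TL", "y0 y1")  # stand-in for a pdfminer text line
rows_grouped = [[TL(695.0, 705.0)], [TL(645.0, 655.0)], [TL(595.0, 605.0)]]
print(_join_rows(rows_grouped, 720, 580))
# [(720, 675.0), (675.0, 625.0), (625.0, 580)]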
@@ -143,14 +155,6 @@ def _add_columns(cols, text, ytolerance):
     return cols


-def _get_table_bounds(rows):
-    x0 = min([t.x0 for r in rows for t in r])
-    x1 = max([t.x1 for r in rows for t in r])
-    y0 = min([t.y0 for t in rows[-1]])
-    y1 = max([t.y1 for t in rows[0]])
-    return x0, x1, y0, y1
-
-
 def _join_columns(cols, text_x_min, text_x_max):
     cols = sorted(cols)
     cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
@@ -194,16 +198,16 @@ class Stream:
         Dictionary with page number as key and list of tables on that
         page as value.
     """
-    def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
-                 pdf_margin=(2.0, 0.5, 0.1), debug=False):
+    def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
+                 mtol=[2], margins=(2.0, 0.5, 0.1), debug=False):
         self.method = 'stream'
-        self.ncolumns = ncolumns
+        self.table_area = table_area
         self.columns = columns
+        self.ncolumns = ncolumns
         self.ytol = ytol
         self.mtol = mtol
-        self.char_margin, self.line_margin, self.word_margin = pdf_margin
+        self.char_margin, self.line_margin, self.word_margin = margins
         self.debug = debug

     def get_tables(self, pdfname):
@@ -222,106 +226,126 @@ class Stream:
             logging.warning("{0}: PDF has no text. It may be an image.".format(
                 os.path.basename(bname)))
             return None

-        text.sort(key=lambda x: (-x.y0, x.x0))

         if self.debug:
             self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
             return None

-        rows_grouped = _group_rows(text, ytol=self.ytol)
-        elements = [len(r) for r in rows_grouped]
-        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
-                    if len(r) > 0 else 0 for r in rows_grouped]
-        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
-        bounds = _get_table_bounds(rows_grouped)
-        rows.insert(0, bounds[3])
-        rows.append(bounds[2])
-        rows = [(rows[i], rows[i + 1])
-                for i in range(0, len(rows) - 1)]
-        guess = False
-        if self.columns:
-            # user has to input boundary columns too
-            # take (0, width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns.split(',')
-            cols = [(float(cols[i]), float(cols[i + 1]))
-                    for i in range(0, len(cols) - 1)]
-        else:
-            if self.ncolumns:
-                ncols = self.ncolumns
-                cols = [(t.x0, t.x1)
-                        for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=self.mtol)
-                if len(cols) != self.ncolumns:
-                    logging.warning("{}: The number of columns after merge"
-                                    " isn't the same as what you specified."
-                                    " Change the value of mtol.".format(
-                                    os.path.basename(bname)))
-                cols = _join_columns(cols, bounds[0], bounds[1])
-            else:
-                guess = True
-                ncols = max(set(elements), key=elements.count)
-                len_non_mode = len(filter(lambda x: x != ncols, elements))
-                if ncols == 1 and not self.debug:
-                    # no tables detected
-                    logging.warning("{}: Only one column was detected, the PDF"
-                                    " may have no tables. Specify ncols if"
-                                    " the PDF has tables.".format(
-                                    os.path.basename(bname)))
-                cols = [(t.x0, t.x1)
-                        for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=self.mtol)
-                inner_text = []
-                for i in range(1, len(cols)):
-                    left = cols[i - 1][1]
-                    right = cols[i][0]
-                    inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
-                outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
-                inner_text.extend(outer_text)
-                cols = _add_columns(cols, inner_text, self.ytol)
-                cols = _join_columns(cols, bounds[0], bounds[1])
-
-        pdf_page = {}
-        page_tables = {}
-        table_info = {}
-        table = Table(cols, rows)
-        rerror = []
-        cerror = []
-        for row in rows_grouped:
-            for t in row:
-                try:
-                    r_idx, rass_error = get_row_index(t, rows)
-                except ValueError as e:
-                    # couldn't assign LTTextLH to any cell
-                    vprint(e.message)
-                    continue
-                try:
-                    c_idx, cass_error = _get_column_index(t, cols)
-                except ValueError as e:
-                    # couldn't assign LTTextLH to any cell
-                    vprint(e.message)
-                    continue
-                rerror.append(rass_error)
-                cerror.append(cass_error)
-                table.cells[r_idx][c_idx].add_text(
-                    t.get_text().strip('\n'))
-        if guess:
-            score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
-        else:
-            score = get_score([[50, rerror], [50, cerror]])
-        table_info['score'] = score
-        ar = table.get_list()
-        ar = encode_list(ar)
-        table_info['data'] = ar
-        empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
-        table_info['empty_p'] = empty_p
-        table_info['r_nempty_cells'] = r_nempty_cells
-        table_info['c_nempty_cells'] = c_nempty_cells
-        table_info['nrows'] = len(ar)
-        table_info['ncols'] = len(ar[0])
-        page_tables['table_1'] = table_info
-        pdf_page[os.path.basename(bname)] = page_tables
-        return pdf_page
+        if self.table_area:
+            if self.columns:
+                if len(self.table_area) != len(self.columns):
+                    raise ValueError("message")
+            if self.ncolumns:
+                if len(self.table_area) != len(self.ncolumns):
+                    raise ValueError("message")
+            if len(self.ytol) == 1 and self.ytol[0] == 2:
+                self.ytol = self.ytol * len(self.table_area)
+            if len(self.mtol) == 1 and self.mtol[0] == 2:
+                self.mtol = self.mtol * len(self.table_area)
+            table_bbox = {}
+            for area in self.table_area:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = int(x1)
+                y1 = int(y1)
+                x2 = int(x2)
+                y2 = int(y2)
+                table_bbox[(x1, y2, x2, y1)] = None
+        else:
+            table_bbox = {(0, height, width, 0): None}
+
+        page = {}
+        tables = {}
+        table_no = 0
+        # sort tables based on y-coord
+        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
+            # select elements which lie within table_bbox
+            table_data = {}
+            t_bbox = text_bbox(k, text)
+            t_bbox.sort(key=lambda x: (-x.y0, x.x0))
+            rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
+            rows = _join_rows(rows_grouped, k[3], k[1])
+            elements = [len(r) for r in rows_grouped]
+            guess = False
+            if self.columns and self.columns[table_no] != "":
+                # user has to input boundary columns too
+                # take (0, width) by default
+                # similar to else condition
+                # len can't be 1
+                cols = self.columns[table_no].split(',')
+                cols = [(float(cols[i]), float(cols[i + 1]))
+                        for i in range(0, len(cols) - 1)]
+            else:
+                if self.ncolumns and self.ncolumns[table_no] != -1:
+                    ncols = self.ncolumns[table_no]
+                    cols = [(t.x0, t.x1)
+                            for r in rows_grouped if len(r) == ncols for t in r]
+                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
+                    if len(cols) != self.ncolumns[table_no]:
+                        logging.warning("{}: The number of columns after merge"
+                                        " isn't the same as what you specified."
+                                        " Change the value of mtol.".format(
+                                        os.path.basename(bname)))
+                    cols = _join_columns(cols, k[0], k[2])
+                else:
+                    guess = True
+                    ncols = max(set(elements), key=elements.count)
+                    len_non_mode = len(filter(lambda x: x != ncols, elements))
+                    if ncols == 1 and not self.debug:
+                        # no tables detected
+                        logging.warning("{}: Only one column was detected, the PDF"
+                                        " may have no tables. Specify ncols if"
+                                        " the PDF has tables.".format(
+                                        os.path.basename(bname)))
+                    cols = [(t.x0, t.x1)
+                            for r in rows_grouped if len(r) == ncols for t in r]
+                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
+                    inner_text = []
+                    for i in range(1, len(cols)):
+                        left = cols[i - 1][1]
+                        right = cols[i][0]
+                        inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
+                    outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
+                    inner_text.extend(outer_text)
+                    cols = _add_columns(cols, inner_text, self.ytol[table_no])
+                    cols = _join_columns(cols, k[0], k[2])
+
+            table = Table(cols, rows)
+            rerror = []
+            cerror = []
+            for row in rows_grouped:
+                for t in row:
+                    try:
+                        r_idx, rass_error = get_row_index(t, rows)
+                    except ValueError as e:
+                        # couldn't assign LTTextLH to any cell
+                        continue
+                    try:
+                        c_idx, cass_error = _get_column_index(t, cols)
+                    except ValueError as e:
+                        # couldn't assign LTTextLH to any cell
+                        continue
+                    rerror.append(rass_error)
+                    cerror.append(cass_error)
+                    table.cells[r_idx][c_idx].add_text(
+                        t.get_text().strip('\n'))
+            if guess:
+                score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+            else:
+                score = get_score([[50, rerror], [50, cerror]])
+            table_data['score'] = score
+            ar = encode_list(table.get_list())
+            table_data['data'] = ar
+            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+            table_data['empty_p'] = empty_p
+            table_data['r_nempty_cells'] = r_nempty_cells
+            table_data['c_nempty_cells'] = c_nempty_cells
+            table_data['nrows'] = len(ar)
+            table_data['ncols'] = len(ar[0])
+            tables['table-{0}'.format(table_no + 1)] = table_data
+            table_no += 1
+        page[os.path.basename(bname)] = tables
+
+        return page
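As with Lattice, the Stream options become parallel lists with one entry per table area; an empty string for columns and -1 for ncolumns mean "not specified" for that area. A hypothetical call:

from camelot.stream import Stream

# One area covering the lower half of a US-letter page, holding a 4-column table.
parser = Stream(table_area=["0,396,612,0"], ncolumns=[4], ytol=[2], mtol=[2])
tables = parser.get_tables("page-1.pdf")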

camelot/utils.py

@@ -81,7 +81,17 @@ def rotate(x1, y1, x2, y2, angle):
     return xnew, ynew


-def transform(tables, v_segments, h_segments, factors):
+def scale_to_image(k, factors):
+    x1, y1, x2, y2 = k
+    scaling_factor_x, scaling_factor_y, pdf_y = factors
+    x1 = scale(x1, scaling_factor_x)
+    y1 = scale(abs(translate(-pdf_y, y1)), scaling_factor_y)
+    x2 = scale(x2, scaling_factor_x)
+    y2 = scale(abs(translate(-pdf_y, y2)), scaling_factor_y)
+    return int(x1), int(y1), int(x2), int(y2)
+
+
+def scale_to_pdf(tables, v_segments, h_segments, factors):
     """Translates and scales OpenCV coordinates to PDFMiner coordinate
     space.
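A quick worked example of the new helper, assuming (as the older transform code suggests) that scale multiplies by a factor and translate shifts by an offset; the sizes are illustrative:

from camelot.utils import scale_to_image

# A 612x792 pt page rendered to a 2550x3300 px image.
factors_image = (2550 / 612.0, 3300 / 792.0, 792)
print(scale_to_image((70, 220, 540, 80), factors_image))
# (291, 2383, 2250, 2966) -- PDF points mapped to image pixels, with y flipped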

camelot command-line script

@@ -40,9 +40,9 @@ options:
     -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                               if distance between words is greater than word
                               margin. [default: 0.1]
-    -S, --save-info           Save parsing info for each page to a file.
+    -S, --print-stats         List stats on the parsing process.
+    -T, --save-stats          Save stats to a file.
     -X, --plot <dist>         Plot distributions. (page,all,rc)
-    -Z, --summary             Summarize metrics.

 camelot methods:
     lattice    Looks for lines between data.
@@ -55,19 +55,21 @@
 Lattice method looks for lines between text to form a table.

 usage:
-    camelot lattice [options] [--] <file>
+    camelot lattice [-t <tarea>...] [-F <fill>...] [-j <jtol>...]
+                    [-m <mtol>...] [options] [--] <file>

 options:
+    -t, --tarea <tarea>       Specific table areas to analyze.
     -F, --fill <fill>         Fill data in horizontal and/or vertical spanning
                               cells. Example: -F h, -F v, -F hv
-    -s, --scale <scale>       Scaling factor. Large scaling factor leads to
-                              smaller lines being detected. [default: 15]
-    -i, --invert              Invert pdf image to make sure that lines are
-                              in foreground.
     -j, --jtol <jtol>         Tolerance to account for when comparing joint
                               and line coordinates. [default: 2]
     -m, --mtol <mtol>         Tolerance to account for when merging lines
                               which are very close. [default: 2]
+    -s, --scale <scale>       Scaling factor. Large scaling factor leads to
+                              smaller lines being detected. [default: 15]
+    -i, --invert              Invert pdf image to make sure that lines are
+                              in foreground.
     -d, --debug <debug>       Debug by visualizing pdf geometry.
                               (contour,line,joint,table) Example: -d table
 """
@@ -76,12 +78,14 @@ stream_doc = """
 Stream method looks for whitespaces between text to form a table.

 usage:
-    camelot stream [options] [--] <file>
+    camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...]
+                   [-m <mtol>...] [options] [--] <file>

 options:
-    -n, --ncols <ncols>       Number of columns. [default: 0]
+    -t, --tarea <tarea>       Specific table areas to analyze.
     -c, --columns <columns>   Comma-separated list of column x-coordinates.
                               Example: -c 10.1,20.2,30.3
+    -n, --ncols <ncols>       Number of columns. [default: -1]
     -y, --ytol <ytol>         Tolerance to account for when grouping rows
                               together. [default: 2]
     -m, --mtol <mtol>         Tolerance to account for when merging columns
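Similarly for stream, a hypothetical invocation marking one table area that holds a four-column table:

    camelot stream -t "0,396,612,0" -n 4 file.pdf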
@@ -166,7 +170,7 @@ def plot_rc_piechart(data, output):
     plt.savefig(''.join([output, '_rc.png']), dpi=300)


-def summary(data, p_time):
+def print_stats(data, p_time):
     from operator import itemgetter
     from itertools import groupby
@@ -331,17 +335,18 @@ if __name__ == '__main__':
         else:
             p.append({'start': int(r), 'end': int(r)})

-    margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
+    margins = (float(args['--cmargin']), float(args['--lmargin']),
               float(args['--wmargin']))

     if args['<method>'] == 'lattice':
         try:
             manager = Pdf(Lattice(
+                table_area=args['--tarea'],
                 fill=args['--fill'],
+                jtol=[int(j) for j in args['--jtol']],
+                mtol=[int(m) for m in args['--mtol']],
                 scale=int(args['--scale']),
                 invert=args['--invert'],
-                jtol=int(args['--jtol']),
-                mtol=int(args['--mtol']),
-                pdf_margin=margin_tuple,
+                margins=margins,
                 debug=args['--debug']),
                 filename,
                 pagenos=p,
@@ -374,10 +379,10 @@ if __name__ == '__main__':
             if 'rc' in plot_type:
                 plot_rc_piechart(data, pngname)

-        if args['--summary']:
-            summary(data, processing_time)
+        if args['--print-stats']:
+            print_stats(data, processing_time)

-        if args['--save-info']:
+        if args['--save-stats']:
             if args['--output']:
                 scorename = os.path.join(args['--output'], os.path.basename(scorename))
             with open(scorename, 'w') as score_file:
@@ -402,11 +407,12 @@ if __name__ == '__main__':
     elif args['<method>'] == 'stream':
         try:
             manager = Pdf(Stream(
-                ncolumns=int(args['--ncols']),
+                table_area=args['--tarea'],
                 columns=args['--columns'],
-                ytol=int(args['--ytol']),
-                mtol=int(args['--mtol']),
-                pdf_margin=margin_tuple,
+                ncolumns=[int(nc) for nc in args['--ncols']],
+                ytol=[int(y) for y in args['--ytol']],
+                mtol=[int(m) for m in args['--mtol']],
+                margins=margins,
                 debug=args['--debug']),
                 filename,
                 pagenos=p,
@@ -439,10 +445,10 @@ if __name__ == '__main__':
             if 'rc' in plot_type:
                 plot_rc_piechart(data, pngname)

-        if args['--summary']:
-            summary(data, processing_time)
+        if args['--print-stats']:
+            print_stats(data, processing_time)

-        if args['--save-info']:
+        if args['--save-stats']:
             if args['--output']:
                 scorename = os.path.join(args['--output'], os.path.basename(scorename))
             with open(scorename, 'w') as score_file: