Add table_area

[MRG] Add table_area
2016-09-05 18:51:59 +05:30 · 2016-09-05 18:51:59 +05:30 · d86630e70b
parent 0bb6ce0bf9
commit d86630e70b
6 changed files with 343 additions and 296 deletions
--- a/camelot/hybrid.py
+++ b/camelot/hybrid.py
--- a/camelot/imgproc.py
+++ b/camelot/imgproc.py
@ -0,0 +1,98 @@
+import cv2
+import numpy as np
+
+
+def adaptive_threshold(imagename, invert=False):
+    """Read an image and binarize it with Gaussian adaptive thresholding.
+
+    Parameters
+    ----------
+    imagename : Path to image.
+
+    invert : bool
+        If True the grayscale image is thresholded as it is, otherwise
+        its bitwise inverse is thresholded.
+        (optional, default: False)
+
+    Returns
+    -------
+    img : ndarray
+        Image as returned by cv2.imread.
+
+    threshold : ndarray
+        Output of cv2.adaptiveThreshold (maxValue=255, blockSize=15,
+        C=-0.2).
+    """
+    img = cv2.imread(imagename)
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Both modes share the same thresholding call; only its input differs.
+    source = gray if invert else np.invert(gray)
+    threshold = cv2.adaptiveThreshold(
+        source, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
+        15, -0.2)
+    return img, threshold
+
+
+def find_lines(threshold, direction=None, scale=15):
+    """Detect line segments along one direction of a binary image.
+
+    The image is eroded and then dilated with a one pixel wide
+    rectangular structuring element. Each external contour that is left
+    is reduced to a segment running through the middle of its bounding
+    box.
+
+    Parameters
+    ----------
+    threshold : ndarray
+        Binary image to search for lines.
+
+    direction : str
+        Either 'vertical' or 'horizontal'.
+
+    scale : int
+        Scaling factor. The structuring element is as long as the image
+        height (vertical) or width (horizontal) divided by scale, so a
+        large scaling factor leads to smaller lines being detected.
+        (optional, default: 15)
+
+    Returns
+    -------
+    dmask : ndarray
+        The image after erosion and dilation.
+
+    lines : list
+        Tuples in image coordinates: (x_mid, y2, x_mid, y1) for
+        vertical lines and (x1, y_mid, x2, y_mid) for horizontal lines,
+        where (x1, y1) and (x2, y2) are opposite bounding box corners.
+
+    Raises
+    ------
+    ValueError
+        If direction is neither 'vertical' nor 'horizontal'.
+    """
+    if direction == 'vertical':
+        size = threshold.shape[0] // scale
+        el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
+    elif direction == 'horizontal':
+        size = threshold.shape[1] // scale
+        el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
+    else:
+        # covers None as well as any unsupported value, which would
+        # otherwise leave the structuring element undefined
+        raise ValueError("Specify direction as either 'vertical' or"
+                         " 'horizontal'")
+
+    threshold = cv2.erode(threshold, el, (-1, -1))
+    threshold = cv2.dilate(threshold, el, (-1, -1))
+
+    # findContours may modify its source image (older OpenCV releases),
+    # so the mask handed back to the caller has to be a real copy
+    dmask = threshold.copy()
+
+    # findContours returns two or three values depending on the OpenCV
+    # version; the contours are always the second to last one. Indexing
+    # avoids running the detection twice on the same image.
+    contours = cv2.findContours(
+        threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+
+    lines = []
+    for c in contours:
+        x, y, w, h = cv2.boundingRect(c)
+        x1, x2 = x, x + w
+        y1, y2 = y, y + h
+        if direction == 'vertical':
+            lines.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
+        else:
+            lines.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
+
+    return dmask, lines
+
+
+def find_table_contours(vertical, horizontal):
+    """Find bounding boxes of the largest regions enclosed by lines.
+
+    Parameters
+    ----------
+    vertical : ndarray
+        Mask of vertical lines.
+
+    horizontal : ndarray
+        Mask of horizontal lines.
+
+    Returns
+    -------
+    cont : list
+        (x, y, w, h) bounding boxes of at most the ten external
+        contours with the largest area found in the sum of both masks,
+        largest first.
+    """
+    mask = vertical + horizontal
+
+    # findContours returns two or three values depending on the OpenCV
+    # version; the contours are always the second to last one. Indexing
+    # avoids a second detection pass over a mask that the first pass may
+    # already have modified.
+    contours = cv2.findContours(
+        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
+    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
+
+    cont = []
+    for c in contours:
+        # simplify the outline before taking its bounding box
+        c_poly = cv2.approxPolyDP(c, 3, True)
+        x, y, w, h = cv2.boundingRect(c_poly)
+        cont.append((x, y, w, h))
+    return cont
+        
+
+def find_table_joints(contours, vertical, horizontal):
+    """Find line intersections (joints) inside each candidate region.
+
+    Parameters
+    ----------
+    contours : list
+        (x, y, w, h) regions in image coordinates.
+
+    vertical : ndarray
+        Mask of vertical lines.
+
+    horizontal : ndarray
+        Mask of horizontal lines.
+
+    Returns
+    -------
+    tables : dict
+        Dictionary with (x, y + h, x + w, y) of a region as key and the
+        list of joint centres found in it as value. Regions with four
+        or fewer joints are left out.
+    """
+    # a joint is a pixel that is set in both line masks
+    joints = np.bitwise_and(vertical, horizontal)
+    tables = {}
+    for c in contours:
+        x, y, w, h = c
+        roi = joints[y : y + h, x : x + w]
+        # findContours returns two or three values depending on the
+        # OpenCV version; the contours are always the second to last
+        # one. Indexing avoids running the detection twice on roi.
+        jc = cv2.findContours(
+            roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]
+        if len(jc) <= 4:  # skip regions with four or fewer joints
+            continue
+        joint_coords = []
+        for j in jc:
+            jx, jy, jw, jh = cv2.boundingRect(j)
+            # bounding box centre, shifted from roi to image coordinates
+            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
+            joint_coords.append((c1, c2))
+        tables[(x, y + h, x + w, y)] = joint_coords
+
+    return tables
--- a/camelot/lattice.py
+++ b/camelot/lattice.py
@ -4,15 +4,15 @@ import types
 import copy_reg
 import logging

-import cv2
-import numpy as np
-
 from wand.image import Image

+from .imgproc import (adaptive_threshold, find_lines, find_table_contours,
+                      find_table_joints)
 from .table import Table
-from .utils import (transform, segments_bbox, text_bbox, detect_vertical, merge_close_values,
-                    get_row_index, get_column_index, get_score, reduce_index,
-                    outline, fill_spanning, count_empty, encode_list, pdf_to_text)
+from .utils import (scale_to_pdf, scale_to_image, segments_bbox, text_bbox,
+                    detect_vertical, merge_close_values, get_row_index,
+                    get_column_index, get_score, reduce_index, outline,
+                    fill_spanning, count_empty, encode_list, pdf_to_text)


 __all__ = ['Lattice']
@ -26,128 +26,6 @@ def _reduce_method(m):
 copy_reg.pickle(types.MethodType, _reduce_method)


-def _morph_transform(imagename, scale=15, invert=False):
-    """Morphological Transformation
-
-    Applies a series of morphological operations on the image
-    to find table contours and line segments.
-    http://answers.opencv.org/question/63847/how-to-extract-tables-from-an-image/
-
-    Empirical result for adaptiveThreshold's blockSize=5 and C=-0.2
-    taken from http://pequan.lip6.fr/~bereziat/pima/2012/seuillage/sezgin04.pdf
-
-    Parameters
-    ----------
-    imagename : Path to image.
-
-    scale : int
-        Scaling factor. Large scaling factor leads to smaller lines
-        being detected. (optional, default: 15)
-
-    invert : bool
-        Invert pdf image to make sure that lines are in foreground.
-        (optional, default: False)
-
-    Returns
-    -------
-    img : ndarray
-
-    tables : dict
-        Dictionary with table bounding box as key and list of
-        joints found in the table as value.
-
-    v_segments : list
-        List of vertical line segments found in the image.
-
-    h_segments : list
-        List of horizontal line segments found in the image.
-    """
-    img = cv2.imread(imagename)
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
-    if invert:
-        threshold = cv2.adaptiveThreshold(
-            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
-            15, -0.2)
-    else:
-        threshold = cv2.adaptiveThreshold(
-            np.invert(gray), 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY,
-            15, -0.2)
-
-    vertical = threshold
-    horizontal = threshold
-
-    verticalsize = vertical.shape[0] // scale
-    horizontalsize = horizontal.shape[1] // scale
-
-    ver = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))
-    hor = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontalsize, 1))
-
-    vertical = cv2.erode(vertical, ver, (-1, -1))
-    vertical = cv2.dilate(vertical, ver, (-1, -1))
-
-    horizontal = cv2.erode(horizontal, hor, (-1, -1))
-    horizontal = cv2.dilate(horizontal, hor, (-1, -1))
-
-    mask = vertical + horizontal
-    joints = np.bitwise_and(vertical, horizontal)
-    try:
-        __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
-
-    tables = {}
-    for c in contours:
-        c_poly = cv2.approxPolyDP(c, 3, True)
-        x, y, w, h = cv2.boundingRect(c_poly)
-        roi = joints[y : y + h, x : x + w]
-        try:
-            __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        except ValueError:
-            jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
-        if len(jc) <= 4:  # remove contours with less than <=4 joints
-            continue
-        joint_coords = []
-        for j in jc:
-            jx, jy, jw, jh = cv2.boundingRect(j)
-            c1, c2 = x + (2 * jx + jw) / 2, y + (2 * jy + jh) / 2
-            joint_coords.append((c1, c2))
-        tables[(x, y + h, x + w, y)] = joint_coords
-
-    v_segments, h_segments = [], []
-    try:
-        _, vcontours, _ = cv2.findContours(
-            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        vcontours, _ = cv2.findContours(
-            vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for vc in vcontours:
-        x, y, w, h = cv2.boundingRect(vc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        v_segments.append(((x1 + x2) / 2, y2, (x1 + x2) / 2, y1))
-
-    try:
-        _, hcontours, _ = cv2.findContours(
-            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    except ValueError:
-        hcontours, _ = cv2.findContours(
-            horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    for hc in hcontours:
-        x, y, w, h = cv2.boundingRect(hc)
-        x1, x2 = x, x + w
-        y1, y2 = y, y + h
-        h_segments.append((x1, (y1 + y2) / 2, x2, (y1 + y2) / 2))
-
-    return img, tables, v_segments, h_segments
-
-
 class Lattice:
    """Lattice algorithm

@ -188,17 +66,17 @@ class Lattice:
        Dictionary with page number as key and list of tables on that
        page as value.
    """
-
-    def __init__(self, fill=None, scale=15, jtol=2, mtol=2,
-                 invert=False, pdf_margin=(2.0, 0.5, 0.1), debug=None):
+    def __init__(self, table_area=None, fill=None, jtol=[2], mtol=[2], scale=15,
+                 invert=False, margins=(2.0, 0.5, 0.1), debug=None):

        self.method = 'lattice'
+        self.table_area = table_area
        self.fill = fill
-        self.scale = scale
        self.jtol = jtol
        self.mtol = mtol
+        self.scale = scale
        self.invert = invert
-        self.char_margin, self.line_margin, self.word_margin = pdf_margin
+        self.char_margin, self.line_margin, self.word_margin = margins
        self.debug = debug

    def get_tables(self, pdfname):
@ -217,48 +95,79 @@ class Lattice:
            logging.warning("{0}: PDF has no text. It may be an image.".format(
                os.path.basename(bname)))
            return None
+
        imagename = ''.join([bname, '.png'])
        with Image(filename=pdfname, depth=8, resolution=300) as png:
            png.save(filename=imagename)
+
+        img, threshold = adaptive_threshold(imagename, invert=self.invert)
        pdf_x = width
        pdf_y = height
-        img, table_bbox, v_segments, h_segments = _morph_transform(
-            imagename, scale=self.scale, invert=self.invert)
        img_x = img.shape[1]
        img_y = img.shape[0]
-        scaling_factor_x = pdf_x / float(img_x)
-        scaling_factor_y = pdf_y / float(img_y)
+        sc_x_image = img_x / float(pdf_x)
+        sc_y_image = img_y / float(pdf_y)
+        sc_x_pdf = pdf_x / float(img_x)
+        sc_y_pdf = pdf_y / float(img_y)
+        factors_image = (sc_x_image, sc_y_image, pdf_y)
+        factors_pdf = (sc_x_pdf, sc_y_pdf, img_y)
+
+        vmask, v_segments = find_lines(threshold, direction='vertical',
+            scale=self.scale)
+        hmask, h_segments = find_lines(threshold, direction='horizontal',
+            scale=self.scale)
+
+        if self.table_area:
+            if self.fill:
+                if len(self.table_area) != len(self.fill):
+                    raise ValueError("message")
+            if len(self.jtol) == 1 and self.jtol[0] == 2:
+                self.jtol = self.jtol * len(self.table_area)
+            if len(self.mtol) == 1 and self.mtol[0] == 2:
+                self.mtol = self.mtol * len(self.table_area)
+            areas = []
+            for area in self.table_area:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = int(x1)
+                y1 = int(y1)
+                x2 = int(x2)
+                y2 = int(y2)
+                x1, y1, x2, y2 = scale_to_image((x1, y1, x2, y2), factors_image)
+                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            table_bbox = find_table_joints(areas, vmask, hmask)
+        else:
+            contours = find_table_contours(vmask, hmask)
+            table_bbox = find_table_joints(contours, vmask, hmask)

        if self.debug:
            self.debug_images = (img, table_bbox)

-        factors = (scaling_factor_x, scaling_factor_y, img_y)
-        table_bbox, v_segments, h_segments = transform(table_bbox, v_segments,
-                                                       h_segments, factors)
+        table_bbox, v_segments, h_segments = scale_to_pdf(table_bbox, v_segments,
+            h_segments, factors_pdf)

        if self.debug:
            self.debug_segments = (v_segments, h_segments)
            self.debug_tables = []

-        pdf_page = {}
-        page_tables = {}
-        table_no = 1
+        page = {}
+        tables = {}
+        table_no = 0
        # sort tables based on y-coord
        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
-            # select edges which lie within table_bbox
-            table_info = {}
+            # select elements which lie within table_bbox
+            table_data = {}
            v_s, h_s = segments_bbox(k, v_segments, h_segments)
            t_bbox = text_bbox(k, text)
-            table_info['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
+            table_data['text_p'] = 100 * (1 - (len(t_bbox) / len(text)))
            table_rotation = detect_vertical(t_bbox)
            cols, rows = zip(*table_bbox[k])
            cols, rows = list(cols), list(rows)
            cols.extend([k[0], k[2]])
            rows.extend([k[1], k[3]])
            # sort horizontal and vertical segments
-            cols = merge_close_values(sorted(cols), mtol=self.mtol)
+            cols = merge_close_values(sorted(cols), mtol=self.mtol[table_no])
            rows = merge_close_values(
-                sorted(rows, reverse=True), mtol=self.mtol)
+                sorted(rows, reverse=True), mtol=self.mtol[table_no])
            # make grid using x and y coord of shortlisted rows and cols
            cols = [(cols[i], cols[i + 1])
                    for i in range(0, len(cols) - 1)]
@ -266,9 +175,9 @@ class Lattice:
                    for i in range(0, len(rows) - 1)]
            table = Table(cols, rows)
            # set table edges to True using ver+hor lines
-            table = table.set_edges(v_s, h_s, jtol=self.jtol)
+            table = table.set_edges(v_s, h_s, jtol=self.jtol[table_no])
            nouse = table.nocont_ / (len(v_s) + len(h_s))
-            table_info['line_p'] = 100 * (1 - nouse)
+            table_data['line_p'] = 100 * (1 - nouse)
            # set spanning cells to True
            table = table.set_spanning()
            # set table border edges to True
@ -314,10 +223,10 @@ class Lattice:
                        for t in t_bbox]))

            score = get_score([[50, rerror], [50, cerror]])
-            table_info['score'] = score
+            table_data['score'] = score

-            if self.fill is not None:
-                table = fill_spanning(table, fill=self.fill)
+            if self.fill:
+                table = fill_spanning(table, fill=self.fill[table_no])
            ar = table.get_list()
            if table_rotation == 'left':
                ar = zip(*ar[::-1])
@ -325,18 +234,18 @@ class Lattice:
                ar = zip(*ar[::1])
                ar.reverse()
            ar = encode_list(ar)
-            table_info['data'] = ar
+            table_data['data'] = ar
            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
-            table_info['empty_p'] = empty_p
-            table_info['r_nempty_cells'] = r_nempty_cells
-            table_info['c_nempty_cells'] = c_nempty_cells
-            table_info['nrows'] = len(ar)
-            table_info['ncols'] = len(ar[0])
-            page_tables['table_{0}'.format(table_no)] = table_info
+            table_data['empty_p'] = empty_p
+            table_data['r_nempty_cells'] = r_nempty_cells
+            table_data['c_nempty_cells'] = c_nempty_cells
+            table_data['nrows'] = len(ar)
+            table_data['ncols'] = len(ar[0])
+            tables['table-{0}'.format(table_no + 1)] = table_data
            table_no += 1
-        pdf_page[os.path.basename(bname)] = page_tables
+        page[os.path.basename(bname)] = tables

        if self.debug:
            return None

-        return pdf_page
+        return page
--- a/camelot/stream.py
+++ b/camelot/stream.py
@ -7,7 +7,8 @@ import logging
 import numpy as np

 from .table import Table
-from .utils import get_row_index, get_score, count_empty, encode_list, pdf_to_text
+from .utils import (get_row_index, get_score, count_empty, encode_list,
+                    pdf_to_text, text_bbox)


 __all__ = ['Stream']
@ -133,6 +134,17 @@ def _get_column_index(t, columns):
    return c_idx, error


+def _join_rows(rows_grouped, text_y_max, text_y_min):
+    """Turn grouped text rows into a list of row boundaries.
+
+    Parameters
+    ----------
+    rows_grouped : list
+        List of rows, each a list of objects with y0 and y1 attributes.
+
+    text_y_max : first boundary of the first row.
+
+    text_y_min : last boundary of the last row.
+
+    Returns
+    -------
+    rows : list
+        One (start, end) tuple per pair of consecutive boundaries. Inner
+        boundaries lie halfway between the mean vertical centres of
+        neighbouring rows; an empty row counts as centre 0.
+    """
+    row_mids = []
+    for group in rows_grouped:
+        if len(group) > 0:
+            centres = [(t.y0 + t.y1) / 2 for t in group]
+            row_mids.append(sum(centres) / len(group))
+        else:
+            row_mids.append(0)
+
+    boundaries = [text_y_max]
+    for previous, current in zip(row_mids, row_mids[1:]):
+        boundaries.append((current + previous) / 2)
+    boundaries.append(text_y_min)
+
+    return list(zip(boundaries, boundaries[1:]))
+
+
 def _add_columns(cols, text, ytolerance):
    if text:
        text = _group_rows(text, ytol=ytolerance)
@ -143,14 +155,6 @@ def _add_columns(cols, text, ytolerance):
    return cols


-def _get_table_bounds(rows):
-    x0 = min([t.x0 for r in rows for t in r])
-    x1 = max([t.x1 for r in rows for t in r])
-    y0 = min([t.y0 for t in rows[-1]])
-    y1 = max([t.y1 for t in rows[0]])
-    return x0, x1, y0, y1
-
-
 def _join_columns(cols, text_x_min, text_x_max):
    cols = sorted(cols)
    cols = [(cols[i][0] + cols[i - 1][1]) / 2 for i in range(1, len(cols))]
@ -194,16 +198,16 @@ class Stream:
        Dictionary with page number as key and list of tables on that
        page as value.
    """
-
-    def __init__(self, ncolumns=0, columns=None, ytol=2, mtol=2,
-                 pdf_margin=(2.0, 0.5, 0.1), debug=False):
+    def __init__(self, table_area=None, columns=None, ncolumns=None, ytol=[2],
+                 mtol=[2], margins=(2.0, 0.5, 0.1), debug=False):

        self.method = 'stream'
-        self.ncolumns = ncolumns
+        self.table_area = table_area
        self.columns = columns
+        self.ncolumns = ncolumns
        self.ytol = ytol
        self.mtol = mtol
-        self.char_margin, self.line_margin, self.word_margin = pdf_margin
+        self.char_margin, self.line_margin, self.word_margin = margins
        self.debug = debug

    def get_tables(self, pdfname):
@ -222,106 +226,126 @@ class Stream:
            logging.warning("{0}: PDF has no text. It may be an image.".format(
                os.path.basename(bname)))
            return None
-        text.sort(key=lambda x: (-x.y0, x.x0))

        if self.debug:
            self.debug_text = [(t.x0, t.y0, t.x1, t.y1) for t in text]
            return None

-        rows_grouped = _group_rows(text, ytol=self.ytol)
-        elements = [len(r) for r in rows_grouped]
-        row_mids = [sum([(t.y0 + t.y1) / 2 for t in r]) / len(r)
-                    if len(r) > 0 else 0 for r in rows_grouped]
-        rows = [(row_mids[i] + row_mids[i - 1]) / 2 for i in range(1, len(row_mids))]
-        bounds = _get_table_bounds(rows_grouped)
-        rows.insert(0, bounds[3])
-        rows.append(bounds[2])
-        rows = [(rows[i], rows[i + 1])
-                for i in range(0, len(rows) - 1)]
-
-        guess = False
-        if self.columns:
-            # user has to input boundary columns too
-            # take (0, width) by default
-            # similar to else condition
-            # len can't be 1
-            cols = self.columns.split(',')
-            cols = [(float(cols[i]), float(cols[i + 1]))
-                    for i in range(0, len(cols) - 1)]
-        else:
+        if self.table_area:
+            if self.columns:
+                if len(self.table_area) != len(self.columns):
+                    raise ValueError("message")
            if self.ncolumns:
-                ncols = self.ncolumns
-                cols = [(t.x0, t.x1)
-                    for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=self.mtol)
-                if len(cols) != self.ncolumns:
-                    logging.warning("{}: The number of columns after merge"
-                                  " isn't the same as what you specified."
-                                  " Change the value of mtol.".format(
-                                  os.path.basename(bname)))
-                cols = _join_columns(cols, bounds[0], bounds[1])
-            else:
-                guess = True
-                ncols = max(set(elements), key=elements.count)
-                len_non_mode = len(filter(lambda x: x != ncols, elements))
-                if ncols == 1 and not self.debug:
-                    # no tables detected
-                    logging.warning("{}: Only one column was detected, the PDF"
-                                  " may have no tables. Specify ncols if"
-                                  " the PDF has tables.".format(
-                                  os.path.basename(bname)))
-                cols = [(t.x0, t.x1)
-                    for r in rows_grouped if len(r) == ncols for t in r]
-                cols = _merge_columns(sorted(cols), mtol=self.mtol)
-                inner_text = []
-                for i in range(1, len(cols)):
-                    left = cols[i - 1][1]
-                    right = cols[i][0]
-                    inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
-                outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
-                inner_text.extend(outer_text)
-                cols = _add_columns(cols, inner_text, self.ytol)
-                cols = _join_columns(cols, bounds[0], bounds[1])
-
-        pdf_page = {}
-        page_tables = {}
-        table_info = {}
-        table = Table(cols, rows)
-        rerror = []
-        cerror = []
-        for row in rows_grouped:
-            for t in row:
-                try:
-                    r_idx, rass_error = get_row_index(t, rows)
-                except ValueError as e:
-                    # couldn't assign LTTextLH to any cell
-                    vprint(e.message)
-                    continue
-                try:
-                    c_idx, cass_error = _get_column_index(t, cols)
-                except ValueError as e:
-                    # couldn't assign LTTextLH to any cell
-                    vprint(e.message)
-                    continue
-                rerror.append(rass_error)
-                cerror.append(cass_error)
-                table.cells[r_idx][c_idx].add_text(
-                    t.get_text().strip('\n'))
-        if guess:
-            score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+                if len(self.table_area) != len(self.ncolumns):
+                    raise ValueError("message")
+            if len(self.ytol) == 1 and self.ytol[0] == 2:
+                self.ytol = self.ytol * len(self.table_area)
+            if len(self.mtol) == 1 and self.mtol[0] == 2:
+                self.mtol = self.mtol * len(self.table_area)
+            table_bbox = {}
+            for area in self.table_area:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = int(x1)
+                y1 = int(y1)
+                x2 = int(x2)
+                y2 = int(y2)
+                table_bbox[(x1, y2, x2, y1)] = None
        else:
-            score = get_score([[50, rerror], [50, cerror]])
-        table_info['score'] = score
-        ar = table.get_list()
-        ar = encode_list(ar)
-        table_info['data'] = ar
-        empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
-        table_info['empty_p'] = empty_p
-        table_info['r_nempty_cells'] = r_nempty_cells
-        table_info['c_nempty_cells'] = c_nempty_cells
-        table_info['nrows'] = len(ar)
-        table_info['ncols'] = len(ar[0])
-        page_tables['table_1'] = table_info
-        pdf_page[os.path.basename(bname)] = page_tables
+            table_bbox = {(0, height, width, 0): None}

-        return pdf_page
+        page = {}
+        tables = {}
+        table_no = 0
+        # sort tables based on y-coord
+        for k in sorted(table_bbox.keys(), key=lambda x: x[1], reverse=True):
+            # select elements which lie within table_bbox
+            table_data = {}
+            t_bbox = text_bbox(k, text)
+            t_bbox.sort(key=lambda x: (-x.y0, x.x0))
+
+            rows_grouped = _group_rows(t_bbox, ytol=self.ytol[table_no])
+            rows = _join_rows(rows_grouped, k[3], k[1])
+            elements = [len(r) for r in rows_grouped]
+
+            guess = False
+            if self.columns and self.columns[table_no] != "":
+                # user has to input boundary columns too
+                # take (0, width) by default
+                # similar to else condition
+                # len can't be 1
+                cols = self.columns[table_no].split(',')
+                cols = [(float(cols[i]), float(cols[i + 1]))
+                        for i in range(0, len(cols) - 1)]
+            else:
+                if self.ncolumns and self.ncolumns[table_no] != -1:
+                    ncols = self.ncolumns[table_no]
+                    cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
+                    if len(cols) != self.ncolumns[table_no]:
+                        logging.warning("{}: The number of columns after merge"
+                                      " isn't the same as what you specified."
+                                      " Change the value of mtol.".format(
+                                      os.path.basename(bname)))
+                    cols = _join_columns(cols, k[0], k[2])
+                else:
+                    guess = True
+                    ncols = max(set(elements), key=elements.count)
+                    len_non_mode = len(filter(lambda x: x != ncols, elements))
+                    if ncols == 1 and not self.debug:
+                        # no tables detected
+                        logging.warning("{}: Only one column was detected, the PDF"
+                                      " may have no tables. Specify ncols if"
+                                      " the PDF has tables.".format(
+                                      os.path.basename(bname)))
+                    cols = [(t.x0, t.x1)
+                        for r in rows_grouped if len(r) == ncols for t in r]
+                    cols = _merge_columns(sorted(cols), mtol=self.mtol[table_no])
+                    inner_text = []
+                    for i in range(1, len(cols)):
+                        left = cols[i - 1][1]
+                        right = cols[i][0]
+                        inner_text.extend([t for t in text if t.x0 > left and t.x1 < right])
+                    outer_text = [t for t in text if t.x0 > cols[-1][1] or t.x1 < cols[0][0]]
+                    inner_text.extend(outer_text)
+                    cols = _add_columns(cols, inner_text, self.ytol[table_no])
+                    cols = _join_columns(cols, k[0], k[2])
+
+            table = Table(cols, rows)
+            rerror = []
+            cerror = []
+            for row in rows_grouped:
+                for t in row:
+                    try:
+                        r_idx, rass_error = get_row_index(t, rows)
+                    except ValueError as e:
+                        # couldn't assign LTTextLH to any cell
+                        continue
+                    try:
+                        c_idx, cass_error = _get_column_index(t, cols)
+                    except ValueError as e:
+                        # couldn't assign LTTextLH to any cell
+                        continue
+                    rerror.append(rass_error)
+                    cerror.append(cass_error)
+                    table.cells[r_idx][c_idx].add_text(
+                        t.get_text().strip('\n'))
+            if guess:
+                score = get_score([[33, rerror], [33, cerror], [34, [len_non_mode / len(elements)]]])
+            else:
+                score = get_score([[50, rerror], [50, cerror]])
+
+            table_data['score'] = score
+            ar = encode_list(table.get_list())
+            table_data['data'] = ar
+            empty_p, r_nempty_cells, c_nempty_cells = count_empty(ar)
+            table_data['empty_p'] = empty_p
+            table_data['r_nempty_cells'] = r_nempty_cells
+            table_data['c_nempty_cells'] = c_nempty_cells
+            table_data['nrows'] = len(ar)
+            table_data['ncols'] = len(ar[0])
+            tables['table-{0}'.format(table_no + 1)] = table_data
+            table_no += 1
+        page[os.path.basename(bname)] = tables
+
+        return page
--- a/camelot/utils.py
+++ b/camelot/utils.py
@ -81,7 +81,17 @@ def rotate(x1, y1, x2, y2, angle):
    return xnew, ynew


-def transform(tables, v_segments, h_segments, factors):
+def scale_to_image(k, factors):
+    """Converts a bounding box to integer image coordinates using the
+    module's translate and scale helpers.
+
+    Parameters
+    ----------
+    k : tuple
+        (x1, y1, x2, y2) bounding box.
+
+    factors : tuple
+        (scaling_factor_x, scaling_factor_y, pdf_y). The x values are
+        scaled by scaling_factor_x; each y value is translated by
+        -pdf_y, made absolute and scaled by scaling_factor_y.
+
+    Returns
+    -------
+    tuple
+        (x1, y1, x2, y2) truncated to int.
+    """
+    left, first_y, right, second_y = k
+    scaling_factor_x, scaling_factor_y, pdf_y = factors
+
+    def _image_y(y):
+        # presumably flips the y axis between PDF and image space;
+        # confirm against translate()
+        return scale(abs(translate(-pdf_y, y)), scaling_factor_y)
+
+    left = scale(left, scaling_factor_x)
+    first_y = _image_y(first_y)
+    right = scale(right, scaling_factor_x)
+    second_y = _image_y(second_y)
+    return int(left), int(first_y), int(right), int(second_y)
+
+
+def scale_to_pdf(tables, v_segments, h_segments, factors):
    """Translates and scales OpenCV coordinates to PDFMiner coordinate
    space.

--- a/tools/camelot
+++ b/tools/camelot
@ -40,9 +40,9 @@ options:
 -W, --wmargin <wmargin>   Word margin. Insert blank spaces between chars
                           if distance between words is greater than word
                           margin. [default: 0.1]
- -S, --save-info           Save parsing info for each page to a file.
+ -S, --print-stats         List stats on the parsing process.
+ -T, --save-stats          Save stats to a file.
 -X, --plot <dist>         Plot distributions. (page,all,rc)
- -Z, --summary             Summarize metrics.

 camelot methods:
 lattice  Looks for lines between data.
@ -55,19 +55,21 @@ lattice_doc = """
 Lattice method looks for lines between text to form a table.

 usage:
- camelot lattice [options] [--] <file>
+ camelot lattice [-t <tarea>...] [-F <fill>...] [-j <jtol>...]
+ [-m <mtol>...] [options] [--] <file>

 options:
+ -t, --tarea <tarea>  Specific table areas to analyze.
 -F, --fill <fill>    Fill data in horizontal and/or vertical spanning
                      cells. Example: -F h, -F v, -F hv
- -s, --scale <scale>  Scaling factor. Large scaling factor leads to
-                      smaller lines being detected. [default: 15]
- -i, --invert         Invert pdf image to make sure that lines are
-                      in foreground.
 -j, --jtol <jtol>    Tolerance to account for when comparing joint
                      and line coordinates. [default: 2]
 -m, --mtol <mtol>    Tolerance to account for when merging lines
                      which are very close. [default: 2]
+ -s, --scale <scale>  Scaling factor. Large scaling factor leads to
+                      smaller lines being detected. [default: 15]
+ -i, --invert         Invert pdf image to make sure that lines are
+                      in foreground.
 -d, --debug <debug>  Debug by visualizing pdf geometry.
                      (contour,line,joint,table) Example: -d table
 """
@ -76,12 +78,14 @@ stream_doc = """
 Stream method looks for whitespaces between text to form a table.

 usage:
- camelot stream [options] [--] <file>
+ camelot stream [-t <tarea>...] [-c <columns>...] [-n <ncols>...] [-y <ytol>...]
+ [-m <mtol>...] [options] [--] <file>

 options:
- -n, --ncols <ncols>      Number of columns. [default: 0]
+ -t, --tarea <tarea>      Specific table areas to analyze.
 -c, --columns <columns>  Comma-separated list of column x-coordinates.
                          Example: -c 10.1,20.2,30.3
+ -n, --ncols <ncols>      Number of columns. [default: -1]
 -y, --ytol <ytol>        Tolerance to account for when grouping rows
                          together. [default: 2]
 -m, --mtol <mtol>        Tolerance to account for when merging columns
@ -166,7 +170,7 @@ def plot_rc_piechart(data, output):
    plt.savefig(''.join([output, '_rc.png']), dpi=300)


-def summary(data, p_time):
+def print_stats(data, p_time):
    from operator import itemgetter
    from itertools import groupby

@ -331,17 +335,18 @@ if __name__ == '__main__':
                else:
                    p.append({'start': int(r), 'end': int(r)})

-    margin_tuple = (float(args['--cmargin']), float(args['--lmargin']),
+    margins = (float(args['--cmargin']), float(args['--lmargin']),
        float(args['--wmargin']))
    if args['<method>'] == 'lattice':
        try:
            manager = Pdf(Lattice(
+                                  table_area=args['--tarea'],
                                  fill=args['--fill'],
+                                  jtol=[int(j) for j in args['--jtol']],
+                                  mtol=[int(m) for m in args['--mtol']],
                                  scale=int(args['--scale']),
                                  invert=args['--invert'],
-                                  jtol=int(args['--jtol']),
-                                  mtol=int(args['--mtol']),
-                                  pdf_margin=margin_tuple,
+                                  margins=margins,
                                  debug=args['--debug']),
                          filename,
                          pagenos=p,
@ -374,10 +379,10 @@ if __name__ == '__main__':
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)
            
-            if args['--summary']:
-                summary(data, processing_time)
+            if args['--print-stats']:
+                print_stats(data, processing_time)

-            if args['--save-info']:
+            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file:
@ -402,11 +407,12 @@ if __name__ == '__main__':
    elif args['<method>'] == 'stream':
        try:
            manager = Pdf(Stream(
-                                 ncolumns=int(args['--ncols']),
+                                 table_area=args['--tarea'],
                                 columns=args['--columns'],
-                                 ytol=int(args['--ytol']),
-                                 mtol=int(args['--mtol']),
-                                 pdf_margin=margin_tuple,
+                                 ncolumns=[int(nc) for nc in args['--ncols']],
+                                 ytol=[int(y) for y in args['--ytol']],
+                                 mtol=[int(m) for m in args['--mtol']],
+                                 margins=margins,
                                 debug=args['--debug']),
                          filename,
                          pagenos=p,
@ -439,10 +445,10 @@ if __name__ == '__main__':
                if 'rc' in plot_type:
                    plot_rc_piechart(data, pngname)

-            if args['--summary']:
-                summary(data, processing_time)
+            if args['--print-stats']:
+                print_stats(data, processing_time)
            
-            if args['--save-info']:
+            if args['--save-stats']:
                if args['--output']:
                    scorename = os.path.join(args['--output'], os.path.basename(scorename))
                with open(scorename, 'w') as score_file: